1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Loic Dachary <loic@dachary.org>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MOSDScrub.h"
57 #include "messages/MRoute.h"
59 #include "common/TextTable.h"
60 #include "common/Timer.h"
61 #include "common/ceph_argparse.h"
62 #include "common/perf_counters.h"
63 #include "common/strtol.h"
64 #include "common/numa.h"
66 #include "common/config.h"
67 #include "common/errno.h"
69 #include "erasure-code/ErasureCodePlugin.h"
70 #include "compressor/Compressor.h"
71 #include "common/Checksummer.h"
73 #include "include/compat.h"
74 #include "include/ceph_assert.h"
75 #include "include/stringify.h"
76 #include "include/util.h"
77 #include "common/cmdparse.h"
78 #include "include/str_list.h"
79 #include "include/str_map.h"
80 #include "include/scope_guard.h"
82 #include "auth/cephx/CephxKeyServer.h"
83 #include "osd/OSDCap.h"
85 #include "json_spirit/json_spirit_reader.h"
87 #include <boost/algorithm/string/predicate.hpp>
89 #define dout_subsys ceph_subsys_mon
// MonitorDBStore key prefixes owned by the OSDMonitor service; these must
// match the prefixes registered in get_store_prefixes() below.
static const std::string OSD_PG_CREATING_PREFIX("osd_pg_creating");  // creating-pgs bookkeeping
static const std::string OSD_METADATA_PREFIX("osd_metadata");        // per-osd metadata blobs
static const std::string OSD_SNAP_PREFIX("osd_snap");                // snapshot bookkeeping
// Limits enforced on per-pool application metadata (see the
// "osd pool application" command handling).
const uint32_t MAX_POOL_APPLICATIONS = 4;         // max applications per pool
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;    // max keys per application
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128; // max key/value string length
100 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
101 // Note: this doesn't include support for the application tag match
102 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
103 auto& match
= grant
.match
;
104 if (match
.is_match_all()) {
106 } else if (pool_name
!= nullptr &&
107 !match
.pool_namespace
.pool_name
.empty() &&
108 match
.pool_namespace
.pool_name
== *pool_name
) {
115 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
116 const KeyServer
& key_server
,
117 const EntityName
& entity_name
,
118 const MonCap
& mon_caps
,
119 const entity_addr_t
& peer_socket_addr
,
120 const std::string
* pool_name
)
122 typedef std::map
<std::string
, std::string
> CommandArgs
;
124 if (mon_caps
.is_capable(
125 cct
, CEPH_ENTITY_TYPE_MON
,
127 "osd pool op unmanaged-snap",
128 (pool_name
== nullptr ?
129 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
130 CommandArgs
{{"poolname", *pool_name
}}),
136 AuthCapsInfo caps_info
;
137 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
139 dout(10) << "unable to locate OSD cap data for " << entity_name
140 << " in auth db" << dendl
;
145 if (caps_info
.caps
.length() > 0) {
146 auto p
= caps_info
.caps
.cbegin();
149 } catch (const buffer::error
&err
) {
150 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
157 if (!osd_cap
.parse(caps_str
, nullptr)) {
158 dout(10) << "unable to parse OSD cap data for " << entity_name
159 << " in auth db" << dendl
;
163 // if the entity has write permissions in one or all pools, permit
164 // usage of unmanaged-snapshots
165 if (osd_cap
.allow_all()) {
169 for (auto& grant
: osd_cap
.grants
) {
170 if (grant
.profile
.is_valid()) {
171 for (auto& profile_grant
: grant
.profile_grants
) {
172 if (is_osd_writable(profile_grant
, pool_name
)) {
176 } else if (is_osd_writable(grant
, pool_name
)) {
184 } // anonymous namespace
186 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
188 if (epoch_by_pg
.size() <= ps
) {
189 epoch_by_pg
.resize(ps
+ 1, 0);
191 const auto old_lec
= epoch_by_pg
[ps
];
192 if (old_lec
>= last_epoch_clean
) {
196 epoch_by_pg
[ps
] = last_epoch_clean
;
197 if (last_epoch_clean
< floor
) {
198 floor
= last_epoch_clean
;
199 } else if (last_epoch_clean
> floor
) {
200 if (old_lec
== floor
) {
201 // probably should increase floor?
202 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
203 std::end(epoch_by_pg
));
207 if (ps
!= next_missing
) {
210 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
211 if (epoch_by_pg
[next_missing
] == 0) {
217 void LastEpochClean::remove_pool(uint64_t pool
)
219 report_by_pool
.erase(pool
);
222 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
224 auto& lec
= report_by_pool
[pg
.pool()];
225 return lec
.report(pg
.ps(), last_epoch_clean
);
228 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
230 auto floor
= latest
.get_epoch();
231 for (auto& pool
: latest
.get_pools()) {
232 auto reported
= report_by_pool
.find(pool
.first
);
233 if (reported
== report_by_pool
.end()) {
236 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
239 if (reported
->second
.floor
< floor
) {
240 floor
= reported
->second
.floor
;
247 class C_UpdateCreatingPGs
: public Context
{
252 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
253 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
254 void finish(int r
) override
{
256 utime_t end
= ceph_clock_now();
257 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
258 << (end
- start
) << " seconds" << dendl
;
259 osdmon
->update_creating_pgs();
260 osdmon
->check_pg_creates_subs();
266 #define dout_prefix _prefix(_dout, mon, osdmap)
267 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
268 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
269 << "(" << mon
->get_state_name()
270 << ").osd e" << osdmap
.get_epoch() << " ";
273 OSDMonitor::OSDMonitor(
277 const string
& service_name
)
278 : PaxosService(mn
, p
, service_name
),
280 inc_osd_cache(g_conf()->mon_osd_cache_size
),
281 full_osd_cache(g_conf()->mon_osd_cache_size
),
282 has_osdmap_manifest(false),
283 mapper(mn
->cct
, &mn
->cpu_tp
)
286 bool OSDMonitor::_have_pending_crush()
288 return pending_inc
.crush
.length() > 0;
291 CrushWrapper
&OSDMonitor::_get_stable_crush()
293 return *osdmap
.crush
;
296 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
299 if (pending_inc
.crush
.length())
300 bl
= pending_inc
.crush
;
302 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
304 auto p
= bl
.cbegin();
308 void OSDMonitor::create_initial()
310 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
315 mon
->store
->get("mkfs", "osdmap", bl
);
319 newmap
.set_fsid(mon
->monmap
->fsid
);
321 newmap
.build_simple(cct
, 0, mon
->monmap
->fsid
, 0);
324 newmap
.created
= newmap
.modified
= ceph_clock_now();
326 // new clusters should sort bitwise by default.
327 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
330 CEPH_OSDMAP_RECOVERY_DELETES
|
331 CEPH_OSDMAP_PURGED_SNAPDIRS
|
332 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
333 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
334 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
335 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
336 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
337 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
338 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
340 // new cluster should require latest by default
341 if (g_conf().get_val
<bool>("mon_debug_no_require_nautilus")) {
342 if (g_conf()->mon_debug_no_require_mimic
) {
343 derr
<< __func__
<< " mon_debug_no_require_mimic=true and nautilus=true" << dendl
;
344 newmap
.require_osd_release
= CEPH_RELEASE_LUMINOUS
;
346 derr
<< __func__
<< " mon_debug_no_require_nautilus=true" << dendl
;
347 newmap
.require_osd_release
= CEPH_RELEASE_MIMIC
;
350 newmap
.require_osd_release
= CEPH_RELEASE_NAUTILUS
;
351 int r
= ceph_release_from_name(
352 g_conf()->mon_osd_initial_require_min_compat_client
.c_str());
354 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
356 newmap
.require_min_compat_client
= r
;
359 // encode into pending incremental
360 uint64_t features
= newmap
.get_encoding_features();
361 newmap
.encode(pending_inc
.fullmap
,
362 features
| CEPH_FEATURE_RESERVED
);
363 pending_inc
.full_crc
= newmap
.get_crc();
364 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
367 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
369 s
.insert(service_name
);
370 s
.insert(OSD_PG_CREATING_PREFIX
);
371 s
.insert(OSD_METADATA_PREFIX
);
372 s
.insert(OSD_SNAP_PREFIX
);
375 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
377 // we really don't care if the version has been updated, because we may
378 // have trimmed without having increased the last committed; yet, we may
379 // need to update the in-memory manifest.
380 load_osdmap_manifest();
382 version_t version
= get_last_committed();
383 if (version
== osdmap
.epoch
)
385 ceph_assert(version
> osdmap
.epoch
);
387 dout(15) << "update_from_paxos paxos e " << version
388 << ", my e " << osdmap
.epoch
<< dendl
;
391 if (!mapping_job
->is_done()) {
392 dout(1) << __func__
<< " mapping job "
393 << mapping_job
.get() << " did not complete, "
394 << mapping_job
->shards
<< " left, canceling" << dendl
;
395 mapping_job
->abort();
403 * We will possibly have a stashed latest that *we* wrote, and we will
404 * always be sure to have the oldest full map in the first..last range
405 * due to encode_trim_extra(), which includes the oldest full map in the trim
408 * encode_trim_extra() does not however write the full map's
409 * version to 'full_latest'. This is only done when we are building the
410 * full maps from the incremental versions. But don't panic! We make sure
411 * that the following conditions find whichever full map version is newer.
413 version_t latest_full
= get_version_latest_full();
414 if (latest_full
== 0 && get_first_committed() > 1)
415 latest_full
= get_first_committed();
417 if (get_first_committed() > 1 &&
418 latest_full
< get_first_committed()) {
419 // the monitor could be just sync'ed with its peer, and the latest_full key
420 // is not encoded in the paxos commits in encode_pending(), so we need to
421 // make sure we get it pointing to a proper version.
422 version_t lc
= get_last_committed();
423 version_t fc
= get_first_committed();
425 dout(10) << __func__
<< " looking for valid full map in interval"
426 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
429 for (version_t v
= lc
; v
>= fc
; v
--) {
430 string full_key
= "full_" + stringify(v
);
431 if (mon
->store
->exists(get_service_name(), full_key
)) {
432 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
438 ceph_assert(latest_full
> 0);
439 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
440 put_version_latest_full(t
, latest_full
);
441 mon
->store
->apply_transaction(t
);
442 dout(10) << __func__
<< " updated the on-disk full map version to "
443 << latest_full
<< dendl
;
446 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
447 bufferlist latest_bl
;
448 get_version_full(latest_full
, latest_bl
);
449 ceph_assert(latest_bl
.length() != 0);
450 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
452 osdmap
.decode(latest_bl
);
456 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
457 auto p
= bl
.cbegin();
458 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
459 creating_pgs
.decode(p
);
460 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
461 << creating_pgs
.last_scan_epoch
462 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
464 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
468 // walk through incrementals
469 MonitorDBStore::TransactionRef t
;
471 while (version
> osdmap
.epoch
) {
473 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
474 ceph_assert(err
== 0);
475 ceph_assert(inc_bl
.length());
477 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
479 OSDMap::Incremental
inc(inc_bl
);
480 err
= osdmap
.apply_incremental(inc
);
481 ceph_assert(err
== 0);
484 t
.reset(new MonitorDBStore::Transaction
);
486 // Write out the full map for all past epochs. Encode the full
487 // map with the same features as the incremental. If we don't
488 // know, use the quorum features. If we don't know those either,
489 // encode with all features.
490 uint64_t f
= inc
.encode_features
;
492 f
= mon
->get_quorum_con_features();
496 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
497 tx_size
+= full_bl
.length();
499 bufferlist orig_full_bl
;
500 get_version_full(osdmap
.epoch
, orig_full_bl
);
501 if (orig_full_bl
.length()) {
502 // the primary provided the full map
503 ceph_assert(inc
.have_crc
);
504 if (inc
.full_crc
!= osdmap
.crc
) {
505 // This will happen if the mons were running mixed versions in
506 // the past or some other circumstance made the full encoded
507 // maps divergent. Reloading here will bring us back into
508 // sync with the primary for this and all future maps. OSDs
509 // will also be brought back into sync when they discover the
510 // crc mismatch and request a full map from a mon.
511 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
514 dout(20) << __func__
<< " my (bad) full osdmap:\n";
515 JSONFormatter
jf(true);
516 jf
.dump_object("osdmap", osdmap
);
518 *_dout
<< "\nhexdump:\n";
519 full_bl
.hexdump(*_dout
);
523 osdmap
.decode(orig_full_bl
);
525 dout(20) << __func__
<< " canonical full osdmap:\n";
526 JSONFormatter
jf(true);
527 jf
.dump_object("osdmap", osdmap
);
529 *_dout
<< "\nhexdump:\n";
530 orig_full_bl
.hexdump(*_dout
);
534 ceph_assert(!inc
.have_crc
);
535 put_version_full(t
, osdmap
.epoch
, full_bl
);
537 put_version_latest_full(t
, osdmap
.epoch
);
540 dout(1) << osdmap
<< dendl
;
542 if (osdmap
.epoch
== 1) {
543 t
->erase("mkfs", "osdmap");
546 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
547 mon
->store
->apply_transaction(t
);
548 t
= MonitorDBStore::TransactionRef();
551 for (const auto &osd_state
: inc
.new_state
) {
552 if (osd_state
.second
& CEPH_OSD_UP
) {
553 // could be marked up *or* down, but we're too lazy to check which
554 last_osd_report
.erase(osd_state
.first
);
556 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
557 // could be created *or* destroyed, but we can safely drop it
558 osd_epochs
.erase(osd_state
.first
);
564 mon
->store
->apply_transaction(t
);
567 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
568 if (osdmap
.is_out(o
))
570 auto found
= down_pending_out
.find(o
);
571 if (osdmap
.is_down(o
)) {
572 // populate down -> out map
573 if (found
== down_pending_out
.end()) {
574 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
575 down_pending_out
[o
] = ceph_clock_now();
578 if (found
!= down_pending_out
.end()) {
579 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
580 down_pending_out
.erase(found
);
584 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
587 check_pg_creates_subs();
589 share_map_with_random_osd();
594 // make sure our feature bits reflect the latest map
595 update_msgr_features();
597 if (!mon
->is_leader()) {
598 // will be called by on_active() on the leader, avoid doing so twice
603 void OSDMonitor::start_mapping()
605 // initiate mapping job
607 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
609 mapping_job
->abort();
611 if (!osdmap
.get_pools().empty()) {
612 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
613 mapping_job
= mapping
.start_update(osdmap
, mapper
,
614 g_conf()->mon_osd_mapping_pgs_per_chunk
);
615 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
616 << " at " << fin
->start
<< dendl
;
617 mapping_job
->set_finish_event(fin
);
619 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
620 mapping_job
= nullptr;
624 void OSDMonitor::update_msgr_features()
627 types
.insert((int)entity_name_t::TYPE_OSD
);
628 types
.insert((int)entity_name_t::TYPE_CLIENT
);
629 types
.insert((int)entity_name_t::TYPE_MDS
);
630 types
.insert((int)entity_name_t::TYPE_MON
);
631 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
633 uint64_t features
= osdmap
.get_features(*q
, &mask
);
634 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
635 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
636 ceph::net::Policy p
= mon
->messenger
->get_policy(*q
);
637 p
.features_required
= (p
.features_required
& ~mask
) | features
;
638 mon
->messenger
->set_policy(*q
, p
);
643 void OSDMonitor::on_active()
647 if (mon
->is_leader()) {
648 mon
->clog
->debug() << "osdmap " << osdmap
;
649 if (!priority_convert
) {
650 // Only do this once at start-up
651 convert_pool_priorities();
652 priority_convert
= true;
655 list
<MonOpRequestRef
> ls
;
656 take_all_failures(ls
);
657 while (!ls
.empty()) {
658 MonOpRequestRef op
= ls
.front();
659 op
->mark_osdmon_event(__func__
);
667 void OSDMonitor::on_restart()
669 last_osd_report
.clear();
672 void OSDMonitor::on_shutdown()
674 dout(10) << __func__
<< dendl
;
676 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
678 mapping_job
->abort();
681 // discard failure info, waiters
682 list
<MonOpRequestRef
> ls
;
683 take_all_failures(ls
);
687 void OSDMonitor::update_logger()
689 dout(10) << "update_logger" << dendl
;
691 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
692 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
693 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
694 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
697 void OSDMonitor::create_pending()
699 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
700 pending_inc
.fsid
= mon
->monmap
->fsid
;
701 pending_metadata
.clear();
702 pending_metadata_rm
.clear();
704 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
706 // safety checks (this shouldn't really happen)
708 if (osdmap
.backfillfull_ratio
<= 0) {
709 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
710 if (pending_inc
.new_backfillfull_ratio
> 1.0)
711 pending_inc
.new_backfillfull_ratio
/= 100;
712 dout(1) << __func__
<< " setting backfillfull_ratio = "
713 << pending_inc
.new_backfillfull_ratio
<< dendl
;
715 if (osdmap
.full_ratio
<= 0) {
716 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
717 if (pending_inc
.new_full_ratio
> 1.0)
718 pending_inc
.new_full_ratio
/= 100;
719 dout(1) << __func__
<< " setting full_ratio = "
720 << pending_inc
.new_full_ratio
<< dendl
;
722 if (osdmap
.nearfull_ratio
<= 0) {
723 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
724 if (pending_inc
.new_nearfull_ratio
> 1.0)
725 pending_inc
.new_nearfull_ratio
/= 100;
726 dout(1) << __func__
<< " setting nearfull_ratio = "
727 << pending_inc
.new_nearfull_ratio
<< dendl
;
731 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
733 if (osdmap
.crush
->has_legacy_rule_ids()) {
734 CrushWrapper newcrush
;
735 _get_pending_crush(newcrush
);
737 // First, for all pools, work out which rule they really used
738 // by resolving ruleset to rule.
739 for (const auto &i
: osdmap
.get_pools()) {
740 const auto pool_id
= i
.first
;
741 const auto &pool
= i
.second
;
742 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
743 pool
.type
, pool
.size
);
745 dout(1) << __func__
<< " rewriting pool "
746 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
747 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
748 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
749 pending_inc
.new_pools
[pool_id
] = pool
;
751 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
754 // Now, go ahead and renumber all the rules so that their
755 // rule_id field corresponds to their position in the array
756 auto old_to_new
= newcrush
.renumber_rules();
757 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
758 for (const auto &i
: old_to_new
) {
759 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
761 pending_inc
.crush
.clear();
762 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
767 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
768 const OSDMap
& nextmap
)
770 dout(10) << __func__
<< dendl
;
771 creating_pgs_t pending_creatings
;
773 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
774 pending_creatings
= creating_pgs
;
776 // check for new or old pools
777 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
779 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
783 queued
+= scan_for_creating_pgs(inc
.new_pools
,
787 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
788 for (auto deleted_pool
: inc
.old_pools
) {
789 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
790 dout(10) << __func__
<< " " << removed
791 << " pg removed because containing pool deleted: "
792 << deleted_pool
<< dendl
;
793 last_epoch_clean
.remove_pool(deleted_pool
);
795 // pgmon updates its creating_pgs in check_osd_map() which is called by
796 // on_active() and check_osd_map() could be delayed if lease expires, so its
797 // creating_pgs could be stale in comparison with the one of osdmon. let's
798 // trim them here. otherwise, they will be added back after being erased.
799 unsigned removed
= 0;
800 for (auto& pg
: pending_created_pgs
) {
801 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
802 pending_creatings
.created_pools
.insert(pg
.pool());
803 removed
+= pending_creatings
.pgs
.erase(pg
);
805 pending_created_pgs
.clear();
806 dout(10) << __func__
<< " " << removed
807 << " pgs removed because they're created" << dendl
;
808 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
811 // filter out any pgs that shouldn't exist.
813 auto i
= pending_creatings
.pgs
.begin();
814 while (i
!= pending_creatings
.pgs
.end()) {
815 if (!nextmap
.pg_exists(i
->first
)) {
816 dout(10) << __func__
<< " removing pg " << i
->first
817 << " which should not exist" << dendl
;
818 i
= pending_creatings
.pgs
.erase(i
);
826 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
827 const auto total
= pending_creatings
.pgs
.size();
828 while (pending_creatings
.pgs
.size() < max
&&
829 !pending_creatings
.queue
.empty()) {
830 auto p
= pending_creatings
.queue
.begin();
831 int64_t poolid
= p
->first
;
832 dout(10) << __func__
<< " pool " << poolid
833 << " created " << p
->second
.created
834 << " modified " << p
->second
.modified
835 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
837 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
838 p
->second
.end
- p
->second
.start
);
839 ps_t first
= p
->second
.start
;
840 ps_t end
= first
+ n
;
841 for (ps_t ps
= first
; ps
< end
; ++ps
) {
842 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
843 // NOTE: use the *current* epoch as the PG creation epoch so that the
844 // OSD does not have to generate a long set of PastIntervals.
845 pending_creatings
.pgs
.emplace(pgid
, make_pair(inc
.epoch
,
846 p
->second
.modified
));
847 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
849 p
->second
.start
= end
;
850 if (p
->second
.done()) {
851 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
852 pending_creatings
.queue
.erase(p
);
854 dout(10) << __func__
<< " pool " << poolid
855 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
859 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
860 << " pools" << dendl
;
862 << " " << (pending_creatings
.pgs
.size() - total
)
863 << "/" << pending_creatings
.pgs
.size()
864 << " pgs added from queued pools" << dendl
;
865 return pending_creatings
;
868 void OSDMonitor::maybe_prime_pg_temp()
871 if (pending_inc
.crush
.length()) {
872 dout(10) << __func__
<< " new crush map, all" << dendl
;
876 if (!pending_inc
.new_up_client
.empty()) {
877 dout(10) << __func__
<< " new up osds, all" << dendl
;
881 // check for interesting OSDs
883 for (auto p
= pending_inc
.new_state
.begin();
884 !all
&& p
!= pending_inc
.new_state
.end();
886 if ((p
->second
& CEPH_OSD_UP
) &&
887 osdmap
.is_up(p
->first
)) {
888 osds
.insert(p
->first
);
891 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
892 !all
&& p
!= pending_inc
.new_weight
.end();
894 if (p
->second
< osdmap
.get_weight(p
->first
)) {
896 osds
.insert(p
->first
);
898 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
904 if (!all
&& osds
.empty())
909 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
910 if (estimate
> mapping
.get_num_pgs() *
911 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
912 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
913 << osds
.size() << " osds >= "
914 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
915 << mapping
.get_num_pgs() << " pgs, all"
919 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
920 << osds
.size() << " osds" << dendl
;
925 next
.deepish_copy_from(osdmap
);
926 next
.apply_incremental(pending_inc
);
928 if (next
.get_pools().empty()) {
929 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
931 PrimeTempJob
job(next
, this);
932 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
933 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
934 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
936 dout(10) << __func__
<< " did not finish in "
937 << g_conf()->mon_osd_prime_pg_temp_max_time
938 << ", stopping" << dendl
;
942 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
943 utime_t stop
= ceph_clock_now();
944 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
945 const int chunk
= 1000;
947 std::unordered_set
<pg_t
> did_pgs
;
948 for (auto osd
: osds
) {
949 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
950 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
951 for (auto pgid
: pgs
) {
952 if (!did_pgs
.insert(pgid
).second
) {
955 prime_pg_temp(next
, pgid
);
958 if (ceph_clock_now() > stop
) {
959 dout(10) << __func__
<< " consumed more than "
960 << g_conf()->mon_osd_prime_pg_temp_max_time
961 << " seconds, stopping"
971 void OSDMonitor::prime_pg_temp(
975 // TODO: remove this creating_pgs direct access?
976 if (creating_pgs
.pgs
.count(pgid
)) {
979 if (!osdmap
.pg_exists(pgid
)) {
983 vector
<int> up
, acting
;
984 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
986 vector
<int> next_up
, next_acting
;
987 int next_up_primary
, next_acting_primary
;
988 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
989 &next_acting
, &next_acting_primary
);
990 if (acting
== next_acting
&&
991 !(up
!= acting
&& next_up
== next_acting
))
992 return; // no change since last epoch
995 return; // if previously empty now we can be no worse off
996 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
997 if (pool
&& acting
.size() < pool
->min_size
)
998 return; // can be no worse off than before
1000 if (next_up
== next_acting
) {
1002 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1006 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1007 << " -> " << next_up
<< "/" << next_acting
1008 << ", priming " << acting
1011 std::lock_guard
l(prime_pg_temp_lock
);
1012 // do not touch a mapping if a change is pending
1013 pending_inc
.new_pg_temp
.emplace(
1015 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1020 * @note receiving a transaction in this function gives a fair amount of
1021 * freedom to the service implementation if it does need it. It shouldn't.
1023 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1025 dout(10) << "encode_pending e " << pending_inc
.epoch
1029 dout(1) << __func__
<< " osdmap full prune encoded e"
1030 << pending_inc
.epoch
<< dendl
;
1033 // finalize up pending_inc
1034 pending_inc
.modified
= ceph_clock_now();
1036 int r
= pending_inc
.propagate_snaps_to_tiers(cct
, osdmap
);
1037 ceph_assert(r
== 0);
1040 if (!mapping_job
->is_done()) {
1041 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1042 << mapping_job
.get() << " did not complete, "
1043 << mapping_job
->shards
<< " left" << dendl
;
1044 mapping_job
->abort();
1045 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1046 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1047 << mapping_job
.get() << " is prior epoch "
1048 << mapping
.get_epoch() << dendl
;
1050 if (g_conf()->mon_osd_prime_pg_temp
) {
1051 maybe_prime_pg_temp();
1054 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1055 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1058 mapping_job
.reset();
1060 // ensure we don't have blank new_state updates. these are interrpeted as
1061 // CEPH_OSD_UP (and almost certainly not what we want!).
1062 auto p
= pending_inc
.new_state
.begin();
1063 while (p
!= pending_inc
.new_state
.end()) {
1064 if (p
->second
== 0) {
1065 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1066 p
= pending_inc
.new_state
.erase(p
);
1068 if (p
->second
& CEPH_OSD_UP
) {
1069 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1074 if (!pending_inc
.new_up_client
.empty()) {
1075 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1077 for (auto& i
: pending_inc
.new_weight
) {
1078 if (i
.first
> osdmap
.max_osd
) {
1080 // new osd is already marked in
1081 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1083 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1084 // existing osd marked in or out
1085 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1091 tmp
.deepish_copy_from(osdmap
);
1092 tmp
.apply_incremental(pending_inc
);
1094 // clean pg_temp mappings
1095 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1097 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1099 // check every upmapped pg for now
1100 // until we could reliably identify certain cases to ignore,
1101 // which is obviously the hard part TBD..
1102 vector
<pg_t
> pgs_to_check
;
1103 tmp
.get_upmap_pgs(&pgs_to_check
);
1104 if (pgs_to_check
.size() < g_conf()->mon_clean_pg_upmaps_per_chunk
* 2) {
1105 // not enough pgs, do it inline
1106 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1108 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1109 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1114 // update creating pgs first so that we can remove the created pgid and
1115 // process the pool flag removal below in the same osdmap epoch.
1116 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1117 bufferlist creatings_bl
;
1118 encode(pending_creatings
, creatings_bl
);
1119 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1121 // remove any old (or incompat) POOL_CREATING flags
1122 for (auto& i
: tmp
.get_pools()) {
1123 if (tmp
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
1124 // pre-nautilus OSDMaps shouldn't get this flag.
1125 if (pending_inc
.new_pools
.count(i
.first
)) {
1126 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1129 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1130 !pending_creatings
.still_creating_pool(i
.first
)) {
1131 dout(10) << __func__
<< " done creating pool " << i
.first
1132 << ", clearing CREATING flag" << dendl
;
1133 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1134 pending_inc
.new_pools
[i
.first
] = i
.second
;
1136 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1140 // remove any legacy osdmap nearfull/full flags
1142 if (tmp
.test_flag(CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
1143 dout(10) << __func__
<< " clearing legacy osdmap nearfull/full flag"
1145 remove_flag(CEPH_OSDMAP_NEARFULL
);
1146 remove_flag(CEPH_OSDMAP_FULL
);
1149 // collect which pools are currently affected by
1150 // the near/backfill/full osd(s),
1151 // and set per-pool near/backfill/full flag instead
1152 set
<int64_t> full_pool_ids
;
1153 set
<int64_t> backfillfull_pool_ids
;
1154 set
<int64_t> nearfull_pool_ids
;
1155 tmp
.get_full_pools(cct
,
1157 &backfillfull_pool_ids
,
1158 &nearfull_pool_ids
);
1159 if (full_pool_ids
.empty() ||
1160 backfillfull_pool_ids
.empty() ||
1161 nearfull_pool_ids
.empty()) {
1162 // normal case - no nearfull, backfillfull or full osds
1163 // try cancel any improper nearfull/backfillfull/full pool
1165 for (auto &pool
: tmp
.get_pools()) {
1166 auto p
= pool
.first
;
1167 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1168 nearfull_pool_ids
.empty()) {
1169 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1170 << "'s nearfull flag" << dendl
;
1171 if (pending_inc
.new_pools
.count(p
) == 0) {
1172 // load original pool info first!
1173 pending_inc
.new_pools
[p
] = pool
.second
;
1175 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1177 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1178 backfillfull_pool_ids
.empty()) {
1179 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1180 << "'s backfillfull flag" << dendl
;
1181 if (pending_inc
.new_pools
.count(p
) == 0) {
1182 pending_inc
.new_pools
[p
] = pool
.second
;
1184 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1186 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1187 full_pool_ids
.empty()) {
1188 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1189 // set by EQUOTA, skipping
1192 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1193 << "'s full flag" << dendl
;
1194 if (pending_inc
.new_pools
.count(p
) == 0) {
1195 pending_inc
.new_pools
[p
] = pool
.second
;
1197 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1201 if (!full_pool_ids
.empty()) {
1202 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1203 << " as full" << dendl
;
1204 for (auto &p
: full_pool_ids
) {
1205 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1208 if (pending_inc
.new_pools
.count(p
) == 0) {
1209 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1211 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1212 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1213 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1215 // cancel FLAG_FULL for pools which are no longer full too
1216 for (auto &pool
: tmp
.get_pools()) {
1217 auto p
= pool
.first
;
1218 if (full_pool_ids
.count(p
)) {
1219 // skip pools we have just marked as full above
1222 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1223 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1224 // don't touch if currently is not full
1225 // or is running out of quota (and hence considered as full)
1228 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1229 << "'s full flag" << dendl
;
1230 if (pending_inc
.new_pools
.count(p
) == 0) {
1231 pending_inc
.new_pools
[p
] = pool
.second
;
1233 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1236 if (!backfillfull_pool_ids
.empty()) {
1237 for (auto &p
: backfillfull_pool_ids
) {
1238 if (full_pool_ids
.count(p
)) {
1239 // skip pools we have already considered as full above
1242 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1243 // make sure FLAG_FULL is truly set, so we are safe not
1244 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1245 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1248 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1249 // don't bother if pool is already marked as backfillfull
1252 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1253 << "'s as backfillfull" << dendl
;
1254 if (pending_inc
.new_pools
.count(p
) == 0) {
1255 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1257 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1258 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1260 // cancel FLAG_BACKFILLFULL for pools
1261 // which are no longer backfillfull too
1262 for (auto &pool
: tmp
.get_pools()) {
1263 auto p
= pool
.first
;
1264 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1265 // skip pools we have just marked as backfillfull/full above
1268 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1269 // and don't touch if currently is not backfillfull
1272 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1273 << "'s backfillfull flag" << dendl
;
1274 if (pending_inc
.new_pools
.count(p
) == 0) {
1275 pending_inc
.new_pools
[p
] = pool
.second
;
1277 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1280 if (!nearfull_pool_ids
.empty()) {
1281 for (auto &p
: nearfull_pool_ids
) {
1282 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1285 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1286 // make sure FLAG_FULL is truly set, so we are safe not
1287 // to set a extra (redundant) FLAG_NEARFULL flag
1288 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1291 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1292 // don't bother if pool is already marked as nearfull
1295 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1296 << "'s as nearfull" << dendl
;
1297 if (pending_inc
.new_pools
.count(p
) == 0) {
1298 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1300 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1302 // cancel FLAG_NEARFULL for pools
1303 // which are no longer nearfull too
1304 for (auto &pool
: tmp
.get_pools()) {
1305 auto p
= pool
.first
;
1306 if (full_pool_ids
.count(p
) ||
1307 backfillfull_pool_ids
.count(p
) ||
1308 nearfull_pool_ids
.count(p
)) {
1309 // skip pools we have just marked as
1310 // nearfull/backfillfull/full above
1313 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1314 // and don't touch if currently is not nearfull
1317 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1318 << "'s nearfull flag" << dendl
;
1319 if (pending_inc
.new_pools
.count(p
) == 0) {
1320 pending_inc
.new_pools
[p
] = pool
.second
;
1322 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1326 // min_compat_client?
1327 if (tmp
.require_min_compat_client
== 0) {
1328 auto mv
= tmp
.get_min_compat_client();
1329 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1330 << "required " << ceph_release_name(mv
) << dendl
;
1331 mon
->clog
->info() << "setting require_min_compat_client to currently "
1332 << "required " << ceph_release_name(mv
);
1333 pending_inc
.new_require_min_compat_client
= mv
;
1336 // upgrade to mimic?
1337 if (osdmap
.require_osd_release
< CEPH_RELEASE_MIMIC
&&
1338 tmp
.require_osd_release
>= CEPH_RELEASE_MIMIC
) {
1339 dout(10) << __func__
<< " first mimic+ epoch" << dendl
;
1340 // record this epoch as the deletion for all legacy removed_snaps
1341 for (auto& p
: tmp
.get_pools()) {
1342 // update every pool
1343 if (pending_inc
.new_pools
.count(p
.first
) == 0) {
1344 pending_inc
.new_pools
[p
.first
] = p
.second
;
1346 auto& pi
= pending_inc
.new_pools
[p
.first
];
1347 if (pi
.snap_seq
== 0) {
1348 // no snaps on this pool
1351 if ((pi
.flags
& (pg_pool_t::FLAG_SELFMANAGED_SNAPS
|
1352 pg_pool_t::FLAG_POOL_SNAPS
)) == 0) {
1353 if (!pi
.removed_snaps
.empty()) {
1354 pi
.flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
1356 pi
.flags
|= pg_pool_t::FLAG_POOL_SNAPS
;
1360 // Make all previously removed snaps appear to be removed in this
1361 // epoch. this populates removed_snaps_queue. The OSD will subtract
1362 // off its purged_snaps, as before, and this set will shrink over the
1363 // following epochs as the purged snaps are reported back through the
1365 OSDMap::snap_interval_set_t removed
;
1366 if (!p
.second
.removed_snaps
.empty()) {
1367 // different flavor of interval_set :(
1368 for (auto q
= p
.second
.removed_snaps
.begin();
1369 q
!= p
.second
.removed_snaps
.end();
1371 removed
.insert(q
.get_start(), q
.get_len());
1374 for (snapid_t s
= 1; s
<= pi
.get_snap_seq(); s
= s
+ 1) {
1375 if (pi
.snaps
.count(s
) == 0) {
1380 pending_inc
.new_removed_snaps
[p
.first
].union_of(removed
);
1382 dout(10) << __func__
<< " converting pool " << p
.first
1383 << " with " << p
.second
.removed_snaps
.size()
1384 << " legacy removed_snaps" << dendl
;
1385 string k
= make_snap_epoch_key(p
.first
, pending_inc
.epoch
);
1387 encode(p
.second
.removed_snaps
, v
);
1388 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1389 for (auto q
= p
.second
.removed_snaps
.begin();
1390 q
!= p
.second
.removed_snaps
.end();
1393 string k
= make_snap_key_value(p
.first
, q
.get_start(),
1394 q
.get_len(), pending_inc
.epoch
, &v
);
1395 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1399 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
&&
1400 tmp
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
1401 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1402 // add creating flags?
1403 for (auto& i
: tmp
.get_pools()) {
1404 if (pending_creatings
.still_creating_pool(i
.first
)) {
1405 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1407 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1408 pending_inc
.new_pools
[i
.first
] = i
.second
;
1410 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1413 // adjust blacklist items to all be TYPE_ANY
1414 for (auto& i
: tmp
.blacklist
) {
1416 a
.set_type(entity_addr_t::TYPE_ANY
);
1417 pending_inc
.new_blacklist
[a
] = i
.second
;
1418 pending_inc
.old_blacklist
.push_back(i
.first
);
1424 for (auto i
= pending_inc
.new_state
.begin();
1425 i
!= pending_inc
.new_state
.end();
1427 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1428 if (s
& CEPH_OSD_UP
)
1429 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1430 if (s
& CEPH_OSD_EXISTS
)
1431 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1433 for (auto i
= pending_inc
.new_up_client
.begin();
1434 i
!= pending_inc
.new_up_client
.end();
1436 //FIXME: insert cluster addresses too
1437 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1439 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1440 i
!= pending_inc
.new_weight
.end();
1442 if (i
->second
== CEPH_OSD_OUT
) {
1443 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1444 } else if (i
->second
== CEPH_OSD_IN
) {
1445 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1447 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1451 // features for osdmap and its incremental
1454 // encode full map and determine its crc
1457 tmp
.deepish_copy_from(osdmap
);
1458 tmp
.apply_incremental(pending_inc
);
1460 // determine appropriate features
1461 features
= tmp
.get_encoding_features();
1462 dout(10) << __func__
<< " encoding full map with "
1463 << ceph_release_name(tmp
.require_osd_release
)
1464 << " features " << features
<< dendl
;
1466 // the features should be a subset of the mon quorum's features!
1467 ceph_assert((features
& ~mon
->get_quorum_con_features()) == 0);
1470 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1471 pending_inc
.full_crc
= tmp
.get_crc();
1473 // include full map in the txn. note that old monitors will
1474 // overwrite this. new ones will now skip the local full map
1475 // encode and reload from this.
1476 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1480 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
1482 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1484 dout(20) << " full_crc " << tmp
.get_crc()
1485 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1487 /* put everything in the transaction */
1488 put_version(t
, pending_inc
.epoch
, bl
);
1489 put_last_committed(t
, pending_inc
.epoch
);
1492 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1493 p
!= pending_metadata
.end();
1495 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1496 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1497 p
!= pending_metadata_rm
.end();
1499 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1500 pending_metadata
.clear();
1501 pending_metadata_rm
.clear();
1504 if (tmp
.require_osd_release
>= CEPH_RELEASE_MIMIC
) {
1505 for (auto& i
: pending_inc
.new_removed_snaps
) {
1507 // all snaps removed this epoch
1508 string k
= make_snap_epoch_key(i
.first
, pending_inc
.epoch
);
1510 encode(i
.second
, v
);
1511 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1513 for (auto q
= i
.second
.begin();
1514 q
!= i
.second
.end();
1517 string k
= make_snap_key_value(i
.first
, q
.get_start(),
1518 q
.get_len(), pending_inc
.epoch
, &v
);
1519 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1522 for (auto& i
: pending_inc
.new_purged_snaps
) {
1523 for (auto q
= i
.second
.begin();
1524 q
!= i
.second
.end();
1527 string k
= make_snap_purged_key_value(i
.first
, q
.get_start(),
1528 q
.get_len(), pending_inc
.epoch
,
1530 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1536 health_check_map_t next
;
1537 tmp
.check_health(&next
);
1538 encode_health(next
, t
);
1541 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
1544 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
1548 auto p
= bl
.cbegin();
1551 catch (buffer::error
& e
) {
1553 *err
<< "osd." << osd
<< " metadata is corrupt";
1559 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
1561 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
1562 if (osdmap
.is_up(osd
)) {
1563 map
<string
,string
> meta
;
1564 load_metadata(osd
, meta
, nullptr);
1565 auto p
= meta
.find(field
);
1566 if (p
== meta
.end()) {
1567 (*out
)["unknown"]++;
1569 (*out
)[p
->second
]++;
1575 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
1577 map
<string
,int> by_val
;
1578 count_metadata(field
, &by_val
);
1579 f
->open_object_section(field
.c_str());
1580 for (auto& p
: by_val
) {
1581 f
->dump_int(p
.first
.c_str(), p
.second
);
1586 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
1588 map
<string
, string
> metadata
;
1589 int r
= load_metadata(osd
, metadata
, nullptr);
1593 auto it
= metadata
.find("osd_objectstore");
1594 if (it
== metadata
.end())
1600 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
1601 const pg_pool_t
&pool
,
1604 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1605 // since filestore osds could always join the pool later
1606 set
<int> checked_osds
;
1607 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
1608 vector
<int> up
, acting
;
1609 pg_t
pgid(ps
, pool_id
);
1610 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
1611 for (int osd
: up
) {
1612 if (checked_osds
.find(osd
) != checked_osds
.end())
1614 string objectstore_type
;
1615 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
1616 // allow with missing metadata, e.g. due to an osd never booting yet
1617 if (r
< 0 || objectstore_type
== "bluestore") {
1618 checked_osds
.insert(osd
);
1621 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
1628 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
1630 map
<string
,string
> m
;
1631 if (int r
= load_metadata(osd
, m
, err
))
1633 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
1634 f
->dump_string(p
->first
.c_str(), p
->second
);
1638 void OSDMonitor::print_nodes(Formatter
*f
)
1640 // group OSDs by their hosts
1641 map
<string
, list
<int> > osds
; // hostname => osd
1642 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
1643 map
<string
, string
> m
;
1644 if (load_metadata(osd
, m
, NULL
)) {
1647 map
<string
, string
>::iterator hostname
= m
.find("hostname");
1648 if (hostname
== m
.end()) {
1649 // not likely though
1652 osds
[hostname
->second
].push_back(osd
);
1655 dump_services(f
, osds
, "osd");
1658 void OSDMonitor::share_map_with_random_osd()
1660 if (osdmap
.get_num_up_osds() == 0) {
1661 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
1665 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
1667 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
1671 dout(10) << "committed, telling random " << s
->name
1672 << " all about it" << dendl
;
1674 // get feature of the peer
1675 // use quorum_con_features, if it's an anonymous connection.
1676 uint64_t features
= s
->con_features
? s
->con_features
:
1677 mon
->get_quorum_con_features();
1678 // whatev, they'll request more if they need it
1679 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
1680 s
->con
->send_message(m
);
1681 // NOTE: do *not* record osd has up to this epoch (as we do
1682 // elsewhere) as they may still need to request older values.
1685 version_t
OSDMonitor::get_trim_to() const
1687 if (mon
->get_quorum().empty()) {
1688 dout(10) << __func__
<< ": quorum not formed" << dendl
;
1693 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1694 if (!creating_pgs
.pgs
.empty()) {
1699 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
1701 << " blocking osdmap trim"
1702 " ('mon_debug_block_osdmap_trim' set to 'true')"
1708 epoch_t floor
= get_min_last_epoch_clean();
1709 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
1710 if (g_conf()->mon_osd_force_trim_to
> 0 &&
1711 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
1712 floor
= g_conf()->mon_osd_force_trim_to
;
1713 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
1715 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
1716 if (floor
+ min
> get_last_committed()) {
1717 if (min
< get_last_committed())
1718 floor
= get_last_committed() - min
;
1722 if (floor
> get_first_committed())
1728 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
1730 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
1731 // also scan osd epochs
1732 // don't trim past the oldest reported osd epoch
1733 for (auto& osd_epoch
: osd_epochs
) {
1734 if (osd_epoch
.second
< floor
) {
1735 floor
= osd_epoch
.second
;
1741 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
1744 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
1746 get_version_full(first
, bl
);
1747 put_version_full(tx
, first
, bl
);
1749 if (has_osdmap_manifest
&&
1750 first
> osdmap_manifest
.get_first_pinned()) {
1751 _prune_update_trimmed(tx
, first
);
1756 /* full osdmap prune
1758 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
1761 void OSDMonitor::load_osdmap_manifest()
1763 bool store_has_manifest
=
1764 mon
->store
->exists(get_service_name(), "osdmap_manifest");
1766 if (!store_has_manifest
) {
1767 if (!has_osdmap_manifest
) {
1771 dout(20) << __func__
1772 << " dropping osdmap manifest from memory." << dendl
;
1773 osdmap_manifest
= osdmap_manifest_t();
1774 has_osdmap_manifest
= false;
1778 dout(20) << __func__
1779 << " osdmap manifest detected in store; reload." << dendl
;
1781 bufferlist manifest_bl
;
1782 int r
= get_value("osdmap_manifest", manifest_bl
);
1784 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
1785 ceph_abort_msg("error reading manifest");
1787 osdmap_manifest
.decode(manifest_bl
);
1788 has_osdmap_manifest
= true;
1790 dout(10) << __func__
<< " store osdmap manifest pinned ("
1791 << osdmap_manifest
.get_first_pinned()
1793 << osdmap_manifest
.get_last_pinned()
1798 bool OSDMonitor::should_prune() const
1800 version_t first
= get_first_committed();
1801 version_t last
= get_last_committed();
1802 version_t min_osdmap_epochs
=
1803 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
1804 version_t prune_min
=
1805 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
1806 version_t prune_interval
=
1807 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
1808 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
1809 version_t last_to_pin
= last
- min_osdmap_epochs
;
1811 // Make it or break it constraints.
1813 // If any of these conditions fails, we will not prune, regardless of
1814 // whether we have an on-disk manifest with an on-going pruning state.
1816 if ((last
- first
) <= min_osdmap_epochs
) {
1817 // between the first and last committed epochs, we don't have
1818 // enough epochs to trim, much less to prune.
1819 dout(10) << __func__
1820 << " currently holding only " << (last
- first
)
1821 << " epochs (min osdmap epochs: " << min_osdmap_epochs
1822 << "); do not prune."
1826 } else if ((last_to_pin
- first
) < prune_min
) {
1827 // between the first committed epoch and the last epoch we would prune,
1828 // we simply don't have enough versions over the minimum to prune maps.
1829 dout(10) << __func__
1830 << " could only prune " << (last_to_pin
- first
)
1831 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
1832 " is less than the required minimum (" << prune_min
<< ")"
1836 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
1837 dout(10) << __func__
1838 << " we have pruned as far as we can; do not prune."
1842 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
1843 dout(10) << __func__
1844 << " not enough epochs to form an interval (last pinned: "
1845 << last_pinned
<< ", last to pin: "
1846 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
1851 dout(15) << __func__
1852 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
1853 << " lc (" << first
<< ".." << last
<< ")"
1858 void OSDMonitor::_prune_update_trimmed(
1859 MonitorDBStore::TransactionRef tx
,
1862 dout(10) << __func__
1863 << " first " << first
1864 << " last_pinned " << osdmap_manifest
.get_last_pinned()
1865 << " last_pinned " << osdmap_manifest
.get_last_pinned()
1868 osdmap_manifest_t manifest
= osdmap_manifest
;
1870 if (!manifest
.is_pinned(first
)) {
1871 manifest
.pin(first
);
1874 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
1875 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
1876 manifest
.pinned
.erase(p
, p_end
);
1877 ceph_assert(manifest
.get_first_pinned() == first
);
1879 if (manifest
.get_last_pinned() == first
+1 ||
1880 manifest
.pinned
.size() == 1) {
1881 // we reached the end of the line, as pinned maps go; clean up our
1882 // manifest, and let `should_prune()` decide whether we should prune
1884 tx
->erase(get_service_name(), "osdmap_manifest");
1889 manifest
.encode(bl
);
1890 tx
->put(get_service_name(), "osdmap_manifest", bl
);
1893 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
1895 dout(1) << __func__
<< dendl
;
1897 version_t pin_first
;
1899 // verify constrainsts on stable in-memory state
1900 if (!has_osdmap_manifest
) {
1901 // we must have never pruned, OR if we pruned the state must no longer
1902 // be relevant (i.e., the state must have been removed alongside with
1903 // the trim that *must* have removed past the last pinned map in a
1905 ceph_assert(osdmap_manifest
.pinned
.empty());
1906 ceph_assert(!mon
->store
->exists(get_service_name(), "osdmap_manifest"));
1907 pin_first
= get_first_committed();
1910 // we must have pruned in the past AND its state is still relevant
1911 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
1912 // and thus we still hold a manifest in the store).
1913 ceph_assert(!osdmap_manifest
.pinned
.empty());
1914 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
1915 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
1917 dout(10) << __func__
1918 << " first_pinned " << osdmap_manifest
.get_first_pinned()
1919 << " last_pinned " << osdmap_manifest
.get_last_pinned()
1922 pin_first
= osdmap_manifest
.get_last_pinned();
1925 manifest
.pin(pin_first
);
1928 bool OSDMonitor::_prune_sanitize_options() const
1930 uint64_t prune_interval
=
1931 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
1932 uint64_t prune_min
=
1933 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
1935 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
1939 if (prune_interval
== 0) {
1941 << " prune is enabled BUT prune interval is zero; abort."
1944 } else if (prune_interval
== 1) {
1946 << " prune interval is equal to one, which essentially means"
1947 " no pruning; abort."
1951 if (prune_min
== 0) {
1953 << " prune is enabled BUT prune min is zero; abort."
1957 if (prune_interval
> prune_min
) {
1959 << " impossible to ascertain proper prune interval because"
1960 << " it is greater than the minimum prune epochs"
1961 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
1966 if (txsize
< prune_interval
- 1) {
1968 << "'mon_osdmap_full_prune_txsize' (" << txsize
1969 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
1970 << "); abort." << dendl
;
1976 bool OSDMonitor::is_prune_enabled() const {
1977 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
1980 bool OSDMonitor::is_prune_supported() const {
1981 return mon
->get_required_mon_features().contains_any(
1982 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
1987 * @returns true if has side-effects; false otherwise.
1989 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
1991 bool enabled
= is_prune_enabled();
1993 dout(1) << __func__
<< " osdmap full prune "
1994 << ( enabled
? "enabled" : "disabled")
1997 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2001 // we are beyond the minimum prune versions, we need to remove maps because
2002 // otherwise the store will grow unbounded and we may end up having issues
2003 // with available disk space or store hangs.
2005 // we will not pin all versions. We will leave a buffer number of versions.
2006 // this allows us the monitor to trim maps without caring too much about
2007 // pinned maps, and then allow us to use another ceph-mon without these
2008 // capabilities, without having to repair the store.
2010 osdmap_manifest_t manifest
= osdmap_manifest
;
2012 version_t first
= get_first_committed();
2013 version_t last
= get_last_committed();
2015 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2016 version_t last_pinned
= manifest
.get_last_pinned();
2017 uint64_t prune_interval
=
2018 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2020 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2022 prune_init(manifest
);
2024 // we need to get rid of some osdmaps
2027 << " lc (" << first
<< " .. " << last
<< ")"
2028 << " last_pinned " << last_pinned
2029 << " interval " << prune_interval
2030 << " last_to_pin " << last_to_pin
2033 // We will be erasing maps as we go.
2035 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2037 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2038 // we stop pruning. We could prune the maps between `next_to_pin` and
2039 // `last_to_pin`, but by not doing it we end up with neater pruned
2040 // intervals, aligned with `prune_interval`. Besides, this should not be a
2041 // problem as long as `prune_interval` is set to a sane value, instead of
2042 // hundreds or thousands of maps.
2044 auto map_exists
= [this](version_t v
) {
2045 string k
= mon
->store
->combine_strings("full", v
);
2046 return mon
->store
->exists(get_service_name(), k
);
2049 // 'interval' represents the number of maps from the last pinned
2050 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2051 // version 11 next; all intermediate versions will be removed.
2053 // 'txsize' represents the maximum number of versions we'll be removing in
2054 // this iteration. If 'txsize' is large enough to perform multiple passes
2055 // pinning and removing maps, we will do so; if not, we'll do at least one
2056 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2057 // ensure that we never go *over* the maximum.
2059 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2060 uint64_t removal_interval
= prune_interval
- 1;
2062 if (txsize
< removal_interval
) {
2064 << " setting txsize to removal interval size ("
2065 << removal_interval
<< " versions"
2067 txsize
= removal_interval
;
2069 ceph_assert(removal_interval
> 0);
2071 uint64_t num_pruned
= 0;
2072 while (num_pruned
+ removal_interval
<= txsize
) {
2073 last_pinned
= manifest
.get_last_pinned();
2075 if (last_pinned
+ prune_interval
> last_to_pin
) {
2078 ceph_assert(last_pinned
< last_to_pin
);
2080 version_t next_pinned
= last_pinned
+ prune_interval
;
2081 ceph_assert(next_pinned
<= last_to_pin
);
2082 manifest
.pin(next_pinned
);
2084 dout(20) << __func__
2085 << " last_pinned " << last_pinned
2086 << " next_pinned " << next_pinned
2087 << " num_pruned " << num_pruned
2088 << " removal interval (" << (last_pinned
+1)
2089 << ".." << (next_pinned
-1) << ")"
2090 << " txsize " << txsize
<< dendl
;
2092 ceph_assert(map_exists(last_pinned
));
2093 ceph_assert(map_exists(next_pinned
));
2095 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2096 ceph_assert(!manifest
.is_pinned(v
));
2098 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2099 string full_key
= mon
->store
->combine_strings("full", v
);
2100 tx
->erase(get_service_name(), full_key
);
2105 ceph_assert(num_pruned
> 0);
2108 manifest
.encode(bl
);
2109 tx
->put(get_service_name(), "osdmap_manifest", bl
);
/**
 * Read-side dispatch for all messages routed to the OSD monitor.
 *
 * Returns true if the request was fully handled here (no paxos update
 * needed); false means the message must proceed to prepare_update().
 *
 * NOTE(review): several `case` labels and the try/default skeleton were
 * lost in extraction and are reconstructed here — confirm against the
 * upstream file before committing.
 */
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command: reply with EINVAL rather than crashing the mon
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // every routed message type must be listed above
    ceph_abort();
    return true;
  }
}
/**
 * Write-side dispatch: apply a message's effect to pending_inc.
 *
 * Returns true if a paxos proposal should be triggered for this update;
 * false if nothing was changed.
 *
 * NOTE(review): some `case` labels and the default branch were lost in
 * extraction and are reconstructed — confirm against upstream.
 */
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command: reply with EINVAL rather than crashing the mon
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);

  default:
    ceph_abort();
  }

  return false;
}
/**
 * Decide whether the pending incremental warrants an immediate paxos
 * proposal; may also shorten @p delay.
 *
 * NOTE(review): the early-return and osd_weight-clearing lines were
 * dropped by extraction and are reconstructed here.
 */
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately!  any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  return PaxosService::should_propose(delay);
}
2241 // ---------------------------
/**
 * Answer an explicit MMonGetOSDMap request with a batch of full and/or
 * incremental maps, bounded by osd_map_message_max (count) and
 * osd_map_message_max_bytes (size) so one reply cannot grow unbounded.
 *
 * Maps are encoded with the requester's connection features when known,
 * falling back to the quorum's common feature set.
 */
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());

  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first; both budgets are shared with the incremental loop below
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2282 // ---------------------------
/**
 * Common gatekeeper for OSD-originated reports.
 *
 * @return true if the request must be REJECTED (missing session, missing
 *         osd 'x' capability, or fsid mismatch); false if it may proceed.
 *         Note the inverted sense: callers treat true as "drop it".
 */
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
	    << session->caps << dendl;
    return true;
  }
  if (fsid != mon->monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
	    << " != " << mon->monmap->fsid << dendl;
    return true;
  }
  return false;
}
/**
 * Filter an MOSDFailure report before it reaches prepare_failure().
 *
 * Drops reports that fail permission/fsid checks, come from a dead or
 * stale reporter, target an osd that is already down / has the wrong
 * address / was already marked failed, or target an osd protected by
 * nodown.  In most drop paths the reporter is brought up to date with
 * send_incremental() so it stops re-reporting.
 *
 * @return true if handled here (dropped); false to pass on to prepare.
 */
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }

  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  // NOTE(review): 'didit:' label and no_reply were dropped by extraction;
  // reconstructed — confirm against upstream.
  mon->no_reply(op);
  return true;
}
/**
 * Completion that acknowledges an MOSDMarkMeDown request by echoing a
 * MOSDMarkMeDown back to the requesting OSD once the proposal (or the
 * preprocess short-circuit) finishes.
 *
 * NOTE(review): the member/constructor lines and the reply-message
 * argument list were dropped by extraction; reconstructed from the
 * visible fragments — confirm against upstream before relying on the
 * exact argument order.
 */
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int) override {
    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
    osdmon->mon->send_reply(
      op,
      new MOSDMarkMeDown(
	m->fsid,
	m->target_osd,
	m->target_addrs,
	m->get_epoch(),
	false));   // ACK itself does not request an ack
  }
  ~C_AckMarkedDown() override {
  }
};
/**
 * Filter an OSD's self-initiated "mark me down" request.
 *
 * Rejected requests (bad source, dead/stale sender, nodown set) are still
 * ACKed when the sender asked for an ack, so it does not block shutdown
 * waiting for a reply that will never come.
 *
 * @return true if handled here; false to continue to prepare_mark_me_down.
 */
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  // NOTE(review): 'reply:' label and completion call reconstructed.
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
/**
 * Queue the target OSD's down transition in pending_inc and ACK the
 * requester once the proposal commits.
 *
 * Assigning CEPH_OSD_UP into new_state is the incremental map's XOR
 * convention: flipping the UP bit on an up osd marks it down.
 */
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int target_osd = m->target_osd;

  // preprocess_mark_me_down already screened these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
/**
 * May osd @p i be marked down right now?
 *
 * False when the osd carries the nodown flag, or when doing so would push
 * the cluster's up ratio (accounting for already-pending state flips)
 * below mon_osd_min_up_ratio.
 */
bool OSDMonitor::can_mark_down(int i)
{
  if (osdmap.is_nodown(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as nodown, "
	    << "will not mark it down" << dendl;
    return false;
  }

  int num_osds = osdmap.get_num_osds();
  if (num_osds == 0) {
    dout(5) << __func__ << " no osds" << dendl;
    return false;
  }
  // count osds already queued to go down in this proposal
  int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
  float up_ratio = (float)up / (float)num_osds;
  if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
    dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
	    << g_conf()->mon_osd_min_up_ratio
	    << ", will not mark osd." << i << " down" << dendl;
    return false;
  }
  return true;
}
2480 bool OSDMonitor::can_mark_up(int i
)
2482 if (osdmap
.is_noup(i
)) {
2483 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
2484 << "will not mark it up" << dendl
;
/**
 * May an osd be marked out right now?
 *
 * False when osd @p i carries the noout flag, or when marking out would
 * push the in ratio (accounting for pending flips) below
 * mon_osd_min_in_ratio.
 *
 * @note the parameter @p i apparently only exists here so we can output the
 * osd's id on messages; pass a negative value when no specific osd applies.
 */
bool OSDMonitor::can_mark_out(int i)
{
  if (osdmap.is_noout(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as noout, "
	    << "will not mark it out" << dendl;
    return false;
  }

  int num_osds = osdmap.get_num_osds();
  if (num_osds == 0) {
    dout(5) << __func__ << " no osds" << dendl;
    return false;
  }
  int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
  float in_ratio = (float)in / (float)num_osds;
  if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
    // NOTE(review): the i>=0 selector between the two messages was dropped
    // by extraction; reconstructed — confirm against upstream.
    if (i >= 0)
      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
	      << g_conf()->mon_osd_min_in_ratio
	      << ", will not mark osd." << i << " out" << dendl;
    else
      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
	      << g_conf()->mon_osd_min_in_ratio
	      << ", will not mark osds out" << dendl;
    return false;
  }

  return true;
}
2525 bool OSDMonitor::can_mark_in(int i
)
2527 if (osdmap
.is_noin(i
)) {
2528 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
2529 << "will not mark it in" << dendl
;
2536 bool OSDMonitor::check_failures(utime_t now
)
2538 bool found_failure
= false;
2539 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2540 p
!= failure_info
.end();
2542 if (can_mark_down(p
->first
)) {
2543 found_failure
|= check_failure(now
, p
->first
, p
->second
);
2546 return found_failure
;
/**
 * Decide whether accumulated reports justify marking @p target_osd down.
 *
 * The base grace period (osd_heartbeat_grace) is optionally stretched by
 * an exponentially-decayed estimate of the target's and its reporters'
 * historical "lagginess", so a habitually slow osd is not flapped.  The
 * reporter quorum is counted per CRUSH subtree of type
 * mon_osd_reporter_subtree_level, so multiple reports from one failure
 * domain count once.
 *
 * @return true if the osd was queued for down in pending_inc.
 */
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  // NOTE(review): decay_k declaration was dropped by extraction;
  // reconstructed — confirm against upstream.
  double decay_k = 0;
  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    double halflife = (double)g_conf()->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
	     << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy.  this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  ceph_assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf()->mon_osd_adjust_heartbeat_grace) {
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    // average the reporters' laggy contribution into the grace
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
	   << dendl;

  if (failed_for >= grace &&
      reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
	    << " down" << dendl;
    // XOR convention: setting the UP bit on an up osd marks it down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
		      << osdmap.crush->get_full_location_ordered_string(
			target_osd)
		      << ") ("
		      << (int)reporters_by_subtree.size()
		      << " reporters from different "
		      << reporter_subtree_level << " after "
		      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
/**
 * Immediately queue @p target_osd for down, bypassing the grace/quorum
 * logic in check_failure().  Used for "immediate" reports (e.g. a peer
 * got connection refused, reported by osd @p by).
 */
void OSDMonitor::force_failure(int target_osd, int by)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // XOR convention: setting the UP bit on an up osd marks it down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;

  mon->clog->info() << "osd." << target_osd << " failed ("
		    << osdmap.crush->get_full_location_ordered_string(target_osd)
		    << ") (connection refused reported by osd." << by << ")";
  return;
}
/**
 * Record or cancel a failure report against an osd.
 *
 * A positive report is either forced through immediately (is_immediate)
 * or accumulated in failure_info and re-evaluated via check_failure().
 * A negative report cancels this reporter's earlier claim, dropping the
 * whole failure_info entry when no reporters remain.
 *
 * @return true to trigger a paxos proposal; false otherwise.
 *
 * NOTE(review): parts of the if/else skeleton were dropped by extraction
 * and are reconstructed — confirm against upstream.
 */
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      mon->no_reply(op);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a newer report from the same reporter supersedes the old op
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
	mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
/**
 * After a map change, drop failure_info entries for osds that are now
 * down and answer their queued report ops with the latest map.  Entries
 * for osds that are still up are kept for further evaluation.
 *
 * NOTE(review): loop-advance / no_reply / pop_front lines were dropped by
 * extraction and are reconstructed — confirm against upstream.
 */
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // erase before advancing; post-increment keeps the iterator valid
      failure_info.erase(p++);

      while (!ls.empty()) {
	MonOpRequestRef o = ls.front();
	if (o) {
	  o->mark_event(__func__);
	  MOSDFailure *m = o->get_req<MOSDFailure>();
	  send_latest(o, m->get_epoch());
	  mon->no_reply(o);
	}
	ls.pop_front();
      }
    }
  }
}
2748 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
2750 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
2752 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2753 p
!= failure_info
.end();
2755 p
->second
.take_report_messages(ls
);
2757 failure_info
.clear();
/**
 * Screen an MOSDBoot before it mutates the map: permission/fsid checks,
 * feature-bit gating (luminous/jewel/kraken/recovery-deletes required;
 * post-nautilus OSDs blocked until require_osd_release >= nautilus;
 * pglog_hardlimit flag enforcement), duplicate-boot short circuit, uuid
 * clash detection, stale-boot detection, and the noup flag.
 *
 * @return true if handled here; false to continue to prepare_boot.
 *
 * NOTE(review): the goto-ignore skeleton and several closing braces were
 * dropped by extraction and are reconstructed — confirm against upstream.
 */
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure upgrades stop at nautilus
  if (HAVE_FEATURE(m->osd_features, SERVER_O) &&
      osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
    mon->clog->info() << "disallowing boot of post-nautilus OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
/**
 * Apply an accepted MOSDBoot to pending_inc.
 *
 * Three cases: (1) osd appears up in the map -> queue a down for the old
 * instance and retry the boot after the proposal; (2) boot already
 * prepared -> just wait; (3) new boot -> record the client/cluster/
 * heartbeat addresses, weight, uuid, last-clean interval, laggy xinfo
 * decay, shared features, and possibly an automatic mark-in, then wake
 * the OSD via C_Booted once committed.
 *
 * NOTE(review): large parts of the if/else skeleton (braces, else lines,
 * the weight/uuid guards) were dropped by extraction and are
 * reconstructed — confirm against upstream before relying on details.
 */
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // decay the laggy statistics on a clean boot, grow them otherwise
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
/**
 * Finish a boot: optionally log it to the cluster log and send the OSD
 * all maps newer than its current epoch.
 *
 * @param logit  whether to emit the "boot" cluster-log line (false for
 *               duplicate boots short-circuited in preprocess_boot).
 */
void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << "_booted " << m->get_orig_source_inst()
	  << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit) {
    mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
		      << " boot";
  }

  send_latest(op, m->sb.current_epoch+1);
}
/**
 * Screen an MOSDFull (osd reporting its own nearfull/backfillfull/full
 * state).  Drops messages from unauthorized sessions, nonexistent or
 * stale osd instances, and no-op requests whose flags already match the
 * map (replying with the current map in that case).
 *
 * @return true if handled here; false to continue to prepare_full.
 */
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // only these three flags may be toggled via MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
/**
 * Apply an osd's requested nearfull/backfillfull/full flag changes to
 * pending_inc, then reply with the map once the proposal commits.
 *
 * new_state entries are XOR deltas against the committed map, so the
 * delta is computed as (committed_state & mask) ^ want_state, after
 * clearing any mask bits already pending for this osd.
 */
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending full-state bits before recomputing the delta
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
/**
 * Screen an MOSDAlive (up_thru advertisement).  Drops unauthorized or
 * stale senders; answers duplicates (up_thru already recorded) with the
 * map directly.
 *
 * @return true if handled here; false to continue to prepare_alive.
 */
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
/**
 * Record an osd's advertised up_thru epoch in pending_inc and reply with
 * the map once the proposal commits.
 */
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  if (0) {  // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version); // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
/**
 * Send the requester every map from epoch @p e onward.
 *
 * NOTE(review): the trailing dendl/send_latest lines were dropped by
 * extraction and are reconstructed — confirm against upstream.
 */
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
/**
 * Screen an MOSDPGCreated notification: permission check only, then
 * forward to the leader via prepare (returning false).  No reply is ever
 * sent for this message type.
 */
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGCreated*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  mon->no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
/**
 * Record a "pg created" notification from an active osd so the pg can be
 * dropped from the pending-creates tracking on the next map update.
 *
 * @return true to propose (pg recorded); false if the sender is not a
 *         currently-up osd with matching addresses.
 */
bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGCreated*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  auto src = m->get_orig_source();
  auto from = src.num();
  if (!src.is_osd() ||
      !mon->osdmon()->osdmap.is_up(from) ||
      !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
    return false;
  }
  pending_created_pgs.push_back(m->pgid);
  return true;
}
3257 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3259 op
->mark_osdmon_event(__func__
);
3260 auto m
= static_cast<MOSDPGReadyToMerge
*>(op
->get_req());
3261 dout(10) << __func__
<< " " << *m
<< dendl
;
3262 const pg_pool_t
*pi
;
3263 auto session
= op
->get_session();
3265 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3268 if (!session
->is_capable("osd", MON_CAP_X
)) {
3269 derr
<< __func__
<< " received from entity "
3270 << "with insufficient privileges " << session
->caps
<< dendl
;
3273 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3275 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3278 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3279 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3282 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3283 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3286 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3287 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3297 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3299 op
->mark_osdmon_event(__func__
);
3300 auto m
= static_cast<MOSDPGReadyToMerge
*>(op
->get_req());
3301 dout(10) << __func__
<< " " << *m
<< dendl
;
3303 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
3304 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
3306 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
3307 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
3308 p
.get_pg_num_pending() > m
->pgid
.ps()) {
3309 dout(10) << __func__
3310 << " race with concurrent pg_num[_pending] update, will retry"
3312 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3317 p
.dec_pg_num(m
->pgid
,
3321 m
->last_epoch_started
,
3322 m
->last_epoch_clean
);
3323 p
.last_change
= pending_inc
.epoch
;
3325 // back off the merge attempt!
3326 p
.set_pg_num_pending(p
.get_pg_num());
3329 // force pre-nautilus clients to resend their ops, since they
3330 // don't understand pg_num_pending changes form a new interval
3331 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
3333 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
3335 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
3338 prob
> (double)(rand() % 1000)/1000.0) {
3339 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
3340 auto n
= new MMonCommand(mon
->monmap
->get_fsid());
3341 n
->set_connection(m
->get_connection());
3342 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3343 osdmap
.get_pool_name(m
->pgid
.pool()) +
3344 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3345 stringify(m
->pgid
.ps() + 1) + "\"}" };
3346 MonOpRequestRef nop
= mon
->op_tracker
.create_request
<MonOpRequest
>(n
);
3347 nop
->set_type_service();
3348 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
3350 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3359 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
3361 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
3362 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
3363 mempool::osdmap::vector
<int> empty
;
3364 int from
= m
->get_orig_source().num();
3365 size_t ignore_cnt
= 0;
3368 MonSession
*session
= op
->get_session();
3371 if (!session
->is_capable("osd", MON_CAP_X
)) {
3372 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3373 << session
->caps
<< dendl
;
3377 if (!osdmap
.is_up(from
) ||
3378 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3379 dout(7) << "ignoring pgtemp message from down "
3380 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3389 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3390 dout(20) << " " << p
->first
3391 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
3392 << " -> " << p
->second
<< dendl
;
3394 // does the pool exist?
3395 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
3397 * 1. If the osdmap does not have the pool, it means the pool has been
3398 * removed in-between the osd sending this message and us handling it.
3399 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
3400 * not exist in the pending either, as the osds would not send a
3401 * message about a pool they know nothing about (yet).
3402 * 3. However, if the pool does exist in the pending, then it must be a
3403 * new pool, and not relevant to this message (see 1).
3405 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3406 << ": pool has been removed" << dendl
;
3411 int acting_primary
= -1;
3412 osdmap
.pg_to_up_acting_osds(
3413 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
3414 if (acting_primary
!= from
) {
3415 /* If the source isn't the primary based on the current osdmap, we know
3416 * that the interval changed and that we can discard this message.
3417 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
3418 * which of two pg temp mappings on the same pg is more recent.
3420 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3421 << ": primary has changed" << dendl
;
3427 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
3428 osdmap
.primary_temp
->count(p
->first
)))
3431 // NOTE: we assume that this will clear pg_primary, so consider
3432 // an existing pg_primary field to imply a change
3433 if (p
->second
.size() &&
3434 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
3435 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
3436 osdmap
.primary_temp
->count(p
->first
)))
3440 // should we ignore all the pgs?
3441 if (ignore_cnt
== m
->pg_temp
.size())
3444 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
3445 _reply_map(op
, m
->map_epoch
);
3452 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
3454 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
3455 auto ut
= pending_inc
.new_up_thru
.find(from
);
3456 if (ut
!= pending_inc
.new_up_thru
.end()) {
3457 old_up_thru
= ut
->second
;
3459 if (up_thru
> old_up_thru
) {
3460 // set up_thru too, so the osd doesn't have to ask again
3461 pending_inc
.new_up_thru
[from
] = up_thru
;
3465 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
3467 op
->mark_osdmon_event(__func__
);
3468 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
3469 int from
= m
->get_orig_source().num();
3470 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
3471 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3472 uint64_t pool
= p
->first
.pool();
3473 if (pending_inc
.old_pools
.count(pool
)) {
3474 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3475 << ": pool pending removal" << dendl
;
3478 if (!osdmap
.have_pg_pool(pool
)) {
3479 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3480 << ": pool has been removed" << dendl
;
3483 pending_inc
.new_pg_temp
[p
->first
] =
3484 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
3486 // unconditionally clear pg_primary (until this message can encode
3487 // a change for that, too.. at which point we need to also fix
3488 // preprocess_pg_temp)
3489 if (osdmap
.primary_temp
->count(p
->first
) ||
3490 pending_inc
.new_primary_temp
.count(p
->first
))
3491 pending_inc
.new_primary_temp
[p
->first
] = -1;
3494 // set up_thru too, so the osd doesn't have to ask again
3495 update_up_thru(from
, m
->map_epoch
);
3497 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
3504 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
3506 op
->mark_osdmon_event(__func__
);
3507 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
3508 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
3510 // check privilege, ignore if failed
3511 MonSession
*session
= op
->get_session();
3515 if (!session
->caps
.is_capable(
3517 CEPH_ENTITY_TYPE_MON
,
3518 session
->entity_name
,
3519 "osd", "osd pool rmsnap", {}, true, true, false,
3520 session
->get_peer_socket_addr())) {
3521 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
3522 << session
->caps
<< dendl
;
3526 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
3527 q
!= m
->snaps
.end();
3529 if (!osdmap
.have_pg_pool(q
->first
)) {
3530 dout(10) << " ignoring removed_snaps " << q
->second
<< " on non-existent pool " << q
->first
<< dendl
;
3533 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
3534 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
3535 p
!= q
->second
.end();
3537 if (*p
> pi
->get_snap_seq() ||
3538 !pi
->removed_snaps
.contains(*p
))
3547 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
3549 op
->mark_osdmon_event(__func__
);
3550 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
3551 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
3553 for (map
<int, vector
<snapid_t
> >::iterator p
= m
->snaps
.begin();
3554 p
!= m
->snaps
.end();
3557 if (!osdmap
.have_pg_pool(p
->first
)) {
3558 dout(10) << " ignoring removed_snaps " << p
->second
<< " on non-existent pool " << p
->first
<< dendl
;
3562 pg_pool_t
& pi
= osdmap
.pools
[p
->first
];
3563 for (vector
<snapid_t
>::iterator q
= p
->second
.begin();
3564 q
!= p
->second
.end();
3566 if (!pi
.removed_snaps
.contains(*q
) &&
3567 (!pending_inc
.new_pools
.count(p
->first
) ||
3568 !pending_inc
.new_pools
[p
->first
].removed_snaps
.contains(*q
))) {
3569 pg_pool_t
*newpi
= pending_inc
.get_new_pool(p
->first
, &pi
);
3570 newpi
->removed_snaps
.insert(*q
);
3571 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
3572 dout(10) << " pool " << p
->first
<< " removed_snaps added " << *q
3573 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
3574 if (*q
> newpi
->get_snap_seq()) {
3575 dout(10) << " pool " << p
->first
<< " snap_seq "
3576 << newpi
->get_snap_seq() << " -> " << *q
<< dendl
;
3577 newpi
->set_snap_seq(*q
);
3579 newpi
->set_snap_epoch(pending_inc
.epoch
);
3580 pending_inc
.new_removed_snaps
[p
->first
].insert(*q
);
3588 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
3590 op
->mark_osdmon_event(__func__
);
3592 auto session
= op
->get_session();
3595 dout(10) << __func__
<< " no monitor session!" << dendl
;
3598 if (!session
->is_capable("osd", MON_CAP_X
)) {
3599 derr
<< __func__
<< " received from entity "
3600 << "with insufficient privileges " << session
->caps
<< dendl
;
3603 // Always forward the beacon to the leader, even if they are the same as
3604 // the old one. The leader will mark as down osds that haven't sent
3605 // beacon for a few minutes.
3609 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
3611 op
->mark_osdmon_event(__func__
);
3612 const auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
3613 const auto src
= beacon
->get_orig_source();
3614 dout(10) << __func__
<< " " << *beacon
3615 << " from " << src
<< dendl
;
3616 int from
= src
.num();
3618 if (!src
.is_osd() ||
3619 !osdmap
.is_up(from
) ||
3620 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
3621 if (src
.is_osd() && !osdmap
.is_up(from
)) {
3622 // share some new maps with this guy in case it may not be
3623 // aware of its own deadness...
3624 send_latest(op
, beacon
->version
+1);
3626 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
3630 last_osd_report
[from
] = ceph_clock_now();
3631 osd_epochs
[from
] = beacon
->version
;
3633 for (const auto& pg
: beacon
->pgs
) {
3634 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
3642 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
3644 op
->mark_osdmon_event(__func__
);
3645 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
3646 << " start " << start
<< dendl
;
3650 send_incremental(op
, start
);
3654 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
3656 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
, features
);
3657 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
3658 r
->oldest_map
= get_first_committed();
3659 r
->newest_map
= osdmap
.get_epoch();
3663 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
3665 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
3666 << std::hex
<< features
<< std::dec
<< dendl
;
3667 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
, features
);
3668 m
->oldest_map
= get_first_committed();
3669 m
->newest_map
= osdmap
.get_epoch();
3671 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
3673 int err
= get_version(e
, features
, bl
);
3675 ceph_assert(bl
.length());
3676 // if (get_version(e, bl) > 0) {
3677 dout(20) << "build_incremental inc " << e
<< " "
3678 << bl
.length() << " bytes" << dendl
;
3679 m
->incremental_maps
[e
] = bl
;
3681 ceph_assert(err
== -ENOENT
);
3682 ceph_assert(!bl
.length());
3683 get_version_full(e
, features
, bl
);
3684 if (bl
.length() > 0) {
3685 //else if (get_version("full", e, bl) > 0) {
3686 dout(20) << "build_incremental full " << e
<< " "
3687 << bl
.length() << " bytes" << dendl
;
3690 ceph_abort(); // we should have all maps.
3697 void OSDMonitor::send_full(MonOpRequestRef op
)
3699 op
->mark_osdmon_event(__func__
);
3700 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
3701 mon
->send_reply(op
, build_latest_full(op
->get_session()->con_features
));
3704 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
3706 op
->mark_osdmon_event(__func__
);
3708 MonSession
*s
= op
->get_session();
3712 // oh, we can tell the other mon to do it
3713 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
3715 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
3716 r
->send_osdmap_first
= first
;
3717 s
->proxy_con
->send_message(r
);
3718 op
->mark_event("reply: send routed send_osdmap_first reply");
3721 send_incremental(first
, s
, false, op
);
3725 void OSDMonitor::send_incremental(epoch_t first
,
3726 MonSession
*session
,
3728 MonOpRequestRef req
)
3730 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
3731 << " to " << session
->name
<< dendl
;
3733 // get feature of the peer
3734 // use quorum_con_features, if it's an anonymous connection.
3735 uint64_t features
= session
->con_features
? session
->con_features
:
3736 mon
->get_quorum_con_features();
3738 if (first
<= session
->osd_epoch
) {
3739 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
3740 << session
->osd_epoch
<< dendl
;
3741 first
= session
->osd_epoch
+ 1;
3744 if (first
< get_first_committed()) {
3745 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
3746 m
->oldest_map
= get_first_committed();
3747 m
->newest_map
= osdmap
.get_epoch();
3749 // share removed snaps during the gap
3750 get_removed_snaps_range(first
, m
->oldest_map
, &m
->gap_removed_snaps
);
3752 first
= get_first_committed();
3754 int err
= get_version_full(first
, features
, bl
);
3755 ceph_assert(err
== 0);
3756 ceph_assert(bl
.length());
3757 dout(20) << "send_incremental starting with base full "
3758 << first
<< " " << bl
.length() << " bytes" << dendl
;
3759 m
->maps
[first
] = bl
;
3762 mon
->send_reply(req
, m
);
3763 session
->osd_epoch
= first
;
3766 session
->con
->send_message(m
);
3767 session
->osd_epoch
= first
;
3772 while (first
<= osdmap
.get_epoch()) {
3773 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
3774 osdmap
.get_epoch());
3775 MOSDMap
*m
= build_incremental(first
, last
, features
);
3778 // send some maps. it may not be all of them, but it will get them
3780 mon
->send_reply(req
, m
);
3782 session
->con
->send_message(m
);
3785 session
->osd_epoch
= last
;
3791 void OSDMonitor::get_removed_snaps_range(
3792 epoch_t start
, epoch_t end
,
3793 mempool::osdmap::map
<int64_t,OSDMap::snap_interval_set_t
> *gap_removed_snaps
)
3795 // we only care about pools that exist now.
3796 for (auto& p
: osdmap
.get_pools()) {
3797 auto& t
= (*gap_removed_snaps
)[p
.first
];
3798 for (epoch_t epoch
= start
; epoch
< end
; ++epoch
) {
3799 string k
= make_snap_epoch_key(p
.first
, epoch
);
3801 mon
->store
->get(OSD_SNAP_PREFIX
, k
, v
);
3803 auto q
= v
.cbegin();
3804 OSDMap::snap_interval_set_t snaps
;
3809 dout(10) << __func__
<< " " << p
.first
<< " " << t
<< dendl
;
3813 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
3815 return get_version(ver
, mon
->get_quorum_con_features(), bl
);
3818 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
3820 OSDMap::Incremental inc
;
3821 auto q
= bl
.cbegin();
3823 // always encode with subset of osdmap's canonical features
3824 uint64_t f
= features
& inc
.encode_features
;
3825 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
3828 if (inc
.fullmap
.length()) {
3829 // embedded full map?
3831 m
.decode(inc
.fullmap
);
3832 inc
.fullmap
.clear();
3833 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
3835 if (inc
.crush
.length()) {
3836 // embedded crush map
3838 auto p
= inc
.crush
.cbegin();
3841 c
.encode(inc
.crush
, f
);
3843 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
3846 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
3849 auto q
= bl
.cbegin();
3851 // always encode with subset of osdmap's canonical features
3852 uint64_t f
= features
& m
.get_encoding_features();
3853 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
3856 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
3859 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
3861 uint64_t significant_features
= OSDMap::get_significant_features(features
);
3862 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
3865 int ret
= PaxosService::get_version(ver
, bl
);
3869 // NOTE: this check is imprecise; the OSDMap encoding features may
3870 // be a subset of the latest mon quorum features, but worst case we
3871 // reencode once and then cache the (identical) result under both
3873 if (significant_features
!=
3874 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
3875 reencode_incremental_map(bl
, features
);
3877 inc_osd_cache
.add({ver
, significant_features
}, bl
);
3881 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
3884 int err
= get_version(ver
, inc_bl
);
3885 ceph_assert(err
== 0);
3886 ceph_assert(inc_bl
.length());
3888 auto p
= inc_bl
.cbegin();
3890 dout(10) << __func__
<< " "
3891 << " epoch " << inc
.epoch
3892 << " inc_crc " << inc
.inc_crc
3893 << " full_crc " << inc
.full_crc
3894 << " encode_features " << inc
.encode_features
<< dendl
;
3898 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
3900 dout(10) << __func__
<< " ver " << ver
<< dendl
;
3902 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
3903 if (closest_pinned
== 0) {
3906 if (closest_pinned
> ver
) {
3907 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
3909 ceph_assert(closest_pinned
<= ver
);
3911 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
3913 // get osdmap incremental maps and apply on top of this one.
3915 bool has_cached_osdmap
= false;
3916 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
3917 if (full_osd_cache
.lookup({v
, mon
->get_quorum_con_features()},
3919 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
3921 has_cached_osdmap
= true;
3926 if (!has_cached_osdmap
) {
3927 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
3929 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
3930 << " not available! error: " << cpp_strerror(err
) << dendl
;
3932 ceph_assert(err
== 0);
3935 ceph_assert(osdm_bl
.length());
3938 osdm
.decode(osdm_bl
);
3940 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
3941 << " e" << osdm
.epoch
3942 << " crc " << osdm
.get_crc()
3943 << " -- applying incremental maps." << dendl
;
3945 uint64_t encode_features
= 0;
3946 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
3947 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
3949 OSDMap::Incremental inc
;
3950 int err
= get_inc(v
, inc
);
3951 ceph_assert(err
== 0);
3953 encode_features
= inc
.encode_features
;
3955 err
= osdm
.apply_incremental(inc
);
3956 ceph_assert(err
== 0);
3958 // this block performs paranoid checks on map retrieval
3959 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
3960 inc
.full_crc
!= 0) {
3962 uint64_t f
= encode_features
;
3964 f
= (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
3967 // encode osdmap to force calculating crcs
3969 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
3970 // decode osdmap to compare crcs with what's expected by incremental
3974 if (tosdm
.get_crc() != inc
.full_crc
) {
3976 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
3977 << ", expected " << inc
.full_crc
<< ")" << dendl
;
3978 ceph_abort_msg("osdmap crc mismatch");
3982 // note: we cannot add the recently computed map to the cache, as is,
3983 // because we have not encoded the map into a bl.
3986 if (!encode_features
) {
3987 dout(10) << __func__
3988 << " last incremental map didn't have features;"
3989 << " defaulting to quorum's or all" << dendl
;
3991 (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
3993 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
3998 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4000 return get_version_full(ver
, mon
->get_quorum_con_features(), bl
);
4003 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4006 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4007 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4010 int ret
= PaxosService::get_version_full(ver
, bl
);
4011 if (ret
== -ENOENT
) {
4013 ret
= get_full_from_pinned_map(ver
, bl
);
4018 // NOTE: this check is imprecise; the OSDMap encoding features may
4019 // be a subset of the latest mon quorum features, but worst case we
4020 // reencode once and then cache the (identical) result under both
4022 if (significant_features
!=
4023 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4024 reencode_full_map(bl
, features
);
4026 full_osd_cache
.add({ver
, significant_features
}, bl
);
4030 epoch_t
OSDMonitor::blacklist(const entity_addrvec_t
& av
, utime_t until
)
4032 dout(10) << "blacklist " << av
<< " until " << until
<< dendl
;
4033 for (auto a
: av
.v
) {
4034 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
4035 a
.set_type(entity_addr_t::TYPE_ANY
);
4037 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4039 pending_inc
.new_blacklist
[a
] = until
;
4041 return pending_inc
.epoch
;
4044 epoch_t
OSDMonitor::blacklist(entity_addr_t a
, utime_t until
)
4046 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
4047 a
.set_type(entity_addr_t::TYPE_ANY
);
4049 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4051 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
4052 pending_inc
.new_blacklist
[a
] = until
;
4053 return pending_inc
.epoch
;
4057 void OSDMonitor::check_osdmap_subs()
4059 dout(10) << __func__
<< dendl
;
4060 if (!osdmap
.get_epoch()) {
4063 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
4064 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
4067 auto p
= osdmap_subs
->second
->begin();
4071 check_osdmap_sub(sub
);
4075 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4077 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4078 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4079 if (sub
->next
<= osdmap
.get_epoch()) {
4081 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4083 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4085 mon
->session_map
.remove_sub(sub
);
4087 sub
->next
= osdmap
.get_epoch() + 1;
4091 void OSDMonitor::check_pg_creates_subs()
4093 if (!osdmap
.get_num_up_osds()) {
4096 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4097 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
4098 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4099 if (pg_creates_subs
== session_map
.subs
.end()) {
4102 for (auto sub
: *pg_creates_subs
->second
) {
4103 check_pg_creates_sub(sub
);
4108 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4110 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4111 ceph_assert(sub
->type
== "osd_pg_creates");
4112 // only send these if the OSD is up. we will check_subs() when they do
4113 // come up so they will get the creates then.
4114 if (sub
->session
->name
.is_osd() &&
4115 mon
->osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4116 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4117 sub
->session
->con
.get(),
4122 void OSDMonitor::do_application_enable(int64_t pool_id
,
4123 const std::string
&app_name
,
4124 const std::string
&app_key
,
4125 const std::string
&app_value
)
4127 ceph_assert(paxos
->is_plugged() && is_writeable());
4129 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4132 ceph_assert(osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
4134 auto pp
= osdmap
.get_pg_pool(pool_id
);
4135 ceph_assert(pp
!= nullptr);
4138 if (pending_inc
.new_pools
.count(pool_id
)) {
4139 p
= pending_inc
.new_pools
[pool_id
];
4142 if (app_key
.empty()) {
4143 p
.application_metadata
.insert({app_name
, {}});
4145 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4147 p
.last_change
= pending_inc
.epoch
;
4148 pending_inc
.new_pools
[pool_id
] = p
;
4151 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4152 pool_opts_t::key_t opt
,
4153 pool_opts_t::value_t val
)
4155 auto p
= pending_inc
.new_pools
.try_emplace(
4156 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4157 p
.first
->second
.opts
.set(opt
, val
);
4160 unsigned OSDMonitor::scan_for_creating_pgs(
4161 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4162 const mempool::osdmap::set
<int64_t>& removed_pools
,
4164 creating_pgs_t
* creating_pgs
) const
4166 unsigned queued
= 0;
4167 for (auto& p
: pools
) {
4168 int64_t poolid
= p
.first
;
4169 if (creating_pgs
->created_pools
.count(poolid
)) {
4170 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4173 const pg_pool_t
& pool
= p
.second
;
4174 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4175 pool
.get_type(), pool
.get_size());
4176 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4179 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4180 const auto created
= pool
.get_last_change();
4181 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4182 dout(10) << __func__
<< " no change in pool " << poolid
4183 << " " << pool
<< dendl
;
4186 if (removed_pools
.count(poolid
)) {
4187 dout(10) << __func__
<< " pool is being removed: " << poolid
4188 << " " << pool
<< dendl
;
4191 dout(10) << __func__
<< " queueing pool create for " << poolid
4192 << " " << pool
<< dendl
;
4193 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4200 void OSDMonitor::update_creating_pgs()
4202 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4203 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4204 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4205 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4206 for (const auto& pg
: creating_pgs
.pgs
) {
4207 int acting_primary
= -1;
4208 auto pgid
= pg
.first
;
4209 if (!osdmap
.pg_exists(pgid
)) {
4210 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4214 auto mapped
= pg
.second
.first
;
4215 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4217 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4218 // check the previous creating_pgs, look for the target to whom the pg was
4219 // previously mapped
4220 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4221 const auto last_acting_primary
= pgs_by_epoch
.first
;
4222 for (auto& pgs
: pgs_by_epoch
.second
) {
4223 if (pgs
.second
.count(spgid
)) {
4224 if (last_acting_primary
== acting_primary
) {
4227 dout(20) << __func__
<< " " << pgid
<< " "
4228 << " acting_primary:" << last_acting_primary
4229 << " -> " << acting_primary
<< dendl
;
4230 // note epoch if the target of the create message changed.
4231 mapped
= mapping
.get_epoch();
4236 mapped
= mapping
.get_epoch();
4240 dout(10) << __func__
<< " will instruct osd." << acting_primary
4241 << " to create " << pgid
<< "@" << mapped
<< dendl
;
4242 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
4244 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
4245 creating_pgs_epoch
= mapping
.get_epoch();
4248 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
4250 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
4251 << " " << creating_pgs_by_osd_epoch
<< dendl
;
4252 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4253 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
4254 dout(20) << __func__
4255 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
4256 // the subscribers will be updated when the mapping is completed anyway
4259 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
4260 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
4262 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
4264 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
4265 MOSDPGCreate2
*m
= nullptr;
4267 bool old
= osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
;
4270 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
4271 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
4272 auto epoch
= epoch_pgs
->first
;
4273 auto& pgs
= epoch_pgs
->second
;
4274 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4275 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
4277 for (auto& pg
: pgs
) {
4278 // Need the create time from the monitor using its clock to set
4279 // last_scrub_stamp upon pg creation.
4280 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
4281 ceph_assert(create
!= creating_pgs
.pgs
.end());
4284 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
4286 oldm
->mkpg
.emplace(pg
.pgid
,
4287 pg_create_t
{create
->second
.first
, pg
.pgid
, 0});
4288 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.second
);
4291 m
= new MOSDPGCreate2(creating_pgs_epoch
);
4293 m
->pgs
.emplace(pg
, create
->second
);
4295 dout(20) << __func__
<< " will create " << pg
4296 << " at " << create
->second
.first
<< dendl
;
4300 con
->send_message(m
);
4302 con
->send_message(oldm
);
4304 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4305 << " has nothing to send" << dendl
;
4309 // sub is current through last + 1
// OSDMonitor::tick(): periodic monitor housekeeping.  After the early
// returns (inactive monitor, non-leader) it: refreshes the osdmap manifest,
// times out silent OSDs via handle_osd_timeouts(), checks failure reports,
// triggers osdmap pruning, auto-marks long-down OSDs "out" (grace period
// scaled by per-OSD laggy statistics and gated by the configured down-out
// subtree limit), and expires blacklist entries whose deadline has passed.
// NOTE(review): this extraction is damaged -- original source lines are
// missing between the numbered fragments below (conditional bodies, braces,
// loop increments), so the text is preserved byte-for-byte rather than
// reflowed; recover the full body from the upstream file before editing.
4316 void OSDMonitor::tick()
4318 if (!is_active()) return;
4320 dout(10) << osdmap
<< dendl
;
4322 // always update osdmap manifest, regardless of being the leader.
4323 load_osdmap_manifest();
4325 if (!mon
->is_leader()) return;
4327 bool do_propose
= false;
4328 utime_t now
= ceph_clock_now();
4330 if (handle_osd_timeouts(now
, last_osd_report
)) {
4335 if (check_failures(now
)) {
4339 // Force a proposal if we need to prune; pruning is performed on
4340 // ``encode_pending()``, hence why we need to regularly trigger a proposal
4341 // even if there's nothing going on.
4342 if (is_prune_enabled() && should_prune()) {
4346 // mark down osds out?
4348 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
4349 * influence at all. The decision is made based on the ratio of "in" osds,
4350 * and the function returns false if this ratio is lower that the minimum
4351 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
4353 if (can_mark_out(-1)) {
4354 string down_out_subtree_limit
= g_conf().get_val
<string
>(
4355 "mon_osd_down_out_subtree_limit");
4356 set
<int> down_cache
; // quick cache of down subtrees
// walk the OSDs currently pending an automatic "out" (down_pending_out)
4358 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
4359 while (i
!= down_pending_out
.end()) {
4365 if (osdmap
.is_down(o
) &&
4368 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
4369 utime_t grace
= orig_grace
;
4370 double my_grace
= 0.0;
4372 if (g_conf()->mon_osd_adjust_down_out_interval
) {
4373 // scale grace period the same way we do the heartbeat grace.
4374 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
4375 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
4376 double decay_k
= ::log(.5) / halflife
;
4377 double decay
= exp((double)down
* decay_k
);
4378 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
4379 << " down for " << down
<< " decay " << decay
<< dendl
;
4380 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
4384 // is this an entire large subtree down?
4385 if (down_out_subtree_limit
.length()) {
4386 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
4388 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
4389 dout(10) << "tick entire containing " << down_out_subtree_limit
4390 << " subtree for osd." << o
4391 << " is down; resetting timer" << dendl
;
4392 // reset timer, too.
4393 down_pending_out
[o
] = now
;
// decide whether this osd has been down (or destroyed) long enough to
// be auto-marked out; destroyed OSDs use a separate interval.
4399 bool down_out
= !osdmap
.is_destroyed(o
) &&
4400 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
4401 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
4402 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
4403 // this is not precise enough as we did not make a note when this osd
4404 // was marked as destroyed, but let's not bother with that
4405 // complexity for now.
4406 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
4407 if (down_out
|| destroyed_out
) {
4408 dout(10) << "tick marking osd." << o
<< " OUT after " << down
4409 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
4410 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
4412 // set the AUTOOUT bit.
4413 if (pending_inc
.new_state
.count(o
) == 0)
4414 pending_inc
.new_state
[o
] = 0;
4415 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
4417 // remember previous weight
4418 if (pending_inc
.new_xinfo
.count(o
) == 0)
4419 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
4420 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
4424 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
4425 << int(down
.sec()) << " seconds)";
4430 down_pending_out
.erase(o
);
4433 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
4436 // expire blacklisted items?
4437 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
4438 p
!= osdmap
.blacklist
.end();
4440 if (p
->second
< now
) {
4441 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
4442 pending_inc
.old_blacklist
.push_back(p
->first
);
// remaining proposal triggers: purged-snap pruning, pool status updates,
// and pending pg_temp adjustments.
4447 if (try_prune_purged_snaps()) {
4451 if (update_pools_status())
4455 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
// OSDMonitor::handle_osd_timeouts(): mark OSDs down when no beacon has been
// received within g_conf()->mon_osd_report_timeout, provided this monitor
// has been leader at least that long.  `last_osd_report` maps osd id -> last
// beacon time; entries for nonexistent OSDs are pruned, and OSDs with no
// entry yet simply start their timer at `now`.
// Presumably returns the local `new_down` flag (whether any OSD was newly
// marked down) -- the return statement lies outside this residue; confirm
// against the upstream file.
// NOTE(review): this extraction is damaged -- original lines are missing
// between the numbered fragments (e.g. `continue`s, closing braces); text
// preserved byte-for-byte.
4459 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
4460 std::map
<int,utime_t
> &last_osd_report
)
4462 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
4463 if (now
- mon
->get_leader_since() < timeo
) {
4464 // We haven't been the leader for long enough to consider OSD timeouts
4468 int max_osd
= osdmap
.get_max_osd();
4469 bool new_down
= false;
4471 for (int i
=0; i
< max_osd
; ++i
) {
4472 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
4473 if (!osdmap
.exists(i
)) {
4474 last_osd_report
.erase(i
); // if any
4477 if (!osdmap
.is_up(i
))
4479 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
4480 if (t
== last_osd_report
.end()) {
4481 // it wasn't in the map; start the timer.
4482 last_osd_report
[i
] = now
;
4483 } else if (can_mark_down(i
)) {
4484 utime_t diff
= now
- t
->second
;
4486 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
4487 << diff
<< " seconds";
4488 derr
<< "no beacon from osd." << i
<< " since " << t
->second
4489 << ", " << diff
<< " seconds ago. marking down" << dendl
;
// XOR-ing CEPH_OSD_UP into new_state flips the up bit, marking it down.
4490 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
4498 static void dump_cpu_list(Formatter
*f
, const char *name
,
4499 const string
& strlist
)
4502 size_t cpu_set_size
;
4503 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
4506 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
4507 f
->open_array_section(name
);
4508 for (auto cpu
: cpus
) {
4509 f
->dump_int("cpu", cpu
);
// OSDMonitor::dump_info(): dump monitor-side OSD state into Formatter *f:
// the current osdmap, per-OSD metadata (ids 0..max_osd-1 that exist),
// first/last committed osdmap versions, the crush map, and -- when
// has_osdmap_manifest is set -- the osdmap manifest.
// NOTE(review): this extraction is damaged -- the matching close_section()
// calls and other lines are missing between the numbered fragments; text
// preserved byte-for-byte.
4514 void OSDMonitor::dump_info(Formatter
*f
)
4516 f
->open_object_section("osdmap");
4520 f
->open_array_section("osd_metadata");
4521 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4522 if (osdmap
.exists(i
)) {
4523 f
->open_object_section("osd");
4524 f
->dump_unsigned("id", i
);
4525 dump_osd_metadata(i
, f
, NULL
);
4531 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
4532 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
4534 f
->open_object_section("crushmap");
4535 osdmap
.crush
->dump(f
);
4538 if (has_osdmap_manifest
) {
4539 f
->open_object_section("osdmap_manifest");
4540 osdmap_manifest
.dump(f
);
// Identifier for every per-pool property that "osd pool get" can report.
// These are used purely as in-process map/set keys (see ALL_CHOICES and
// subtract_second_from_first() in preprocess_command()); the numeric values
// are never encoded on the wire.
//
// NOTE(review): the extracted residue of this enum had dropped the leading
// "SIZE, MIN_SIZE," enumerators even though the later ALL_CHOICES table
// still maps "min_size" -> MIN_SIZE (and "size" -> SIZE); they are restored
// here in their original leading position.
enum osd_pool_get_choices {
  SIZE, MIN_SIZE,
  PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
  NODELETE, NOPGCHANGE, NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
  HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
  USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
  ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
  HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
  SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
  COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
  PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
  PG_AUTOSCALE_BIAS };
// Return the elements of `first` that are not present in `second`
// (std::set_difference over two ordered sets).  Used by "osd pool get" to
// drop the tier-only / erasure-only choices from the full choice set when
// the pool is not a tier / erasure pool.
//
// Generalized to a template over the element type: existing callers pass
// std::set<osd_pool_get_choices> and deduce T, so call sites are unchanged,
// while the helper no longer depends on that enum.
//
// NOTE(review): the extracted residue of this helper had lost its opening
// brace and the `return result;` statement; the complete definition is
// restored here.
template<typename T>
std::set<T>
subtract_second_from_first(const std::set<T>& first,
			   const std::set<T>& second)
{
  std::set<T> result;
  // std::set iterates in sorted order, satisfying set_difference's
  // sorted-range precondition.
  std::set_difference(first.begin(), first.end(),
		      second.begin(), second.end(),
		      std::inserter(result, result.end()));
  return result;
}
4580 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
4582 op
->mark_osdmon_event(__func__
);
4583 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
4586 stringstream ss
, ds
;
4589 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
4590 string rs
= ss
.str();
4591 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
4595 MonSession
*session
= op
->get_session();
4597 derr
<< __func__
<< " no session" << dendl
;
4598 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
4603 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
4606 cmd_getval(cct
, cmdmap
, "format", format
, string("plain"));
4607 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
4609 if (prefix
== "osd stat") {
4610 osdmap
.print_summary(f
.get(), ds
, "", true);
4616 else if (prefix
== "osd dump" ||
4617 prefix
== "osd tree" ||
4618 prefix
== "osd tree-from" ||
4619 prefix
== "osd ls" ||
4620 prefix
== "osd getmap" ||
4621 prefix
== "osd getcrushmap" ||
4622 prefix
== "osd ls-tree") {
4627 cmd_getval(cct
, cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
4630 bufferlist osdmap_bl
;
4631 int err
= get_version_full(epoch
, osdmap_bl
);
4632 if (err
== -ENOENT
) {
4634 ss
<< "there is no map for epoch " << epoch
;
4637 ceph_assert(err
== 0);
4638 ceph_assert(osdmap_bl
.length());
4641 if (epoch
== osdmap
.get_epoch()) {
4645 p
->decode(osdmap_bl
);
4648 auto sg
= make_scope_guard([&] {
4654 if (prefix
== "osd dump") {
4657 f
->open_object_section("osdmap");
4667 } else if (prefix
== "osd ls") {
4669 f
->open_array_section("osds");
4670 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4671 if (osdmap
.exists(i
)) {
4672 f
->dump_int("osd", i
);
4679 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4680 if (osdmap
.exists(i
)) {
4689 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
4691 if (prefix
== "osd tree-from") {
4692 cmd_getval(cct
, cmdmap
, "bucket", bucket
);
4693 if (!osdmap
.crush
->name_exists(bucket
)) {
4694 ss
<< "bucket '" << bucket
<< "' does not exist";
4698 int id
= osdmap
.crush
->get_item_id(bucket
);
4700 ss
<< "\"" << bucket
<< "\" is not a bucket";
4706 vector
<string
> states
;
4707 cmd_getval(cct
, cmdmap
, "states", states
);
4708 unsigned filter
= 0;
4709 for (auto& s
: states
) {
4711 filter
|= OSDMap::DUMP_UP
;
4712 } else if (s
== "down") {
4713 filter
|= OSDMap::DUMP_DOWN
;
4714 } else if (s
== "in") {
4715 filter
|= OSDMap::DUMP_IN
;
4716 } else if (s
== "out") {
4717 filter
|= OSDMap::DUMP_OUT
;
4718 } else if (s
== "destroyed") {
4719 filter
|= OSDMap::DUMP_DESTROYED
;
4721 ss
<< "unrecognized state '" << s
<< "'";
4726 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
4727 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
4728 ss
<< "cannot specify both 'in' and 'out'";
4732 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
4733 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
4734 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
4735 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
4736 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
4737 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
4738 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
4743 f
->open_object_section("tree");
4744 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
4748 p
->print_tree(NULL
, &ds
, filter
, bucket
);
4751 } else if (prefix
== "osd getmap") {
4752 rdata
.append(osdmap_bl
);
4753 ss
<< "got osdmap epoch " << p
->get_epoch();
4754 } else if (prefix
== "osd getcrushmap") {
4755 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
4756 ss
<< p
->get_crush_version();
4757 } else if (prefix
== "osd ls-tree") {
4759 cmd_getval(cct
, cmdmap
, "name", bucket_name
);
4761 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
4763 ss
<< "\"" << bucket_name
<< "\" does not exist";
4766 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
4771 f
->open_array_section("osds");
4772 for (auto &i
: osds
) {
4773 if (osdmap
.exists(i
)) {
4774 f
->dump_int("osd", i
);
4781 for (auto &i
: osds
) {
4782 if (osdmap
.exists(i
)) {
4793 } else if (prefix
== "osd getmaxosd") {
4795 f
->open_object_section("getmaxosd");
4796 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4797 f
->dump_int("max_osd", osdmap
.get_max_osd());
4801 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
4804 } else if (prefix
== "osd utilization") {
4806 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
4813 } else if (prefix
== "osd find") {
4815 if (!cmd_getval(cct
, cmdmap
, "id", osd
)) {
4816 ss
<< "unable to parse osd id value '"
4817 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4821 if (!osdmap
.exists(osd
)) {
4822 ss
<< "osd." << osd
<< " does not exist";
4827 cmd_getval(cct
, cmdmap
, "format", format
);
4828 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4829 f
->open_object_section("osd_location");
4830 f
->dump_int("osd", osd
);
4831 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
4832 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
4834 // try to identify host, pod/container name, etc.
4835 map
<string
,string
> m
;
4836 load_metadata(osd
, m
, nullptr);
4837 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
4838 f
->dump_string("host", p
->second
);
4841 "pod_name", "pod_namespace", // set by rook
4842 "container_name" // set by ceph-ansible
4844 if (auto p
= m
.find(k
); p
!= m
.end()) {
4845 f
->dump_string(k
, p
->second
);
4849 // crush is helpful too
4850 f
->open_object_section("crush_location");
4851 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
4852 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
4853 f
->dump_string(p
->first
.c_str(), p
->second
);
4857 } else if (prefix
== "osd metadata") {
4859 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
4860 !cmd_getval(cct
, cmdmap
, "id", osd
)) {
4861 ss
<< "unable to parse osd id value '"
4862 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4866 if (osd
>= 0 && !osdmap
.exists(osd
)) {
4867 ss
<< "osd." << osd
<< " does not exist";
4872 cmd_getval(cct
, cmdmap
, "format", format
);
4873 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4875 f
->open_object_section("osd_metadata");
4876 f
->dump_unsigned("id", osd
);
4877 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
4883 f
->open_array_section("osd_metadata");
4884 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4885 if (osdmap
.exists(i
)) {
4886 f
->open_object_section("osd");
4887 f
->dump_unsigned("id", i
);
4888 r
= dump_osd_metadata(i
, f
.get(), NULL
);
4889 if (r
== -EINVAL
|| r
== -ENOENT
) {
4890 // Drop error, continue to get other daemons' metadata
4891 dout(4) << "No metadata for osd." << i
<< dendl
;
4903 } else if (prefix
== "osd versions") {
4905 f
.reset(Formatter::create("json-pretty"));
4906 count_metadata("ceph_version", f
.get());
4909 } else if (prefix
== "osd count-metadata") {
4911 f
.reset(Formatter::create("json-pretty"));
4913 cmd_getval(cct
, cmdmap
, "property", field
);
4914 count_metadata(field
, f
.get());
4917 } else if (prefix
== "osd numa-status") {
4920 f
->open_array_section("osds");
4922 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
4923 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
4924 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
4925 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
4926 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
4927 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
4929 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4930 if (osdmap
.exists(i
)) {
4931 map
<string
,string
> m
;
4933 if (load_metadata(i
, m
, &err
) < 0) {
4937 auto p
= m
.find("hostname");
4942 f
->open_object_section("osd");
4943 f
->dump_int("osd", i
);
4944 f
->dump_string("host", host
);
4945 for (auto n
: { "network_numa_node", "objectstore_numa_node",
4949 f
->dump_int(n
, atoi(p
->second
.c_str()));
4952 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
4955 list
<string
> ls
= get_str_list(p
->second
, ",");
4956 f
->open_array_section(n
);
4957 for (auto node
: ls
) {
4958 f
->dump_int("node", atoi(node
.c_str()));
4963 for (auto n
: { "numa_node_cpus" }) {
4966 dump_cpu_list(f
.get(), n
, p
->second
);
4973 p
= m
.find("network_numa_nodes");
4979 p
= m
.find("objectstore_numa_nodes");
4985 p
= m
.find("numa_node");
4986 auto q
= m
.find("numa_node_cpus");
4987 if (p
!= m
.end() && q
!= m
.end()) {
4994 tbl
<< TextTable::endrow
;
5002 rdata
.append(stringify(tbl
));
5004 } else if (prefix
== "osd map") {
5005 string poolstr
, objstr
, namespacestr
;
5006 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
5007 cmd_getval(cct
, cmdmap
, "object", objstr
);
5008 cmd_getval(cct
, cmdmap
, "nspace", namespacestr
);
5010 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5012 ss
<< "pool " << poolstr
<< " does not exist";
5016 object_locator_t
oloc(pool
, namespacestr
);
5017 object_t
oid(objstr
);
5018 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5019 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5020 vector
<int> up
, acting
;
5022 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5025 if (!namespacestr
.empty())
5026 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5028 fullobjname
= oid
.name
;
5030 f
->open_object_section("osd_map");
5031 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5032 f
->dump_string("pool", poolstr
);
5033 f
->dump_int("pool_id", pool
);
5034 f
->dump_stream("objname") << fullobjname
;
5035 f
->dump_stream("raw_pgid") << pgid
;
5036 f
->dump_stream("pgid") << mpgid
;
5037 f
->open_array_section("up");
5038 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5039 f
->dump_int("osd", *p
);
5041 f
->dump_int("up_primary", up_p
);
5042 f
->open_array_section("acting");
5043 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5044 f
->dump_int("osd", *p
);
5046 f
->dump_int("acting_primary", acting_p
);
5047 f
->close_section(); // osd_map
5050 ds
<< "osdmap e" << osdmap
.get_epoch()
5051 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5052 << " object '" << fullobjname
<< "' ->"
5053 << " pg " << pgid
<< " (" << mpgid
<< ")"
5054 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5055 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5059 } else if (prefix
== "pg map") {
5062 cmd_getval(cct
, cmdmap
, "pgid", pgidstr
);
5063 if (!pgid
.parse(pgidstr
.c_str())) {
5064 ss
<< "invalid pgid '" << pgidstr
<< "'";
5068 vector
<int> up
, acting
;
5069 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5070 ss
<< "pg '" << pgidstr
<< "' does not exist";
5074 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5075 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5077 f
->open_object_section("pg_map");
5078 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5079 f
->dump_stream("raw_pgid") << pgid
;
5080 f
->dump_stream("pgid") << mpgid
;
5081 f
->open_array_section("up");
5082 for (auto osd
: up
) {
5083 f
->dump_int("up_osd", osd
);
5086 f
->open_array_section("acting");
5087 for (auto osd
: acting
) {
5088 f
->dump_int("acting_osd", osd
);
5094 ds
<< "osdmap e" << osdmap
.get_epoch()
5095 << " pg " << pgid
<< " (" << mpgid
<< ")"
5096 << " -> up " << up
<< " acting " << acting
;
5101 } else if (prefix
== "osd lspools") {
5103 f
->open_array_section("pools");
5104 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5105 p
!= osdmap
.pools
.end();
5108 f
->open_object_section("pool");
5109 f
->dump_int("poolnum", p
->first
);
5110 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5113 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5114 if (next(p
) != osdmap
.pools
.end()) {
5124 } else if (prefix
== "osd blacklist ls") {
5126 f
->open_array_section("blacklist");
5128 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5129 p
!= osdmap
.blacklist
.end();
5132 f
->open_object_section("entry");
5133 f
->dump_string("addr", p
->first
.get_legacy_str());
5134 f
->dump_stream("until") << p
->second
;
5139 ss
<< p
->first
<< " " << p
->second
;
5149 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
5151 } else if (prefix
== "osd pool ls") {
5153 cmd_getval(cct
, cmdmap
, "detail", detail
);
5154 if (!f
&& detail
== "detail") {
5156 osdmap
.print_pools(ss
);
5157 rdata
.append(ss
.str());
5160 f
->open_array_section("pools");
5161 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
5162 it
!= osdmap
.get_pools().end();
5165 if (detail
== "detail") {
5166 f
->open_object_section("pool");
5167 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5168 it
->second
.dump(f
.get());
5171 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5174 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
5183 } else if (prefix
== "osd crush get-tunable") {
5185 cmd_getval(cct
, cmdmap
, "tunable", tunable
);
5188 f
->open_object_section("tunable");
5189 if (tunable
== "straw_calc_version") {
5191 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
5193 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
5202 rdata
.append(rss
.str());
5206 } else if (prefix
== "osd pool get") {
5208 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
5209 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5211 ss
<< "unrecognized pool '" << poolstr
<< "'";
5216 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
5218 cmd_getval(cct
, cmdmap
, "var", var
);
5220 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
5221 const choices_map_t ALL_CHOICES
= {
5223 {"min_size", MIN_SIZE
},
5224 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
5225 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
5226 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
5227 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
5228 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
5229 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
5230 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
5231 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
5232 {"use_gmt_hitset", USE_GMT_HITSET
},
5233 {"target_max_objects", TARGET_MAX_OBJECTS
},
5234 {"target_max_bytes", TARGET_MAX_BYTES
},
5235 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
5236 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
5237 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
5238 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
5239 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
5240 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
5241 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
5242 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
5243 {"fast_read", FAST_READ
},
5244 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
5245 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
5246 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
5247 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
5248 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
5249 {"recovery_priority", RECOVERY_PRIORITY
},
5250 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
5251 {"scrub_priority", SCRUB_PRIORITY
},
5252 {"compression_mode", COMPRESSION_MODE
},
5253 {"compression_algorithm", COMPRESSION_ALGORITHM
},
5254 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
5255 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
5256 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
5257 {"csum_type", CSUM_TYPE
},
5258 {"csum_max_block", CSUM_MAX_BLOCK
},
5259 {"csum_min_block", CSUM_MIN_BLOCK
},
5260 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
5261 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
5262 {"pg_num_min", PG_NUM_MIN
},
5263 {"target_size_bytes", TARGET_SIZE_BYTES
},
5264 {"target_size_ratio", TARGET_SIZE_RATIO
},
5265 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
5268 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
5270 const choices_set_t ONLY_TIER_CHOICES
= {
5271 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5272 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
5273 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5274 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5275 MIN_READ_RECENCY_FOR_PROMOTE
,
5276 MIN_WRITE_RECENCY_FOR_PROMOTE
,
5277 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
5279 const choices_set_t ONLY_ERASURE_CHOICES
= {
5280 EC_OVERWRITES
, ERASURE_CODE_PROFILE
5283 choices_set_t selected_choices
;
5285 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
5286 it
!= ALL_CHOICES
.end(); ++it
) {
5287 selected_choices
.insert(it
->second
);
5291 selected_choices
= subtract_second_from_first(selected_choices
,
5295 if(!p
->is_erasure()) {
5296 selected_choices
= subtract_second_from_first(selected_choices
,
5297 ONLY_ERASURE_CHOICES
);
5299 } else /* var != "all" */ {
5300 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
5301 osd_pool_get_choices selected
= found
->second
;
5303 if (!p
->is_tier() &&
5304 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
5305 ss
<< "pool '" << poolstr
5306 << "' is not a tier pool: variable not applicable";
5311 if (!p
->is_erasure() &&
5312 ONLY_ERASURE_CHOICES
.find(selected
)
5313 != ONLY_ERASURE_CHOICES
.end()) {
5314 ss
<< "pool '" << poolstr
5315 << "' is not a erasure pool: variable not applicable";
5320 if (pool_opts_t::is_opt_name(var
) &&
5321 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
5322 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
5327 selected_choices
.insert(selected
);
5331 f
->open_object_section("pool");
5332 f
->dump_string("pool", poolstr
);
5333 f
->dump_int("pool_id", pool
);
5334 for(choices_set_t::const_iterator it
= selected_choices
.begin();
5335 it
!= selected_choices
.end(); ++it
) {
5336 choices_map_t::const_iterator i
;
5337 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5338 if (i
->second
== *it
) {
5342 ceph_assert(i
!= ALL_CHOICES
.end());
5345 f
->dump_int("pg_num", p
->get_pg_num());
5348 f
->dump_int("pgp_num", p
->get_pgp_num());
5351 f
->dump_int("size", p
->get_size());
5354 f
->dump_int("min_size", p
->get_min_size());
5357 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
5358 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
5359 p
->get_crush_rule()));
5361 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
5365 f
->dump_bool("allow_ec_overwrites",
5366 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
5368 case PG_AUTOSCALE_MODE
:
5369 f
->dump_string("pg_autoscale_mode",
5370 pg_pool_t::get_pg_autoscale_mode_name(
5371 p
->pg_autoscale_mode
));
5377 case WRITE_FADVISE_DONTNEED
:
5380 f
->dump_bool(i
->first
.c_str(),
5381 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
5383 case HIT_SET_PERIOD
:
5384 f
->dump_int("hit_set_period", p
->hit_set_period
);
5387 f
->dump_int("hit_set_count", p
->hit_set_count
);
5390 f
->dump_string("hit_set_type",
5391 HitSet::get_type_name(p
->hit_set_params
.get_type()));
5395 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
5396 BloomHitSet::Params
*bloomp
=
5397 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
5398 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
5399 } else if(var
!= "all") {
5401 ss
<< "hit set is not of type Bloom; " <<
5402 "invalid to get a false positive rate!";
5408 case USE_GMT_HITSET
:
5409 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
5411 case TARGET_MAX_OBJECTS
:
5412 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
5414 case TARGET_MAX_BYTES
:
5415 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
5417 case CACHE_TARGET_DIRTY_RATIO
:
5418 f
->dump_unsigned("cache_target_dirty_ratio_micro",
5419 p
->cache_target_dirty_ratio_micro
);
5420 f
->dump_float("cache_target_dirty_ratio",
5421 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
5423 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
5424 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
5425 p
->cache_target_dirty_high_ratio_micro
);
5426 f
->dump_float("cache_target_dirty_high_ratio",
5427 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
5429 case CACHE_TARGET_FULL_RATIO
:
5430 f
->dump_unsigned("cache_target_full_ratio_micro",
5431 p
->cache_target_full_ratio_micro
);
5432 f
->dump_float("cache_target_full_ratio",
5433 ((float)p
->cache_target_full_ratio_micro
/1000000));
5435 case CACHE_MIN_FLUSH_AGE
:
5436 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
5438 case CACHE_MIN_EVICT_AGE
:
5439 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
5441 case ERASURE_CODE_PROFILE
:
5442 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
5444 case MIN_READ_RECENCY_FOR_PROMOTE
:
5445 f
->dump_int("min_read_recency_for_promote",
5446 p
->min_read_recency_for_promote
);
5448 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
5449 f
->dump_int("min_write_recency_for_promote",
5450 p
->min_write_recency_for_promote
);
5453 f
->dump_int("fast_read", p
->fast_read
);
5455 case HIT_SET_GRADE_DECAY_RATE
:
5456 f
->dump_int("hit_set_grade_decay_rate",
5457 p
->hit_set_grade_decay_rate
);
5459 case HIT_SET_SEARCH_LAST_N
:
5460 f
->dump_int("hit_set_search_last_n",
5461 p
->hit_set_search_last_n
);
5463 case SCRUB_MIN_INTERVAL
:
5464 case SCRUB_MAX_INTERVAL
:
5465 case DEEP_SCRUB_INTERVAL
:
5466 case RECOVERY_PRIORITY
:
5467 case RECOVERY_OP_PRIORITY
:
5468 case SCRUB_PRIORITY
:
5469 case COMPRESSION_MODE
:
5470 case COMPRESSION_ALGORITHM
:
5471 case COMPRESSION_REQUIRED_RATIO
:
5472 case COMPRESSION_MAX_BLOB_SIZE
:
5473 case COMPRESSION_MIN_BLOB_SIZE
:
5475 case CSUM_MAX_BLOCK
:
5476 case CSUM_MIN_BLOCK
:
5477 case FINGERPRINT_ALGORITHM
:
5479 case TARGET_SIZE_BYTES
:
5480 case TARGET_SIZE_RATIO
:
5481 case PG_AUTOSCALE_BIAS
:
5482 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
5483 if (p
->opts
.is_set(key
)) {
5484 if(*it
== CSUM_TYPE
) {
5486 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
5487 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
5489 p
->opts
.dump(i
->first
, f
.get());
5498 for(choices_set_t::const_iterator it
= selected_choices
.begin();
5499 it
!= selected_choices
.end(); ++it
) {
5500 choices_map_t::const_iterator i
;
5503 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
5506 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
5509 ss
<< "size: " << p
->get_size() << "\n";
5512 ss
<< "min_size: " << p
->get_min_size() << "\n";
5515 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
5516 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
5517 p
->get_crush_rule()) << "\n";
5519 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
5522 case PG_AUTOSCALE_MODE
:
5523 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
5524 p
->pg_autoscale_mode
) <<"\n";
5526 case HIT_SET_PERIOD
:
5527 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
5530 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
5533 ss
<< "hit_set_type: " <<
5534 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
5538 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
5539 BloomHitSet::Params
*bloomp
=
5540 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
5541 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
5542 } else if(var
!= "all") {
5543 ss
<< "hit set is not of type Bloom; " <<
5544 "invalid to get a false positive rate!";
5550 case USE_GMT_HITSET
:
5551 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
5553 case TARGET_MAX_OBJECTS
:
5554 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
5556 case TARGET_MAX_BYTES
:
5557 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
5559 case CACHE_TARGET_DIRTY_RATIO
:
5560 ss
<< "cache_target_dirty_ratio: "
5561 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
5563 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
5564 ss
<< "cache_target_dirty_high_ratio: "
5565 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
5567 case CACHE_TARGET_FULL_RATIO
:
5568 ss
<< "cache_target_full_ratio: "
5569 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
5571 case CACHE_MIN_FLUSH_AGE
:
5572 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
5574 case CACHE_MIN_EVICT_AGE
:
5575 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
5577 case ERASURE_CODE_PROFILE
:
5578 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
5580 case MIN_READ_RECENCY_FOR_PROMOTE
:
5581 ss
<< "min_read_recency_for_promote: " <<
5582 p
->min_read_recency_for_promote
<< "\n";
5584 case HIT_SET_GRADE_DECAY_RATE
:
5585 ss
<< "hit_set_grade_decay_rate: " <<
5586 p
->hit_set_grade_decay_rate
<< "\n";
5588 case HIT_SET_SEARCH_LAST_N
:
5589 ss
<< "hit_set_search_last_n: " <<
5590 p
->hit_set_search_last_n
<< "\n";
5593 ss
<< "allow_ec_overwrites: " <<
5594 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
5601 case WRITE_FADVISE_DONTNEED
:
5604 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5605 if (i
->second
== *it
)
5608 ceph_assert(i
!= ALL_CHOICES
.end());
5609 ss
<< i
->first
<< ": " <<
5610 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
5611 "true" : "false") << "\n";
5613 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
5614 ss
<< "min_write_recency_for_promote: " <<
5615 p
->min_write_recency_for_promote
<< "\n";
5618 ss
<< "fast_read: " << p
->fast_read
<< "\n";
5620 case SCRUB_MIN_INTERVAL
:
5621 case SCRUB_MAX_INTERVAL
:
5622 case DEEP_SCRUB_INTERVAL
:
5623 case RECOVERY_PRIORITY
:
5624 case RECOVERY_OP_PRIORITY
:
5625 case SCRUB_PRIORITY
:
5626 case COMPRESSION_MODE
:
5627 case COMPRESSION_ALGORITHM
:
5628 case COMPRESSION_REQUIRED_RATIO
:
5629 case COMPRESSION_MAX_BLOB_SIZE
:
5630 case COMPRESSION_MIN_BLOB_SIZE
:
5632 case CSUM_MAX_BLOCK
:
5633 case CSUM_MIN_BLOCK
:
5634 case FINGERPRINT_ALGORITHM
:
5636 case TARGET_SIZE_BYTES
:
5637 case TARGET_SIZE_RATIO
:
5638 case PG_AUTOSCALE_BIAS
:
5639 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5640 if (i
->second
== *it
)
5643 ceph_assert(i
!= ALL_CHOICES
.end());
5645 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
5646 if (p
->opts
.is_set(key
)) {
5647 if(key
== pool_opts_t::CSUM_TYPE
) {
5649 p
->opts
.get(key
, &val
);
5650 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
5652 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
5658 rdata
.append(ss
.str());
5663 } else if (prefix
== "osd pool get-quota") {
5665 cmd_getval(cct
, cmdmap
, "pool", pool_name
);
5667 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
5669 ceph_assert(poolid
== -ENOENT
);
5670 ss
<< "unrecognized pool '" << pool_name
<< "'";
5674 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
5677 f
->open_object_section("pool_quotas");
5678 f
->dump_string("pool_name", pool_name
);
5679 f
->dump_unsigned("pool_id", poolid
);
5680 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
5681 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
5686 rs
<< "quotas for pool '" << pool_name
<< "':\n"
5687 << " max objects: ";
5688 if (p
->quota_max_objects
== 0)
5691 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
5694 if (p
->quota_max_bytes
== 0)
5697 rs
<< byte_u_t(p
->quota_max_bytes
);
5698 rdata
.append(rs
.str());
5702 } else if (prefix
== "osd crush rule list" ||
5703 prefix
== "osd crush rule ls") {
5705 f
->open_array_section("rules");
5706 osdmap
.crush
->list_rules(f
.get());
5711 osdmap
.crush
->list_rules(&ss
);
5712 rdata
.append(ss
.str());
5714 } else if (prefix
== "osd crush rule ls-by-class") {
5716 cmd_getval(cct
, cmdmap
, "class", class_name
);
5717 if (class_name
.empty()) {
5718 ss
<< "no class specified";
5723 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
5725 ss
<< "failed to get rules by class '" << class_name
<< "'";
5729 f
->open_array_section("rules");
5730 for (auto &rule
: rules
) {
5731 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
5737 for (auto &rule
: rules
) {
5738 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
5740 rdata
.append(rs
.str());
5742 } else if (prefix
== "osd crush rule dump") {
5744 cmd_getval(cct
, cmdmap
, "name", name
);
5746 cmd_getval(cct
, cmdmap
, "format", format
);
5747 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5749 f
->open_array_section("rules");
5750 osdmap
.crush
->dump_rules(f
.get());
5753 int ruleno
= osdmap
.crush
->get_rule_id(name
);
5755 ss
<< "unknown crush rule '" << name
<< "'";
5759 osdmap
.crush
->dump_rule(ruleno
, f
.get());
5764 rdata
.append(rs
.str());
5765 } else if (prefix
== "osd crush dump") {
5767 cmd_getval(cct
, cmdmap
, "format", format
);
5768 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5769 f
->open_object_section("crush_map");
5770 osdmap
.crush
->dump(f
.get());
5775 rdata
.append(rs
.str());
5776 } else if (prefix
== "osd crush show-tunables") {
5778 cmd_getval(cct
, cmdmap
, "format", format
);
5779 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5780 f
->open_object_section("crush_map_tunables");
5781 osdmap
.crush
->dump_tunables(f
.get());
5786 rdata
.append(rs
.str());
5787 } else if (prefix
== "osd crush tree") {
5789 cmd_getval(cct
, cmdmap
, "shadow", shadow
);
5790 bool show_shadow
= shadow
== "--show-shadow";
5791 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5793 f
->open_object_section("crush_tree");
5794 osdmap
.crush
->dump_tree(nullptr,
5796 osdmap
.get_pool_names(),
5802 osdmap
.crush
->dump_tree(&ss
,
5804 osdmap
.get_pool_names(),
5806 rdata
.append(ss
.str());
5808 } else if (prefix
== "osd crush ls") {
5810 if (!cmd_getval(cct
, cmdmap
, "node", name
)) {
5811 ss
<< "no node specified";
5815 if (!osdmap
.crush
->name_exists(name
)) {
5816 ss
<< "node '" << name
<< "' does not exist";
5820 int id
= osdmap
.crush
->get_item_id(name
);
5823 result
.push_back(id
);
5825 int num
= osdmap
.crush
->get_bucket_size(id
);
5826 for (int i
= 0; i
< num
; ++i
) {
5827 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
5831 f
->open_array_section("items");
5832 for (auto i
: result
) {
5833 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
5839 for (auto i
: result
) {
5840 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
5842 rdata
.append(ss
.str());
5845 } else if (prefix
== "osd crush class ls") {
5846 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5847 f
->open_array_section("crush_classes");
5848 for (auto i
: osdmap
.crush
->class_name
)
5849 f
->dump_string("class", i
.second
);
5852 } else if (prefix
== "osd crush class ls-osd") {
5854 cmd_getval(cct
, cmdmap
, "class", name
);
5856 osdmap
.crush
->get_devices_by_class(name
, &osds
);
5858 f
->open_array_section("osds");
5859 for (auto &osd
: osds
)
5860 f
->dump_int("osd", osd
);
5865 for (auto &osd
: osds
) {
5873 } else if (prefix
== "osd crush get-device-class") {
5874 vector
<string
> idvec
;
5875 cmd_getval(cct
, cmdmap
, "ids", idvec
);
5876 map
<int, string
> class_by_osd
;
5877 for (auto& id
: idvec
) {
5879 long osd
= parse_osd_id(id
.c_str(), &ts
);
5881 ss
<< "unable to parse osd id:'" << id
<< "'";
5885 auto device_class
= osdmap
.crush
->get_item_class(osd
);
5887 class_by_osd
[osd
] = device_class
;
5889 class_by_osd
[osd
] = ""; // no class
5892 f
->open_array_section("osd_device_classes");
5893 for (auto& i
: class_by_osd
) {
5894 f
->open_object_section("osd_device_class");
5895 f
->dump_int("osd", i
.first
);
5896 f
->dump_string("device_class", i
.second
);
5902 if (class_by_osd
.size() == 1) {
5903 // for single input, make a clean output
5904 ds
<< class_by_osd
.begin()->second
;
5906 // note that we do not group osds by class here
5907 for (auto it
= class_by_osd
.begin();
5908 it
!= class_by_osd
.end();
5910 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
5911 if (next(it
) != class_by_osd
.end())
5917 } else if (prefix
== "osd erasure-code-profile ls") {
5918 const auto &profiles
= osdmap
.get_erasure_code_profiles();
5920 f
->open_array_section("erasure-code-profiles");
5921 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
5923 f
->dump_string("profile", i
->first
.c_str());
5925 rdata
.append(i
->first
+ "\n");
5932 rdata
.append(rs
.str());
5934 } else if (prefix
== "osd crush weight-set ls") {
5935 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5937 f
->open_array_section("weight_sets");
5938 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5939 f
->dump_string("pool", "(compat)");
5941 for (auto& i
: osdmap
.crush
->choose_args
) {
5943 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
5950 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5953 for (auto& i
: osdmap
.crush
->choose_args
) {
5955 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
5958 rdata
.append(rs
.str());
5960 } else if (prefix
== "osd crush weight-set dump") {
5961 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5963 osdmap
.crush
->dump_choose_args(f
.get());
5965 } else if (prefix
== "osd erasure-code-profile get") {
5967 cmd_getval(cct
, cmdmap
, "name", name
);
5968 if (!osdmap
.has_erasure_code_profile(name
)) {
5969 ss
<< "unknown erasure code profile '" << name
<< "'";
5973 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
5975 f
->open_object_section("profile");
5976 for (map
<string
,string
>::const_iterator i
= profile
.begin();
5980 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
5982 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
5989 rdata
.append(rs
.str());
5991 } else if (prefix
== "osd pool application get") {
5992 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5995 cmd_getval(cct
, cmdmap
, "pool", pool_name
);
5997 cmd_getval(cct
, cmdmap
, "app", app
);
5999 cmd_getval(cct
, cmdmap
, "key", key
);
6001 if (pool_name
.empty()) {
6003 f
->open_object_section("pools");
6004 for (const auto &pool
: osdmap
.pools
) {
6005 std::string
name("<unknown>");
6006 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6007 if (pni
!= osdmap
.pool_name
.end())
6009 f
->open_object_section(name
.c_str());
6010 for (auto &app_pair
: pool
.second
.application_metadata
) {
6011 f
->open_object_section(app_pair
.first
.c_str());
6012 for (auto &kv_pair
: app_pair
.second
) {
6013 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6017 f
->close_section(); // name
6019 f
->close_section(); // pools
6022 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6024 ss
<< "unrecognized pool '" << pool_name
<< "'";
6028 auto p
= osdmap
.get_pg_pool(pool
);
6031 f
->open_object_section(pool_name
.c_str());
6032 for (auto &app_pair
: p
->application_metadata
) {
6033 f
->open_object_section(app_pair
.first
.c_str());
6034 for (auto &kv_pair
: app_pair
.second
) {
6035 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6037 f
->close_section(); // application
6039 f
->close_section(); // pool_name
6044 auto app_it
= p
->application_metadata
.find(app
);
6045 if (app_it
== p
->application_metadata
.end()) {
6046 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6050 // filter by pool + app
6052 f
->open_object_section(app_it
->first
.c_str());
6053 for (auto &kv_pair
: app_it
->second
) {
6054 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6056 f
->close_section(); // application
6060 // filter by pool + app + key
6061 auto key_it
= app_it
->second
.find(key
);
6062 if (key_it
== app_it
->second
.end()) {
6063 ss
<< "application '" << app
<< "' on pool '" << pool_name
6064 << "' does not have key '" << key
<< "'";
6068 ss
<< key_it
->second
<< "\n";
6069 rdata
.append(ss
.str());
6072 } else if (prefix
== "osd get-require-min-compat-client") {
6073 ss
<< ceph_release_name(osdmap
.require_min_compat_client
) << std::endl
;
6074 rdata
.append(ss
.str());
6077 } else if (prefix
== "osd pool application enable" ||
6078 prefix
== "osd pool application disable" ||
6079 prefix
== "osd pool application set" ||
6080 prefix
== "osd pool application rm") {
6081 bool changed
= false;
6082 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6086 } else if (changed
) {
6087 // Valid mutation, proceed to prepare phase
6090 // Idempotent case, reply
6094 // try prepare update
6101 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
6105 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
6107 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6108 osdmap
.get_pg_pool(pool_id
));
6110 pool
->set_flag(flags
);
6113 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
6115 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6116 osdmap
.get_pg_pool(pool_id
));
6118 pool
->unset_flag(flags
);
6121 string
OSDMonitor::make_snap_epoch_key(int64_t pool
, epoch_t epoch
)
6124 snprintf(k
, sizeof(k
), "removed_epoch_%llu_%08lx",
6125 (unsigned long long)pool
, (unsigned long)epoch
);
6129 string
OSDMonitor::make_snap_key(int64_t pool
, snapid_t snap
)
6132 snprintf(k
, sizeof(k
), "removed_snap_%llu_%016llx",
6133 (unsigned long long)pool
, (unsigned long long)snap
);
6138 string
OSDMonitor::make_snap_key_value(
6139 int64_t pool
, snapid_t snap
, snapid_t num
,
6140 epoch_t epoch
, bufferlist
*v
)
6142 // encode the *last* epoch in the key so that we can use forward
6143 // iteration only to search for an epoch in an interval.
6145 encode(snap
+ num
, *v
);
6147 return make_snap_key(pool
, snap
+ num
- 1);
6150 string
OSDMonitor::make_snap_purged_key(int64_t pool
, snapid_t snap
)
6153 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
6154 (unsigned long long)pool
, (unsigned long long)snap
);
6157 string
OSDMonitor::make_snap_purged_key_value(
6158 int64_t pool
, snapid_t snap
, snapid_t num
,
6159 epoch_t epoch
, bufferlist
*v
)
6161 // encode the *last* epoch in the key so that we can use forward
6162 // iteration only to search for an epoch in an interval.
6164 encode(snap
+ num
, *v
);
6166 return make_snap_purged_key(pool
, snap
+ num
- 1);
6169 int OSDMonitor::lookup_pruned_snap(int64_t pool
, snapid_t snap
,
6170 snapid_t
*begin
, snapid_t
*end
)
6172 string k
= make_snap_key(pool
, snap
);
6173 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
6178 if (it
->key().find(OSD_SNAP_PREFIX
) != 0) {
6181 bufferlist v
= it
->value();
6182 auto p
= v
.cbegin();
6185 if (snap
< *begin
|| snap
>= *end
) {
6191 bool OSDMonitor::try_prune_purged_snaps()
6193 if (!mon
->mgrstatmon()->is_readable()) {
6196 if (osdmap
.require_osd_release
< CEPH_RELEASE_MIMIC
) {
6199 if (!pending_inc
.new_purged_snaps
.empty()) {
6200 return false; // we already pruned for this epoch
6203 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
6204 "mon_max_snap_prune_per_epoch");
6208 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
6210 unsigned actually_pruned
= 0;
6211 auto& purged_snaps
= mon
->mgrstatmon()->get_digest().purged_snaps
;
6212 for (auto& p
: osdmap
.get_pools()) {
6213 auto q
= purged_snaps
.find(p
.first
);
6214 if (q
== purged_snaps
.end()) {
6217 auto& purged
= q
->second
;
6218 if (purged
.empty()) {
6219 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
6222 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
6223 OSDMap::snap_interval_set_t to_prune
;
6224 unsigned maybe_pruned
= actually_pruned
;
6225 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
6226 snapid_t begin
= i
.get_start();
6227 auto end
= i
.get_start() + i
.get_len();
6228 snapid_t pbegin
= 0, pend
= 0;
6229 int r
= lookup_pruned_snap(p
.first
, begin
, &pbegin
, &pend
);
6232 // be a bit aggressive about backing off here, because the mon may
6233 // do a lot of work going through this set, and if we know the
6234 // purged set from the OSDs is at least *partly* stale we may as
6235 // well wait for it to be fresh.
6236 dout(20) << __func__
<< " we've already pruned " << pbegin
6237 << "~" << (pend
- pbegin
) << dendl
;
6240 if (pbegin
&& pbegin
< end
) {
6241 // the tail of [begin,end) is purged; shorten the range
6242 ceph_assert(pbegin
> begin
);
6245 to_prune
.insert(begin
, end
- begin
);
6246 maybe_pruned
+= end
- begin
;
6247 if (maybe_pruned
>= max_prune
) {
6251 if (!to_prune
.empty()) {
6252 // PGs may still be reporting things as purged that we have already
6253 // pruned from removed_snaps_queue.
6254 OSDMap::snap_interval_set_t actual
;
6255 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
6256 if (r
!= osdmap
.removed_snaps_queue
.end()) {
6257 actual
.intersection_of(to_prune
, r
->second
);
6259 actually_pruned
+= actual
.size();
6260 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
6261 << ", actual pruned " << actual
<< dendl
;
6262 if (!actual
.empty()) {
6263 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
6266 if (actually_pruned
>= max_prune
) {
6270 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
6271 return !!actually_pruned
;
6274 bool OSDMonitor::update_pools_status()
6276 if (!mon
->mgrstatmon()->is_readable())
6281 auto& pools
= osdmap
.get_pools();
6282 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
6283 const pool_stat_t
*pstat
= mon
->mgrstatmon()->get_pool_stat(it
->first
);
6286 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6287 const pg_pool_t
&pool
= it
->second
;
6288 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
6291 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
6292 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
6294 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
6298 mon
->clog
->info() << "pool '" << pool_name
6299 << "' no longer out of quota; removing NO_QUOTA flag";
6300 // below we cancel FLAG_FULL too, we'll set it again in
6301 // OSDMonitor::encode_pending if it still fails the osd-full checking.
6302 clear_pool_flags(it
->first
,
6303 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
6309 if (pool
.quota_max_bytes
> 0 &&
6310 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
6311 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
6312 << " (reached quota's max_bytes: "
6313 << byte_u_t(pool
.quota_max_bytes
) << ")";
6315 if (pool
.quota_max_objects
> 0 &&
6316 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
6317 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
6318 << " (reached quota's max_objects: "
6319 << pool
.quota_max_objects
<< ")";
6321 // set both FLAG_FULL_QUOTA and FLAG_FULL
6322 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
6323 // since FLAG_FULL should always take precedence
6324 set_pool_flags(it
->first
,
6325 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
6326 clear_pool_flags(it
->first
,
6327 pg_pool_t::FLAG_NEARFULL
|
6328 pg_pool_t::FLAG_BACKFILLFULL
);
6335 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
6337 op
->mark_osdmon_event(__func__
);
6338 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
6339 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
6340 MonSession
*session
= op
->get_session();
6343 string erasure_code_profile
;
6347 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
6349 erasure_code_profile
,
6350 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
6353 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
6358 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
6359 const string
& dstname
,
6364 // Avoid creating a pending crush if it does not already exists and
6365 // the rename would fail.
6367 if (!_have_pending_crush()) {
6368 ret
= _get_stable_crush().can_rename_bucket(srcname
,
6375 CrushWrapper newcrush
;
6376 _get_pending_crush(newcrush
);
6378 ret
= newcrush
.rename_bucket(srcname
,
6384 pending_inc
.crush
.clear();
6385 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6386 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
6390 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
6392 string replacement
= "";
6394 if (plugin
== "jerasure_generic" ||
6395 plugin
== "jerasure_sse3" ||
6396 plugin
== "jerasure_sse4" ||
6397 plugin
== "jerasure_neon") {
6398 replacement
= "jerasure";
6399 } else if (plugin
== "shec_generic" ||
6400 plugin
== "shec_sse3" ||
6401 plugin
== "shec_sse4" ||
6402 plugin
== "shec_neon") {
6403 replacement
= "shec";
6406 if (replacement
!= "") {
6407 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
6408 << plugin
<< " that has been deprecated. Please use "
6409 << replacement
<< " instead." << dendl
;
6413 int OSDMonitor::normalize_profile(const string
& profilename
,
6414 ErasureCodeProfile
&profile
,
6418 ErasureCodeInterfaceRef erasure_code
;
6419 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
6420 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
6421 check_legacy_ec_plugin(plugin
->second
, profilename
);
6422 int err
= instance
.factory(plugin
->second
,
6423 g_conf().get_val
<std::string
>("erasure_code_dir"),
6424 profile
, &erasure_code
, ss
);
6429 err
= erasure_code
->init(profile
, ss
);
6434 auto it
= profile
.find("stripe_unit");
6435 if (it
!= profile
.end()) {
6437 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
6438 if (!err_str
.empty()) {
6439 *ss
<< "could not parse stripe_unit '" << it
->second
6440 << "': " << err_str
<< std::endl
;
6443 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
6444 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
6445 if (chunk_size
!= stripe_unit
) {
6446 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
6447 << "alignment. Would be padded to " << chunk_size
6451 if ((stripe_unit
% 4096) != 0 && !force
) {
6452 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
6453 << "use --force to override this check" << std::endl
;
6460 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
6461 const string
&profile
,
6465 int ruleid
= osdmap
.crush
->get_rule_id(name
);
6466 if (ruleid
!= -ENOENT
) {
6467 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
6471 CrushWrapper newcrush
;
6472 _get_pending_crush(newcrush
);
6474 ruleid
= newcrush
.get_rule_id(name
);
6475 if (ruleid
!= -ENOENT
) {
6476 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
6479 ErasureCodeInterfaceRef erasure_code
;
6480 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
6482 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
6486 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
6487 erasure_code
.reset();
6491 pending_inc
.crush
.clear();
6492 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6497 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
6498 ErasureCodeInterfaceRef
*erasure_code
,
6501 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
6503 ErasureCodeProfile profile
=
6504 osdmap
.get_erasure_code_profile(erasure_code_profile
);
6505 ErasureCodeProfile::const_iterator plugin
=
6506 profile
.find("plugin");
6507 if (plugin
== profile
.end()) {
6508 *ss
<< "cannot determine the erasure code plugin"
6509 << " because there is no 'plugin' entry in the erasure_code_profile "
6510 << profile
<< std::endl
;
6513 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
6514 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
6515 return instance
.factory(plugin
->second
,
6516 g_conf().get_val
<std::string
>("erasure_code_dir"),
6517 profile
, erasure_code
, ss
);
6520 int OSDMonitor::check_cluster_features(uint64_t features
,
6523 stringstream unsupported_ss
;
6524 int unsupported_count
= 0;
6525 if ((mon
->get_quorum_con_features() & features
) != features
) {
6526 unsupported_ss
<< "the monitor cluster";
6527 ++unsupported_count
;
6530 set
<int32_t> up_osds
;
6531 osdmap
.get_up_osds(up_osds
);
6532 for (set
<int32_t>::iterator it
= up_osds
.begin();
6533 it
!= up_osds
.end(); ++it
) {
6534 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
6535 if ((xi
.features
& features
) != features
) {
6536 if (unsupported_count
> 0)
6537 unsupported_ss
<< ", ";
6538 unsupported_ss
<< "osd." << *it
;
6539 unsupported_count
++;
6543 if (unsupported_count
> 0) {
6544 ss
<< "features " << features
<< " unsupported by: "
6545 << unsupported_ss
.str();
6549 // check pending osd state, too!
6550 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
6551 pending_inc
.new_xinfo
.begin();
6552 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
6553 const osd_xinfo_t
&xi
= p
->second
;
6554 if ((xi
.features
& features
) != features
) {
6555 dout(10) << __func__
<< " pending osd." << p
->first
6556 << " features are insufficient; retry" << dendl
;
6564 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
6567 OSDMap::Incremental new_pending
= pending_inc
;
6568 encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
6570 newmap
.deepish_copy_from(osdmap
);
6571 newmap
.apply_incremental(new_pending
);
6574 if (newmap
.require_min_compat_client
> 0) {
6575 auto mv
= newmap
.get_min_compat_client();
6576 if (mv
> newmap
.require_min_compat_client
) {
6577 ss
<< "new crush map requires client version " << ceph_release_name(mv
)
6578 << " but require_min_compat_client is "
6579 << ceph_release_name(newmap
.require_min_compat_client
);
6586 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
6587 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
6588 stringstream features_ss
;
6589 int r
= check_cluster_features(features
, features_ss
);
6591 ss
<< "Could not change CRUSH: " << features_ss
.str();
6598 bool OSDMonitor::erasure_code_profile_in_use(
6599 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
6600 const string
&profile
,
6604 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
6607 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
6608 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
6613 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
6618 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
6619 map
<string
,string
> *erasure_code_profile_map
,
6622 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
6625 erasure_code_profile_map
,
6629 ceph_assert((*erasure_code_profile_map
).count("plugin"));
6630 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
6631 map
<string
,string
> user_map
;
6632 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
6633 i
!= erasure_code_profile
.end();
6635 size_t equal
= i
->find('=');
6636 if (equal
== string::npos
) {
6637 user_map
[*i
] = string();
6638 (*erasure_code_profile_map
)[*i
] = string();
6640 const string key
= i
->substr(0, equal
);
6642 const string value
= i
->substr(equal
);
6643 if (key
.find("ruleset-") == 0) {
6644 *ss
<< "property '" << key
<< "' is no longer supported; try "
6645 << "'crush-" << key
.substr(8) << "' instead";
6648 user_map
[key
] = value
;
6649 (*erasure_code_profile_map
)[key
] = value
;
6653 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
6654 (*erasure_code_profile_map
) = user_map
;
6659 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
6660 const string
&erasure_code_profile
,
6662 unsigned *size
, unsigned *min_size
,
6666 switch (pool_type
) {
6667 case pg_pool_t::TYPE_REPLICATED
:
6668 if (repl_size
== 0) {
6669 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
6672 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
6674 case pg_pool_t::TYPE_ERASURE
:
6676 ErasureCodeInterfaceRef erasure_code
;
6677 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
6679 *size
= erasure_code
->get_chunk_count();
6681 erasure_code
->get_data_chunk_count() +
6682 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
6683 assert(*min_size
<= *size
);
6684 assert(*min_size
>= erasure_code
->get_data_chunk_count());
6689 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
6696 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
6697 const string
&erasure_code_profile
,
6698 uint32_t *stripe_width
,
6702 switch (pool_type
) {
6703 case pg_pool_t::TYPE_REPLICATED
:
6706 case pg_pool_t::TYPE_ERASURE
:
6708 ErasureCodeProfile profile
=
6709 osdmap
.get_erasure_code_profile(erasure_code_profile
);
6710 ErasureCodeInterfaceRef erasure_code
;
6711 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
6714 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
6715 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
6716 auto it
= profile
.find("stripe_unit");
6717 if (it
!= profile
.end()) {
6719 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
6720 ceph_assert(err_str
.empty());
6722 *stripe_width
= data_chunks
*
6723 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
6727 *ss
<< "prepare_pool_stripe_width: "
6728 << pool_type
<< " is not a known pool type";
6735 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
6736 const string
&erasure_code_profile
,
6737 const string
&rule_name
,
6742 if (*crush_rule
< 0) {
6743 switch (pool_type
) {
6744 case pg_pool_t::TYPE_REPLICATED
:
6746 if (rule_name
== "") {
6748 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
6749 if (*crush_rule
< 0) {
6750 // Errors may happen e.g. if no valid rule is available
6751 *ss
<< "No suitable CRUSH rule exists, check "
6752 << "'osd pool default crush *' config options";
6756 return get_crush_rule(rule_name
, crush_rule
, ss
);
6760 case pg_pool_t::TYPE_ERASURE
:
6762 int err
= crush_rule_create_erasure(rule_name
,
6763 erasure_code_profile
,
6767 dout(20) << "prepare_pool_crush_rule: rule "
6768 << rule_name
<< " try again" << dendl
;
6771 // need to wait for the crush rule to be proposed before proceeding
6782 *ss
<< "prepare_pool_crush_rule: " << pool_type
6783 << " is not a known pool type";
6788 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
6789 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
6797 int OSDMonitor::get_crush_rule(const string
&rule_name
,
6802 ret
= osdmap
.crush
->get_rule_id(rule_name
);
6803 if (ret
!= -ENOENT
) {
6807 CrushWrapper newcrush
;
6808 _get_pending_crush(newcrush
);
6810 ret
= newcrush
.get_rule_id(rule_name
);
6811 if (ret
!= -ENOENT
) {
6812 // found it, wait for it to be proposed
6813 dout(20) << __func__
<< ": rule " << rule_name
6814 << " try again" << dendl
;
6817 // Cannot find it , return error
6818 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
6825 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
6827 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
6828 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
6829 auto max_pgs
= max_pgs_per_osd
* num_osds
;
6830 uint64_t projected
= 0;
6832 projected
+= pg_num
* size
;
6834 for (const auto& i
: osdmap
.get_pools()) {
6835 if (i
.first
== pool
) {
6836 projected
+= pg_num
* size
;
6838 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
6841 if (projected
> max_pgs
) {
6843 *ss
<< "pool id " << pool
;
6845 *ss
<< " pg_num " << pg_num
<< " size " << size
6846 << " would mean " << projected
6847 << " total pgs, which exceeds max " << max_pgs
6848 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6849 << " * num_in_osds " << num_osds
<< ")";
6856 * @param name The name of the new pool
6857 * @param crush_rule The crush rule to use. If <0, will use the system default
 6858  * @param crush_rule_name The crush rule to use, if crush_ruleset <0
6859 * @param pg_num The pg_num to use. If set to 0, will use the system default
6860 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6861 * @param repl_size Replication factor, or 0 for default
6862 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6863 * @param pool_type TYPE_ERASURE, or TYPE_REP
6864 * @param expected_num_objects expected number of objects on the pool
6865 * @param fast_read fast read type.
6866 * @param ss human readable error message, if any.
6868 * @return 0 on success, negative errno on failure.
6870 int OSDMonitor::prepare_new_pool(string
& name
,
6872 const string
&crush_rule_name
,
6873 unsigned pg_num
, unsigned pgp_num
,
6874 unsigned pg_num_min
,
6875 const uint64_t repl_size
,
6876 const uint64_t target_size_bytes
,
6877 const float target_size_ratio
,
6878 const string
&erasure_code_profile
,
6879 const unsigned pool_type
,
6880 const uint64_t expected_num_objects
,
6881 FastReadType fast_read
,
6884 if (name
.length() == 0)
6887 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
6889 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
6892 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
6893 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
6894 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
6895 << " (you may adjust 'mon max pool pg num' for higher values)";
6898 if (pgp_num
> pg_num
) {
6899 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6900 << ", which in this case is " << pg_num
;
6903 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
6904 *ss
<< "'fast_read' can only apply to erasure coding pool";
6908 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
6909 crush_rule_name
, &crush_rule
, ss
);
6911 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
6914 if (g_conf()->mon_osd_crush_smoke_test
) {
6915 CrushWrapper newcrush
;
6916 _get_pending_crush(newcrush
);
6918 CrushTester
tester(newcrush
, err
);
6919 tester
.set_min_x(0);
6920 tester
.set_max_x(50);
6921 tester
.set_rule(crush_rule
);
6922 auto start
= ceph::coarse_mono_clock::now();
6923 r
= tester
.test_with_fork(g_conf()->mon_lease
);
6924 auto duration
= ceph::coarse_mono_clock::now() - start
;
6926 dout(10) << "tester.test_with_fork returns " << r
6927 << ": " << err
.str() << dendl
;
6928 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
6931 dout(10) << __func__
<< " crush smoke test duration: "
6932 << duration
<< dendl
;
6934 unsigned size
, min_size
;
6935 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
6936 &size
, &min_size
, ss
);
6938 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
6941 r
= check_pg_num(-1, pg_num
, size
, ss
);
6943 dout(10) << "check_pg_num returns " << r
<< dendl
;
6947 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
6951 uint32_t stripe_width
= 0;
6952 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
6954 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
6959 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
6960 switch (fast_read
) {
6967 case FAST_READ_DEFAULT
:
6968 fread
= g_conf()->osd_pool_default_ec_fast_read
;
6971 *ss
<< "invalid fast_read setting: " << fast_read
;
6976 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
6977 p
!= pending_inc
.new_pool_names
.end();
6979 if (p
->second
== name
)
6983 if (-1 == pending_inc
.new_pool_max
)
6984 pending_inc
.new_pool_max
= osdmap
.pool_max
;
6985 int64_t pool
= ++pending_inc
.new_pool_max
;
6987 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
6988 pi
->create_time
= ceph_clock_now();
6989 pi
->type
= pool_type
;
6990 pi
->fast_read
= fread
;
6991 pi
->flags
= g_conf()->osd_pool_default_flags
;
6992 if (g_conf()->osd_pool_default_flag_hashpspool
)
6993 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
6994 if (g_conf()->osd_pool_default_flag_nodelete
)
6995 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
6996 if (g_conf()->osd_pool_default_flag_nopgchange
)
6997 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
6998 if (g_conf()->osd_pool_default_flag_nosizechange
)
6999 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
7000 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
7001 if (g_conf()->osd_pool_use_gmt_hitset
)
7002 pi
->use_gmt_hitset
= true;
7004 pi
->use_gmt_hitset
= false;
7007 pi
->min_size
= min_size
;
7008 pi
->crush_rule
= crush_rule
;
7009 pi
->expected_num_objects
= expected_num_objects
;
7010 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
7013 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7014 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
7015 pi
->pg_autoscale_mode
= m
>= 0 ? m
: 0;
7017 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
7019 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
7021 pi
->set_pg_num_pending(pi
->get_pg_num());
7022 pi
->set_pg_num_target(pg_num
);
7023 pi
->set_pgp_num(pi
->get_pg_num());
7024 pi
->set_pgp_num_target(pgp_num
);
7025 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
&&
7027 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
7030 pi
->last_change
= pending_inc
.epoch
;
7033 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7034 pi
->erasure_code_profile
= erasure_code_profile
;
7036 pi
->erasure_code_profile
= "";
7038 pi
->stripe_width
= stripe_width
;
7040 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
&&
7041 target_size_bytes
) {
7042 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7043 // larger than int32_t max.
7044 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
7046 if (target_size_ratio
> 0.0 &&
7047 osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
7048 // only store for nautilus+, just to be consistent and tidy.
7049 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
7052 pi
->cache_target_dirty_ratio_micro
=
7053 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
7054 pi
->cache_target_dirty_high_ratio_micro
=
7055 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
7056 pi
->cache_target_full_ratio_micro
=
7057 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
7058 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
7059 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
7061 pending_inc
.new_pool_names
[pool
] = name
;
7065 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
7067 op
->mark_osdmon_event(__func__
);
7069 if (pending_inc
.new_flags
< 0)
7070 pending_inc
.new_flags
= osdmap
.get_flags();
7071 pending_inc
.new_flags
|= flag
;
7072 ss
<< OSDMap::get_flag_string(flag
) << " is set";
7073 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7074 get_last_committed() + 1));
7078 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
7080 op
->mark_osdmon_event(__func__
);
7082 if (pending_inc
.new_flags
< 0)
7083 pending_inc
.new_flags
= osdmap
.get_flags();
7084 pending_inc
.new_flags
&= ~flag
;
7085 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
7086 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7087 get_last_committed() + 1));
7091 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
7095 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
7096 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
7098 ss
<< "unrecognized pool '" << poolstr
<< "'";
7102 cmd_getval(cct
, cmdmap
, "var", var
);
7104 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7105 if (pending_inc
.new_pools
.count(pool
))
7106 p
= pending_inc
.new_pools
[pool
];
7108 // accept val as a json string in the normal case (current
7109 // generation monitor). parse out int or float values from the
7110 // string as needed. however, if it is not a string, try to pull
7111 // out an int, in case an older monitor with an older json schema is
7112 // forwarding a request.
7114 string interr
, floaterr
;
7117 int64_t uf
= 0; // micro-f
7118 cmd_getval(cct
, cmdmap
, "val", val
);
7120 // parse string as both int and float; different fields use different types.
7121 n
= strict_strtoll(val
.c_str(), 10, &interr
);
7122 f
= strict_strtod(val
.c_str(), &floaterr
);
7123 uf
= llrintl(f
* (double)1000000.0);
7126 (var
== "hit_set_type" || var
== "hit_set_period" ||
7127 var
== "hit_set_count" || var
== "hit_set_fpp" ||
7128 var
== "target_max_objects" || var
== "target_max_bytes" ||
7129 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
7130 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
7131 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
7132 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
7133 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
7137 if (var
== "size") {
7138 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7139 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
7142 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
7143 ss
<< "can not change the size of an erasure-coded pool";
7146 if (interr
.length()) {
7147 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7150 if (n
<= 0 || n
> 10) {
7151 ss
<< "pool size must be between 1 and 10";
7154 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
7161 } else if (var
== "min_size") {
7162 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7163 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
7166 if (interr
.length()) {
7167 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7171 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
7172 if (n
< 1 || n
> p
.size
) {
7173 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
7177 ErasureCodeInterfaceRef erasure_code
;
7180 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
7182 k
= erasure_code
->get_data_chunk_count();
7184 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
7188 if (n
< k
|| n
> p
.size
) {
7189 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
7194 } else if (var
== "pg_num_actual") {
7195 if (interr
.length()) {
7196 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7199 if (n
== (int)p
.get_pg_num()) {
7202 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7203 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7204 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7205 << " (you may adjust 'mon max pool pg num' for higher values)";
7208 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
7209 ss
<< "cannot adjust pg_num while initial PGs are being created";
7212 if (n
> (int)p
.get_pg_num()) {
7213 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
7214 // force pre-nautilus clients to resend their ops, since they
7215 // don't understand pg_num_pending changes form a new interval
7216 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
7220 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
7221 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
7224 if (n
< (int)p
.get_pgp_num()) {
7225 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
7228 if (n
< (int)p
.get_pg_num() - 1) {
7229 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
7230 << ") - 1; only single pg decrease is currently supported";
7233 p
.set_pg_num_pending(n
);
7234 // force pre-nautilus clients to resend their ops, since they
7235 // don't understand pg_num_pending changes form a new interval
7236 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
7238 // force pre-luminous clients to resend their ops, since they
7239 // don't understand that split PGs now form a new interval.
7240 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
7241 } else if (var
== "pg_num") {
7242 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
7243 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
7246 if (interr
.length()) {
7247 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7250 if (n
== (int)p
.get_pg_num_target()) {
7253 if (n
<= 0 || static_cast<uint64_t>(n
) >
7254 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7255 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7256 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7257 << " (you may adjust 'mon max pool pg num' for higher values)";
7260 if (n
> (int)p
.get_pg_num_target()) {
7261 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
7266 cmd_getval(cct
,cmdmap
, "yes_i_really_mean_it", force
);
7267 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
7268 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
7272 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
7273 ss
<< "nautilus OSDs are required to decrease pg_num";
7277 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
7278 // pre-nautilus osdmap format; increase pg_num directly
7279 assert(n
> (int)p
.get_pg_num());
7280 // force pre-nautilus clients to resend their ops, since they
7281 // don't understand pg_num_target changes form a new interval
7282 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
7283 // force pre-luminous clients to resend their ops, since they
7284 // don't understand that split PGs now form a new interval.
7285 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
7288 // set targets; mgr will adjust pg_num_actual and pgp_num later.
7289 // make pgp_num track pg_num if it already matches. if it is set
7290 // differently, leave it different and let the user control it
7292 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
7293 p
.set_pgp_num_target(n
);
7295 p
.set_pg_num_target(n
);
7297 } else if (var
== "pgp_num_actual") {
7298 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
7299 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
7302 if (interr
.length()) {
7303 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7307 ss
<< "specified pgp_num must > 0, but you set to " << n
;
7310 if (n
> (int)p
.get_pg_num()) {
7311 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
7314 if (n
> (int)p
.get_pg_num_pending()) {
7315 ss
<< "specified pgp_num " << n
7316 << " > pg_num_pending " << p
.get_pg_num_pending();
7320 } else if (var
== "pgp_num") {
7321 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
7322 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
7325 if (interr
.length()) {
7326 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7330 ss
<< "specified pgp_num must > 0, but you set to " << n
;
7333 if (n
> (int)p
.get_pg_num_target()) {
7334 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
7337 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
7338 // pre-nautilus osdmap format; increase pgp_num directly
7341 p
.set_pgp_num_target(n
);
7343 } else if (var
== "pg_autoscale_mode") {
7344 n
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
7346 ss
<< "specified invalid mode " << val
;
7349 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
7350 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
7353 p
.pg_autoscale_mode
= n
;
7354 } else if (var
== "crush_rule") {
7355 int id
= osdmap
.crush
->get_rule_id(val
);
7356 if (id
== -ENOENT
) {
7357 ss
<< "crush rule " << val
<< " does not exist";
7361 ss
<< cpp_strerror(id
);
7364 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
7368 } else if (var
== "nodelete" || var
== "nopgchange" ||
7369 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
7370 var
== "noscrub" || var
== "nodeep-scrub") {
7371 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
7372 // make sure we only compare against 'n' if we didn't receive a string
7373 if (val
== "true" || (interr
.empty() && n
== 1)) {
7375 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7378 ss
<< "expecting value 'true', 'false', '0', or '1'";
7381 } else if (var
== "hashpspool") {
7382 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
7384 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
7387 ss
<< "are you SURE? this will remap all placement groups in this pool,"
7388 " this triggers large data movement,"
7389 " pass --yes-i-really-mean-it if you really do.";
7392 // make sure we only compare against 'n' if we didn't receive a string
7393 if (val
== "true" || (interr
.empty() && n
== 1)) {
7395 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7398 ss
<< "expecting value 'true', 'false', '0', or '1'";
7401 } else if (var
== "hit_set_type") {
7403 p
.hit_set_params
= HitSet::Params();
7405 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
7408 if (val
== "bloom") {
7409 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
7410 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
7411 p
.hit_set_params
= HitSet::Params(bsp
);
7412 } else if (val
== "explicit_hash")
7413 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
7414 else if (val
== "explicit_object")
7415 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
7417 ss
<< "unrecognized hit_set type '" << val
<< "'";
7421 } else if (var
== "hit_set_period") {
7422 if (interr
.length()) {
7423 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7426 ss
<< "hit_set_period should be non-negative";
7429 p
.hit_set_period
= n
;
7430 } else if (var
== "hit_set_count") {
7431 if (interr
.length()) {
7432 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7435 ss
<< "hit_set_count should be non-negative";
7438 p
.hit_set_count
= n
;
7439 } else if (var
== "hit_set_fpp") {
7440 if (floaterr
.length()) {
7441 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
7443 } else if (f
< 0 || f
> 1.0) {
7444 ss
<< "hit_set_fpp should be in the range 0..1";
7447 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
7448 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
7451 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
7453 } else if (var
== "use_gmt_hitset") {
7454 if (val
== "true" || (interr
.empty() && n
== 1)) {
7455 p
.use_gmt_hitset
= true;
7457 ss
<< "expecting value 'true' or '1'";
7460 } else if (var
== "allow_ec_overwrites") {
7461 if (!p
.is_erasure()) {
7462 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
7466 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
7467 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
7468 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
7471 if (val
== "true" || (interr
.empty() && n
== 1)) {
7472 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
7473 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7474 ss
<< "ec overwrites cannot be disabled once enabled";
7477 ss
<< "expecting value 'true', 'false', '0', or '1'";
7480 } else if (var
== "target_max_objects") {
7481 if (interr
.length()) {
7482 ss
<< "error parsing int '" << val
<< "': " << interr
;
7485 p
.target_max_objects
= n
;
7486 } else if (var
== "target_max_bytes") {
7487 if (interr
.length()) {
7488 ss
<< "error parsing int '" << val
<< "': " << interr
;
7491 p
.target_max_bytes
= n
;
7492 } else if (var
== "cache_target_dirty_ratio") {
7493 if (floaterr
.length()) {
7494 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
7497 if (f
< 0 || f
> 1.0) {
7498 ss
<< "value must be in the range 0..1";
7501 p
.cache_target_dirty_ratio_micro
= uf
;
7502 } else if (var
== "cache_target_dirty_high_ratio") {
7503 if (floaterr
.length()) {
7504 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
7507 if (f
< 0 || f
> 1.0) {
7508 ss
<< "value must be in the range 0..1";
7511 p
.cache_target_dirty_high_ratio_micro
= uf
;
7512 } else if (var
== "cache_target_full_ratio") {
7513 if (floaterr
.length()) {
7514 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
7517 if (f
< 0 || f
> 1.0) {
7518 ss
<< "value must be in the range 0..1";
7521 p
.cache_target_full_ratio_micro
= uf
;
7522 } else if (var
== "cache_min_flush_age") {
7523 if (interr
.length()) {
7524 ss
<< "error parsing int '" << val
<< "': " << interr
;
7527 p
.cache_min_flush_age
= n
;
7528 } else if (var
== "cache_min_evict_age") {
7529 if (interr
.length()) {
7530 ss
<< "error parsing int '" << val
<< "': " << interr
;
7533 p
.cache_min_evict_age
= n
;
7534 } else if (var
== "min_read_recency_for_promote") {
7535 if (interr
.length()) {
7536 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7539 p
.min_read_recency_for_promote
= n
;
7540 } else if (var
== "hit_set_grade_decay_rate") {
7541 if (interr
.length()) {
7542 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7545 if (n
> 100 || n
< 0) {
7546 ss
<< "value out of range,valid range is 0 - 100";
7549 p
.hit_set_grade_decay_rate
= n
;
7550 } else if (var
== "hit_set_search_last_n") {
7551 if (interr
.length()) {
7552 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7555 if (n
> p
.hit_set_count
|| n
< 0) {
7556 ss
<< "value out of range,valid range is 0 - hit_set_count";
7559 p
.hit_set_search_last_n
= n
;
7560 } else if (var
== "min_write_recency_for_promote") {
7561 if (interr
.length()) {
7562 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7565 p
.min_write_recency_for_promote
= n
;
7566 } else if (var
== "fast_read") {
7567 if (p
.is_replicated()) {
7568 ss
<< "fast read is not supported in replication pool";
7571 if (val
== "true" || (interr
.empty() && n
== 1)) {
7573 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7574 p
.fast_read
= false;
7576 ss
<< "expecting value 'true', 'false', '0', or '1'";
7579 } else if (pool_opts_t::is_opt_name(var
)) {
7580 bool unset
= val
== "unset";
7581 if (var
== "compression_mode") {
7583 auto cmode
= Compressor::get_comp_mode_type(val
);
7585 ss
<< "unrecognized compression mode '" << val
<< "'";
7589 } else if (var
== "compression_algorithm") {
7591 auto alg
= Compressor::get_comp_alg_type(val
);
7593 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
7597 } else if (var
== "compression_required_ratio") {
7598 if (floaterr
.length()) {
7599 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
7602 if (f
< 0 || f
> 1) {
7603 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
7606 } else if (var
== "csum_type") {
7607 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
7609 ss
<< "unrecognized csum_type '" << val
<< "'";
7612 //preserve csum_type numeric value
7615 } else if (var
== "compression_max_blob_size" ||
7616 var
== "compression_min_blob_size" ||
7617 var
== "csum_max_block" ||
7618 var
== "csum_min_block") {
7619 if (interr
.length()) {
7620 ss
<< "error parsing int value '" << val
<< "': " << interr
;
7623 } else if (var
== "fingerprint_algorithm") {
7625 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
7627 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
7631 } else if (var
== "pg_num_min") {
7632 if (interr
.length()) {
7633 ss
<< "error parsing int value '" << val
<< "': " << interr
;
7636 if (n
> (int)p
.get_pg_num_target()) {
7637 ss
<< "specified pg_num_min " << n
7638 << " > pg_num " << p
.get_pg_num_target();
7641 } else if (var
== "recovery_priority") {
7642 if (interr
.length()) {
7643 ss
<< "error parsing int value '" << val
<< "': " << interr
;
7646 if (!g_conf()->debug_allow_any_pool_priority
) {
7647 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
7648 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
7649 << " and " << OSD_POOL_PRIORITY_MAX
;
7653 } else if (var
== "pg_autoscale_bias") {
7654 if (f
< 0.0 || f
> 1000.0) {
7655 ss
<< "pg_autoscale_bias must be between 0 and 1000";
7660 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
7661 switch (desc
.type
) {
7662 case pool_opts_t::STR
:
7664 p
.opts
.unset(desc
.key
);
7666 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
7669 case pool_opts_t::INT
:
7670 if (interr
.length()) {
7671 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7675 p
.opts
.unset(desc
.key
);
7677 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
7680 case pool_opts_t::DOUBLE
:
7681 if (floaterr
.length()) {
7682 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
7686 p
.opts
.unset(desc
.key
);
7688 p
.opts
.set(desc
.key
, static_cast<double>(f
));
7692 ceph_assert(!"unknown type");
7695 ss
<< "unrecognized variable '" << var
<< "'";
7698 if (val
!= "unset") {
7699 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
7701 ss
<< "unset pool " << pool
<< " " << var
;
7703 p
.last_change
= pending_inc
.epoch
;
7704 pending_inc
.new_pools
[pool
] = p
;
7708 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
7709 const cmdmap_t
& cmdmap
,
7712 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
7715 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
7716 const cmdmap_t
& cmdmap
,
7720 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
7725 * Common logic for preprocess and prepare phases of pool application
7726 * tag commands. In preprocess mode we're only detecting invalid
7727 * commands, and determining whether it was a modification or a no-op.
7728 * In prepare mode we're actually updating the pending state.
7730 int OSDMonitor::_command_pool_application(const string
&prefix
,
7731 const cmdmap_t
& cmdmap
,
7737 cmd_getval(cct
, cmdmap
, "pool", pool_name
);
7738 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
7740 ss
<< "unrecognized pool '" << pool_name
<< "'";
7744 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7746 if (pending_inc
.new_pools
.count(pool
)) {
7747 p
= pending_inc
.new_pools
[pool
];
7752 cmd_getval(cct
, cmdmap
, "app", app
);
7753 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
7756 cmd_getval(cct
, cmdmap
, "key", key
);
7758 ss
<< "key cannot be 'all'";
7763 cmd_getval(cct
, cmdmap
, "value", value
);
7764 if (value
== "all") {
7765 ss
<< "value cannot be 'all'";
7769 if (boost::algorithm::ends_with(prefix
, "enable")) {
7771 ss
<< "application name must be provided";
7776 ss
<< "application must be enabled on base tier";
7781 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
7783 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
7784 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
7785 << "application; pass --yes-i-really-mean-it to proceed anyway";
7789 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
7790 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
7791 << "max " << MAX_POOL_APPLICATIONS
;
7795 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7796 ss
<< "application name '" << app
<< "' too long; max length "
7797 << MAX_POOL_APPLICATION_LENGTH
;
7802 p
.application_metadata
[app
] = {};
7804 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
7806 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
7808 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
7811 ss
<< "Are you SURE? Disabling an application within a pool might result "
7812 << "in loss of application functionality; pass "
7813 << "--yes-i-really-mean-it to proceed anyway";
7818 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7820 return 0; // idempotent
7823 p
.application_metadata
.erase(app
);
7824 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
7826 } else if (boost::algorithm::ends_with(prefix
, "set")) {
7828 ss
<< "application metadata must be set on base tier";
7833 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7839 cmd_getval(cct
, cmdmap
, "key", key
);
7842 ss
<< "key must be provided";
7846 auto &app_keys
= p
.application_metadata
[app
];
7847 if (app_keys
.count(key
) == 0 &&
7848 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
7849 ss
<< "too many keys set for application '" << app
<< "' on pool '"
7850 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
7854 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7855 ss
<< "key '" << app
<< "' too long; max length "
7856 << MAX_POOL_APPLICATION_LENGTH
;
7861 cmd_getval(cct
, cmdmap
, "value", value
);
7862 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7863 ss
<< "value '" << value
<< "' too long; max length "
7864 << MAX_POOL_APPLICATION_LENGTH
;
7868 p
.application_metadata
[app
][key
] = value
;
7869 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
7870 << value
<< "' on pool '" << pool_name
<< "'";
7871 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
7873 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7879 cmd_getval(cct
, cmdmap
, "key", key
);
7880 auto it
= p
.application_metadata
[app
].find(key
);
7881 if (it
== p
.application_metadata
[app
].end()) {
7882 ss
<< "application '" << app
<< "' on pool '" << pool_name
7883 << "' does not have key '" << key
<< "'";
7884 return 0; // idempotent
7887 p
.application_metadata
[app
].erase(it
);
7888 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
7889 << pool_name
<< "'";
7895 p
.last_change
= pending_inc
.epoch
;
7896 pending_inc
.new_pools
[pool
] = p
;
7899 // Because we fell through this far, we didn't hit no-op cases,
7900 // so pool was definitely modified
7901 if (modified
!= nullptr) {
7908 int OSDMonitor::_prepare_command_osd_crush_remove(
7909 CrushWrapper
&newcrush
,
7918 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
7921 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
7926 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
7928 pending_inc
.crush
.clear();
7929 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7932 int OSDMonitor::prepare_command_osd_crush_remove(
7933 CrushWrapper
&newcrush
,
7939 int err
= _prepare_command_osd_crush_remove(
7940 newcrush
, id
, ancestor
,
7941 has_ancestor
, unlink_only
);
7946 ceph_assert(err
== 0);
7947 do_osd_crush_remove(newcrush
);
7952 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
7954 if (osdmap
.is_up(id
)) {
7958 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
7959 pending_inc
.new_uuid
[id
] = uuid_d();
7960 pending_metadata_rm
.insert(id
);
7961 pending_metadata
.erase(id
);
7966 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
7968 ceph_assert(existing_id
);
7971 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
7972 if (!osdmap
.exists(i
) &&
7973 pending_inc
.new_up_client
.count(i
) == 0 &&
7974 (pending_inc
.new_state
.count(i
) == 0 ||
7975 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
7981 if (pending_inc
.new_max_osd
< 0) {
7982 return osdmap
.get_max_osd();
7984 return pending_inc
.new_max_osd
;
// Actually stage creation of an osd in pending_inc, resolving the id to
// use (caller-provided id, an existing uuid match, a reusable slot, or a
// brand-new id past max_osd), optionally assigning a crush device class,
// and finally marking the id EXISTS|NEW. The chosen id is returned via
// *new_id.
// NOTE(review): garbled extraction -- many original lines are elided
// (e.g. 7988-7989, 7991-7992, 8006-8014, 8029-8038, 8044-8064), so the
// full parameter list, several closing braces, and error/log tails are
// not visible here.
7987 void OSDMonitor::do_osd_create(
7990 const string
& device_class
,
7993 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
7994 ceph_assert(new_id
);
7996 // We presume validation has been performed prior to calling this
7997 // function. We assert with prejudice.
7999 int32_t allocated_id
= -1; // declare here so we can jump
8000 int32_t existing_id
= -1;
// Case 1: uuid supplied and already known -- reuse that osd's id.
8001 if (!uuid
.is_zero()) {
8002 existing_id
= osdmap
.identify_osd(uuid
);
8003 if (existing_id
>= 0) {
8004 ceph_assert(id
< 0 || id
== existing_id
);
8005 *new_id
= existing_id
;
8007 } else if (id
>= 0) {
8008 // uuid does not exist, and id has been provided, so just create
8015 // allocate a new id
8016 allocated_id
= _allocate_osd_id(&existing_id
);
8017 dout(10) << __func__
<< " allocated id " << allocated_id
8018 << " existing id " << existing_id
<< dendl
;
// Reusing a previously-freed slot: weight it OUT so it does not take
// data until an operator brings it in.
8019 if (existing_id
>= 0) {
8020 ceph_assert(existing_id
< osdmap
.get_max_osd());
8021 ceph_assert(allocated_id
< 0);
8022 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
8023 *new_id
= existing_id
;
// Fresh id: grow (pending) max_osd by one and take the last slot.
8024 } else if (allocated_id
>= 0) {
8025 ceph_assert(existing_id
< 0);
8027 if (pending_inc
.new_max_osd
< 0) {
8028 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
8030 ++pending_inc
.new_max_osd
;
8032 *new_id
= pending_inc
.new_max_osd
- 1;
8033 ceph_assert(*new_id
== allocated_id
);
8035 ceph_abort_msg("unexpected condition");
// Optional: record the requested crush device class on a working copy of
// the crush map and stage it in pending_inc.
8039 if (device_class
.size()) {
8040 CrushWrapper newcrush
;
8041 _get_pending_crush(newcrush
);
8042 if (newcrush
.get_max_devices() < *new_id
+ 1) {
8043 newcrush
.set_max_devices(*new_id
+ 1);
8045 string name
= string("osd.") + stringify(*new_id
);
8046 if (!newcrush
.item_exists(*new_id
)) {
8047 newcrush
.set_item_name(*new_id
, name
);
8050 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
8052 derr
<< __func__
<< " failed to set " << name
<< " device_class "
8053 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
8055 // non-fatal... this might be a replay and we want to be idempotent.
8057 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
8059 pending_inc
.crush
.clear();
8060 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8063 dout(20) << __func__
<< " no device_class" << dendl
;
// Finalize: make sure max_osd covers the chosen id and mark it as an
// existing, brand-new osd; remember the uuid if one was supplied.
8066 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
8067 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
8068 pending_inc
.new_max_osd
= *new_id
+ 1;
8071 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
8072 if (!uuid
.is_zero())
8073 pending_inc
.new_uuid
[*new_id
] = uuid
;
// Validate an (id, uuid) pair ahead of osd creation, shared by
// `osd create` and `osd new`. On a uuid match with an existing osd the
// matching id is reported via *existing_id; a positive errno (see the
// comment at orig. 8124) signals "already exists / idempotent" rather
// than a hard failure.
// NOTE(review): garbled extraction -- original lines 8077-8078, 8081-8083,
// 8086, 8091-8092, 8096-8099, 8114-8116, 8118, 8122-8123, 8127-8131,
// 8134-8135, 8137 and the tail are elided; several return statements and
// the exact errno values are not visible here.
8076 int OSDMonitor::validate_osd_create(
8079 const bool check_osd_exists
,
8080 int32_t* existing_id
,
8084 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
8085 << " check_osd_exists " << check_osd_exists
<< dendl
;
8087 ceph_assert(existing_id
);
// No id and no uuid: legacy `osd create` with nothing to check.
8089 if (id
< 0 && uuid
.is_zero()) {
8090 // we have nothing to validate
8093 } else if (uuid
.is_zero()) {
8094 // we have an id but we will ignore it - because that's what
8095 // `osd create` does.
8100 * This function will be used to validate whether we are able to
8101 * create a new osd when the `uuid` is specified.
8103 * It will be used by both `osd create` and `osd new`, as the checks
8104 * are basically the same when it pertains to osd id and uuid validation.
8105 * However, `osd create` presumes an `uuid` is optional, for legacy
8106 * reasons, while `osd new` requires the `uuid` to be provided. This
8107 * means that `osd create` will not be idempotent if an `uuid` is not
8108 * provided, but we will always guarantee the idempotency of `osd new`.
8111 ceph_assert(!uuid
.is_zero());
// The uuid may already be staged in the pending incremental...
8112 if (pending_inc
.identify_osd(uuid
) >= 0) {
8113 // osd is about to exist
// ...or committed in the osdmap proper.
8117 int32_t i
= osdmap
.identify_osd(uuid
);
8119 // osd already exists
8120 if (id
>= 0 && i
!= id
) {
8121 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
8124 // return a positive errno to distinguish between a blocking error
8125 // and an error we consider to not be a problem (i.e., this would be
8126 // an idempotent operation).
// uuid unknown but an explicit id was given: reject ids that are already
// pending or (unless recreating a destroyed osd) already exist.
8132 if (pending_inc
.new_state
.count(id
)) {
8133 // osd is about to exist
8136 // we may not care if an osd exists if we are recreating a previously
8138 if (check_osd_exists
&& osdmap
.exists(id
)) {
8139 ss
<< "id " << id
<< " already in use and does not match uuid "
// Front-end for the legacy `osd create` command: refuse destroyed ids
// (operators must use `osd new`), allow the zero-uuid legacy path, and
// otherwise defer to validate_osd_create().
// NOTE(review): garbled extraction -- original lines 8148-8152, 8157-8160
// and 8163-8164 are elided (remaining parameters and the deprecation
// error return are not visible here).
8147 int OSDMonitor::prepare_command_osd_create(
8150 int32_t* existing_id
,
8153 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
8154 ceph_assert(existing_id
);
8155 if (osdmap
.is_destroyed(id
)) {
8156 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
8161 if (uuid
.is_zero()) {
8162 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
// check_osd_exists=true: plain `osd create` never recreates an existing id.
8165 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
// Handle `osd new`: validate uuid/id, resolve the id to create or the
// destroyed id to recreate, validate and install auth secrets (cephx,
// optional lockbox + dm-crypt key via the config-key service), and stage
// the osdmap changes. Requires paxos to be plugged so the authmon and
// config-key updates land in the same proposal window.
// NOTE(review): garbled extraction -- numerous original lines are elided
// (e.g. 8169, 8172-8178, 8205, 8208-8216, 8228-8229, 8235-8239, 8287-8293,
// 8328-8340, 8346-8353, 8374-8398, 8420-8424 tails), so return paths,
// several closing braces, and the auth call argument lists are not
// visible here.
8168 int OSDMonitor::prepare_command_osd_new(
8170 const cmdmap_t
& cmdmap
,
8171 const map
<string
,string
>& params
,
8179 ceph_assert(paxos
->is_plugged());
8181 dout(10) << __func__
<< " " << op
<< dendl
;
8183 /* validate command. abort now if something's wrong. */
8185 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
8187 * If `id` is not specified, we will identify any existing osd based
8188 * on `uuid`. Operation will be idempotent iff secrets match.
8190 * If `id` is specified, we will identify any existing osd based on
8191 * `uuid` and match against `id`. If they match, operation will be
8192 * idempotent iff secrets match.
8194 * `-i secrets.json` will be optional. If supplied, will be used
8195 * to check for idempotency when `id` and `uuid` match.
8197 * If `id` is not specified, and `uuid` does not exist, an id will
8198 * be found or allocated for the osd.
8200 * If `id` is specified, and the osd has been previously marked
8201 * as destroyed, then the `id` will be reused.
8203 if (!cmd_getval(cct
, cmdmap
, "uuid", uuidstr
)) {
8204 ss
<< "requires the OSD's UUID to be specified.";
8206 } else if (!uuid
.parse(uuidstr
.c_str())) {
8207 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
8211 if (cmd_getval(cct
, cmdmap
, "id", id
) &&
8213 ss
<< "invalid OSD id; must be greater or equal than zero.";
8217 // are we running an `osd create`-like command, or recreating
8218 // a previously destroyed osd?
8220 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
8222 // we will care about `id` to assess whether osd is `destroyed`, or
8223 // to create a new osd.
8224 // we will need an `id` by the time we reach auth.
8226 int32_t existing_id
= -1;
8227 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
8230 bool may_be_idempotent
= false;
8231 if (err
== EEXIST
) {
8232 // this is idempotent from the osdmon's point-of-view
8233 may_be_idempotent
= true;
8234 ceph_assert(existing_id
>= 0);
8236 } else if (err
< 0) {
8240 if (!may_be_idempotent
) {
8241 // idempotency is out of the window. We are either creating a new
8242 // osd or recreating a destroyed osd.
8244 // We now need to figure out if we have an `id` (and if it's valid),
8245 // or find an `id` if we don't have one.
8247 // NOTE: we need to consider the case where the `id` is specified for
8248 // `osd create`, and we must honor it. So this means checking if
8249 // the `id` is destroyed, and if so assume the destroy; otherwise,
8250 // check if it `exists` - in which case we complain about not being
8251 // `destroyed`. In the end, if nothing fails, we must allow the
8252 // creation, so that we are compatible with `create`.
8253 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
8254 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
8255 ss
<< "OSD " << id
<< " has not yet been destroyed";
8257 } else if (id
< 0) {
8259 id
= _allocate_osd_id(&existing_id
);
8261 ceph_assert(existing_id
>= 0);
8264 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
8265 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
8266 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
8268 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
8271 ceph_assert(id
>= 0);
8272 ceph_assert(osdmap
.exists(id
));
8275 // we are now able to either create a brand new osd or reuse an existing
8276 // osd that has been previously destroyed.
8278 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
// Fast path: osd already exists and no params were supplied -- report the
// id and do nothing.
8280 if (may_be_idempotent
&& params
.empty()) {
8281 // nothing to do, really.
8282 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
8283 ceph_assert(id
>= 0);
8285 f
->open_object_section("created_osd");
8286 f
->dump_int("osdid", id
);
// Optional crush device class carried in the params map.
8294 string device_class
;
8295 auto p
= params
.find("crush_device_class");
8296 if (p
!= params
.end()) {
8297 device_class
= p
->second
;
8298 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
8300 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
8301 bool has_lockbox
= false;
8302 bool has_secrets
= params
.count("cephx_secret")
8303 || params
.count("cephx_lockbox_secret")
8304 || params
.count("dmcrypt_key");
8306 ConfigKeyService
*svc
= nullptr;
8307 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
// Secrets, if supplied, must include at least the cephx secret; lockbox
// secret and dm-crypt key must be given together (or not at all).
8310 if (params
.count("cephx_secret") == 0) {
8311 ss
<< "requires a cephx secret.";
8314 cephx_secret
= params
.at("cephx_secret");
8316 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
8317 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
8319 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
8320 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
8322 if (has_lockbox_secret
&& has_dmcrypt_key
) {
8324 lockbox_secret
= params
.at("cephx_lockbox_secret");
8325 dmcrypt_key
= params
.at("dmcrypt_key");
8326 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
8327 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
// Validate the cephx (and lockbox) secrets against the auth monitor; a
// mismatch on an otherwise-idempotent request is an error.
8331 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
8333 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
8341 } else if (may_be_idempotent
&& err
!= EEXIST
) {
8342 // for this to be idempotent, `id` should already be >= 0; no need
8343 // to use validate_id.
8344 ceph_assert(id
>= 0);
8345 ss
<< "osd." << id
<< " exists but secrets do not match";
// Validate the dm-crypt key against the config-key service.
8350 svc
= (ConfigKeyService
*)mon
->config_key_service
;
8351 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
8354 } else if (may_be_idempotent
&& err
!= EEXIST
) {
8355 ceph_assert(id
>= 0);
8356 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
8361 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
8362 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
8364 if (may_be_idempotent
) {
8365 // we have nothing to do for either the osdmon or the authmon,
8366 // and we have no lockbox - so the config key service will not be
8367 // touched. This is therefore an idempotent operation, and we can
8368 // just return right away.
8369 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
8370 ceph_assert(id
>= 0);
8372 f
->open_object_section("created_osd");
8373 f
->dump_int("osdid", id
);
// Past this point we are definitely mutating state.
8380 ceph_assert(!may_be_idempotent
);
8384 ceph_assert(!cephx_secret
.empty());
8385 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
8386 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
8388 err
= mon
->authmon()->do_osd_new(cephx_entity
,
8391 ceph_assert(0 == err
);
8394 ceph_assert(nullptr != svc
);
8395 svc
->do_osd_new(uuid
, dmcrypt_key
);
// Recreating a destroyed osd reuses the id in place: weight it OUT, keep
// DESTROYED staged, toggle NEW on, and refresh the uuid.
8399 if (is_recreate_destroyed
) {
8400 ceph_assert(id
>= 0);
8401 ceph_assert(osdmap
.is_destroyed(id
));
8402 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
8403 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
8404 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
8405 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
8407 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
8408 // due to http://tracker.ceph.com/issues/20751 some clusters may
8409 // have UP set for non-existent OSDs; make sure it is cleared
8410 // for a newly created osd.
8411 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
8413 pending_inc
.new_uuid
[id
] = uuid
;
// Brand-new osd: delegate the actual staging to do_osd_create() and
// verify it settles on the id we resolved above.
8415 ceph_assert(id
>= 0);
8416 int32_t new_id
= -1;
8417 do_osd_create(id
, uuid
, device_class
, &new_id
);
8418 ceph_assert(new_id
>= 0);
8419 ceph_assert(id
== new_id
);
8423 f
->open_object_section("created_osd");
8424 f
->dump_int("osdid", id
);
// Entry point for write-side mon commands routed to the OSD monitor:
// parse the command JSON, verify the caller has a session, and dispatch
// to prepare_command_impl(). Replies directly (EINVAL/EACCES) on failure.
// NOTE(review): garbled extraction -- original lines 8434, 8437-8438,
// 8442-8444, 8446 and 8449-8451 are elided (variable declarations, the
// null-session test, and intermediate returns are not visible here).
8433 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
8435 op
->mark_osdmon_event(__func__
);
8436 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
8439 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
8440 string rs
= ss
.str();
8441 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
8445 MonSession
*session
= op
->get_session();
8447 derr
<< __func__
<< " no session" << dendl
;
8448 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
8452 return prepare_command_impl(op
, cmdmap
);
// Parse the "weights" command argument -- a JSON object mapping osd-id
// strings to weight strings (single quotes tolerated and normalized to
// double quotes) -- into *weights, rejecting unknown osd ids and
// non-string weight values. std::stoi/std::stoul failures are caught via
// the std::logic_error handler at the bottom.
// NOTE(review): garbled extraction -- original lines 8459-8460, 8462-8463,
// 8467-8468, 8470-8471, 8473, 8477-8478, 8480-8481, 8484 and 8486+ are
// elided (the try block opening, error returns, and the catch body are
// not visible here).
8455 static int parse_reweights(CephContext
*cct
,
8456 const cmdmap_t
& cmdmap
,
8457 const OSDMap
& osdmap
,
8458 map
<int32_t, uint32_t>* weights
)
8461 if (!cmd_getval(cct
, cmdmap
, "weights", weights_str
)) {
8464 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
8465 json_spirit::mValue json_value
;
8466 if (!json_spirit::read(weights_str
, json_value
)) {
8469 if (json_value
.type() != json_spirit::obj_type
) {
8472 const auto obj
= json_value
.get_obj();
8474 for (auto& osd_weight
: obj
) {
8475 auto osd_id
= std::stoi(osd_weight
.first
);
8476 if (!osdmap
.exists(osd_id
)) {
8479 if (osd_weight
.second
.type() != json_spirit::str_type
) {
8482 auto weight
= std::stoul(osd_weight
.second
.get_str());
8483 weights
->insert({osd_id
, weight
});
8485 } catch (const std::logic_error
& e
) {
// Stage destruction of an osd: tear down its auth entities and
// config-key entries (each skipped if already gone, for idempotency),
// then mark the osd DESTROYED with a zeroed uuid in pending_inc. The
// caller is responsible for proposing; requires a plugged paxos.
// NOTE(review): garbled extraction -- original lines 8492-8494, 8496,
// 8500, 8507-8509, 8513, 8517-8519, 8522, 8524-8527, 8530-8534, 8537,
// 8539, 8541-8542, 8546-8547, 8550-8551, 8554, 8558 and the tail are
// elided (returns, brace closures and the validate_osd_destroy argument
// tails are not visible here).
8491 int OSDMonitor::prepare_command_osd_destroy(
8495 ceph_assert(paxos
->is_plugged());
8497 // we check if the osd exists for the benefit of `osd purge`, which may
8498 // have previously removed the osd. If the osd does not exist, return
8499 // -ENOENT to convey this, and let the caller deal with it.
8501 // we presume that all auth secrets and config keys were removed prior
8502 // to this command being called. if they exist by now, we also assume
8503 // they must have been created by some other command and do not pertain
8504 // to this non-existent osd.
8505 if (!osdmap
.exists(id
)) {
8506 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
8510 uuid_d uuid
= osdmap
.get_uuid(id
);
8511 dout(10) << __func__
<< " destroying osd." << id
8512 << " uuid " << uuid
<< dendl
;
8514 // if it has been destroyed, we assume our work here is done.
8515 if (osdmap
.is_destroyed(id
)) {
8516 ss
<< "destroyed osd." << id
;
8520 EntityName cephx_entity
, lockbox_entity
;
8521 bool idempotent_auth
= false, idempotent_cks
= false;
// Resolve the osd's auth entities; -ENOENT means they are already gone
// and the auth step can be skipped.
8523 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
8528 if (err
== -ENOENT
) {
8529 idempotent_auth
= true;
// Same idempotency dance for the config-key (dm-crypt) entries.
8535 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
8536 err
= svc
->validate_osd_destroy(id
, uuid
);
8538 ceph_assert(err
== -ENOENT
);
8540 idempotent_cks
= true;
8543 if (!idempotent_auth
) {
8544 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
8545 ceph_assert(0 == err
);
8548 if (!idempotent_cks
) {
8549 svc
->do_osd_destroy(id
, uuid
);
8552 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
8553 pending_inc
.new_uuid
[id
] = uuid_d();
8555 // we can only propose_pending() once per service, otherwise we'll be
8556 // defying PaxosService and all laws of nature. Therefore, as we may
8557 // be used during 'osd purge', let's keep the caller responsible for
8559 ceph_assert(err
== 0);
// Stage a full purge of a (down) osd: check the crush removal will work,
// destroy the osd (auth + config keys + DESTROYED state), remove it from
// the osdmap, and finally apply the deferred crush removal -- in that
// order, so nothing destructive happens until the fallible steps have
// passed. The caller is responsible for proposing; paxos must be plugged.
// NOTE(review): garbled extraction -- original lines 8564-8566, 8569,
// 8571-8572, 8574, 8587-8588, 8591, 8593, 8596, 8600-8602, 8606,
8// 608-8612, 8614-8615, 8617, 8621-8623, 8627 and the tail are elided
// (returns and several brace closures are not visible here).
8563 int OSDMonitor::prepare_command_osd_purge(
8567 ceph_assert(paxos
->is_plugged());
8568 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
8570 ceph_assert(!osdmap
.is_up(id
));
8573 * This may look a bit weird, but this is what's going to happen:
8575 * 1. we make sure that removing from crush works
8576 * 2. we call `prepare_command_osd_destroy()`. If it returns an
8577 * error, then we abort the whole operation, as no updates
8578 * have been made. However, this function will have
8579 * side-effects, thus we need to make sure that all operations
8580 * performed henceforth will *always* succeed.
8581 * 3. we call `prepare_command_osd_remove()`. Although this
8582 * function can return an error, it currently only checks if the
8583 * osd is up - and we have made sure that it is not so, so there
8584 * is no conflict, and it is effectively an update.
8585 * 4. finally, we call `do_osd_crush_remove()`, which will perform
8586 * the crush update we delayed from before.
8589 CrushWrapper newcrush
;
8590 _get_pending_crush(newcrush
);
8592 bool may_be_idempotent
= false;
// Dry-run the crush removal on a working copy; -ENOENT means the item is
// already gone, which keeps the purge idempotent.
8594 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
8595 if (err
== -ENOENT
) {
8597 may_be_idempotent
= true;
8598 } else if (err
< 0) {
8599 ss
<< "error removing osd." << id
<< " from crush";
8603 // no point destroying the osd again if it has already been marked destroyed
8604 if (!osdmap
.is_destroyed(id
)) {
8605 err
= prepare_command_osd_destroy(id
, ss
);
8607 if (err
== -ENOENT
) {
8613 may_be_idempotent
= false;
8616 ceph_assert(0 == err
);
8618 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
8619 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
8620 << "we are idempotent." << dendl
;
8624 err
= prepare_command_osd_remove(id
);
8625 // we should not be busy, as we should have made sure this id is not up.
8626 ceph_assert(0 == err
);
// Now that nothing can fail, commit the crush change staged at step 1.
8628 do_osd_crush_remove(newcrush
);
8632 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
8633 const cmdmap_t
& cmdmap
)
8635 op
->mark_osdmon_event(__func__
);
8636 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
8644 cmd_getval(cct
, cmdmap
, "format", format
, string("plain"));
8645 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
8648 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
8652 bool osdid_present
= false;
8653 if (prefix
!= "osd pg-temp" &&
8654 prefix
!= "osd pg-upmap" &&
8655 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
8656 osdid_present
= cmd_getval(cct
, cmdmap
, "id", osdid
);
8658 if (osdid_present
) {
8660 oss
<< "osd." << osdid
;
8661 osd_name
= oss
.str();
8664 // Even if there's a pending state with changes that could affect
8665 // a command, considering that said state isn't yet committed, we
8666 // just don't care about those changes if the command currently being
8667 // handled acts as a no-op against the current committed state.
8668 // In a nutshell, we assume this command happens *before*.
8670 // Let me make this clearer:
8672 // - If we have only one client, and that client issues some
8673 // operation that would conflict with this operation but is
8674 // still on the pending state, then we would be sure that said
8675 // operation wouldn't have returned yet, so the client wouldn't
8676 // issue this operation (unless the client didn't wait for the
8677 // operation to finish, and that would be the client's own fault).
8679 // - If we have more than one client, each client will observe
8680 // whatever is the state at the moment of the commit. So, if we
8681 // have two clients, one issuing an unlink and another issuing a
8682 // link, and if the link happens while the unlink is still on the
8683 // pending state, from the link's point-of-view this is a no-op.
8684 // If different clients are issuing conflicting operations and
8685 // they care about that, then the clients should make sure they
8686 // enforce some kind of concurrency mechanism -- from our
8687 // perspective that's what Douglas Adams would call an SEP.
8689 // This should be used as a general guideline for most commands handled
8690 // in this function. Adapt as you see fit, but please bear in mind that
8691 // this is the expected behavior.
8694 if (prefix
== "osd setcrushmap" ||
8695 (prefix
== "osd crush set" && !osdid_present
)) {
8696 if (pending_inc
.crush
.length()) {
8697 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
8698 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8701 dout(10) << "prepare_command setting new crush map" << dendl
;
8702 bufferlist
data(m
->get_data());
8705 auto bl
= data
.cbegin();
8708 catch (const std::exception
&e
) {
8710 ss
<< "Failed to parse crushmap: " << e
.what();
8714 int64_t prior_version
= 0;
8715 if (cmd_getval(cct
, cmdmap
, "prior_version", prior_version
)) {
8716 if (prior_version
== osdmap
.get_crush_version() - 1) {
8717 // see if we are a resend of the last update. this is imperfect
8718 // (multiple racing updaters may not both get reliable success)
8719 // but we expect crush updaters (via this interface) to be rare-ish.
8720 bufferlist current
, proposed
;
8721 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
8722 crush
.encode(proposed
, mon
->get_quorum_con_features());
8723 if (current
.contents_equal(proposed
)) {
8724 dout(10) << __func__
8725 << " proposed matches current and version equals previous"
8728 ss
<< osdmap
.get_crush_version();
8732 if (prior_version
!= osdmap
.get_crush_version()) {
8734 ss
<< "prior_version " << prior_version
<< " != crush version "
8735 << osdmap
.get_crush_version();
8740 if (crush
.has_legacy_rule_ids()) {
8742 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
8745 if (!validate_crush_against_features(&crush
, ss
)) {
8750 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
8755 if (g_conf()->mon_osd_crush_smoke_test
) {
8756 // sanity check: test some inputs to make sure this map isn't
8758 dout(10) << " testing map" << dendl
;
8760 CrushTester
tester(crush
, ess
);
8761 tester
.set_min_x(0);
8762 tester
.set_max_x(50);
8763 auto start
= ceph::coarse_mono_clock::now();
8764 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
8765 auto duration
= ceph::coarse_mono_clock::now() - start
;
8767 dout(10) << " tester.test_with_fork returns " << r
8768 << ": " << ess
.str() << dendl
;
8769 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
8773 dout(10) << __func__
<< " crush somke test duration: "
8774 << duration
<< ", result: " << ess
.str() << dendl
;
8777 pending_inc
.crush
= data
;
8778 ss
<< osdmap
.get_crush_version() + 1;
8781 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
8782 CrushWrapper newcrush
;
8783 _get_pending_crush(newcrush
);
8784 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
8786 if (newcrush
.bucket_exists(bid
) &&
8787 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
8788 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
8789 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
8792 if (!validate_crush_against_features(&newcrush
, ss
)) {
8796 pending_inc
.crush
.clear();
8797 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8798 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8799 get_last_committed() + 1));
8801 } else if (prefix
== "osd crush set-device-class") {
8802 string device_class
;
8803 if (!cmd_getval(cct
, cmdmap
, "class", device_class
)) {
8804 err
= -EINVAL
; // no value!
8809 vector
<string
> idvec
;
8810 cmd_getval(cct
, cmdmap
, "ids", idvec
);
8811 CrushWrapper newcrush
;
8812 _get_pending_crush(newcrush
);
8814 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8818 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8819 osdmap
.get_all_osds(osds
);
8822 // try traditional single osd way
8823 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8825 // ss has reason for failure
8826 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8833 for (auto &osd
: osds
) {
8834 if (!osdmap
.exists(osd
)) {
8835 ss
<< "osd." << osd
<< " does not exist. ";
8840 oss
<< "osd." << osd
;
8841 string name
= oss
.str();
8843 if (newcrush
.get_max_devices() < osd
+ 1) {
8844 newcrush
.set_max_devices(osd
+ 1);
8847 if (newcrush
.item_exists(osd
)) {
8848 action
= "updating";
8850 action
= "creating";
8851 newcrush
.set_item_name(osd
, name
);
8854 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
8855 << "' device_class '" << device_class
<< "'"
8857 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
8861 if (err
== 0 && !_have_pending_crush()) {
8863 // for single osd only, wildcard makes too much noise
8864 ss
<< "set-device-class item id " << osd
<< " name '" << name
8865 << "' device_class '" << device_class
<< "': no change. ";
8868 updated
.insert(osd
);
8873 if (!updated
.empty()) {
8874 pending_inc
.crush
.clear();
8875 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8876 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
8878 wait_for_finished_proposal(op
,
8879 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
8883 } else if (prefix
== "osd crush rm-device-class") {
8885 vector
<string
> idvec
;
8886 cmd_getval(cct
, cmdmap
, "ids", idvec
);
8887 CrushWrapper newcrush
;
8888 _get_pending_crush(newcrush
);
8891 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8896 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8897 osdmap
.get_all_osds(osds
);
8900 // try traditional single osd way
8901 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8903 // ss has reason for failure
8904 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8911 for (auto &osd
: osds
) {
8912 if (!osdmap
.exists(osd
)) {
8913 ss
<< "osd." << osd
<< " does not exist. ";
8917 auto class_name
= newcrush
.get_item_class(osd
);
8919 ss
<< "osd." << osd
<< " belongs to no class, ";
8922 // note that we do not verify if class_is_in_use here
8923 // in case the device is misclassified and user wants
8924 // to overridely reset...
8926 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
8928 // ss has reason for failure
8931 updated
.insert(osd
);
8935 if (!updated
.empty()) {
8936 pending_inc
.crush
.clear();
8937 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8938 ss
<< "done removing class of osd(s): " << updated
;
8940 wait_for_finished_proposal(op
,
8941 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
8944 } else if (prefix
== "osd crush class create") {
8945 string device_class
;
8946 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
8947 err
= -EINVAL
; // no value!
8950 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8951 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8952 << "luminous' before using crush device classes";
8956 if (!_have_pending_crush() &&
8957 _get_stable_crush().class_exists(device_class
)) {
8958 ss
<< "class '" << device_class
<< "' already exists";
8961 CrushWrapper newcrush
;
8962 _get_pending_crush(newcrush
);
8963 if (newcrush
.class_exists(device_class
)) {
8964 ss
<< "class '" << device_class
<< "' already exists";
8967 int class_id
= newcrush
.get_or_create_class_id(device_class
);
8968 pending_inc
.crush
.clear();
8969 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8970 ss
<< "created class " << device_class
<< " with id " << class_id
8973 } else if (prefix
== "osd crush class rm") {
8974 string device_class
;
8975 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
8976 err
= -EINVAL
; // no value!
8979 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8980 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8981 << "luminous' before using crush device classes";
8986 if (!osdmap
.crush
->class_exists(device_class
)) {
8991 CrushWrapper newcrush
;
8992 _get_pending_crush(newcrush
);
8993 if (!newcrush
.class_exists(device_class
)) {
8994 err
= 0; // make command idempotent
8997 int class_id
= newcrush
.get_class_id(device_class
);
8999 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
9001 ss
<< "class '" << device_class
<< "' " << ts
.str();
9005 // check if class is used by any erasure-code-profiles
9006 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
9007 osdmap
.get_erasure_code_profiles();
9008 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
9009 #ifdef HAVE_STDLIB_MAP_SPLICING
9010 ec_profiles
.merge(old_ec_profiles
);
9012 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
9013 make_move_iterator(end(old_ec_profiles
)));
9015 list
<string
> referenced_by
;
9016 for (auto &i
: ec_profiles
) {
9017 for (auto &j
: i
.second
) {
9018 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
9019 referenced_by
.push_back(i
.first
);
9023 if (!referenced_by
.empty()) {
9025 ss
<< "class '" << device_class
9026 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
9031 newcrush
.get_devices_by_class(device_class
, &osds
);
9032 for (auto& p
: osds
) {
9033 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
9035 // ss has reason for failure
9041 // empty class, remove directly
9042 err
= newcrush
.remove_class_name(device_class
);
9044 ss
<< "class '" << device_class
<< "' cannot be removed '"
9045 << cpp_strerror(err
) << "'";
9050 pending_inc
.crush
.clear();
9051 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9052 ss
<< "removed class " << device_class
<< " with id " << class_id
9053 << " from crush map";
9055 } else if (prefix
== "osd crush class rename") {
9056 string srcname
, dstname
;
9057 if (!cmd_getval(cct
, cmdmap
, "srcname", srcname
)) {
9061 if (!cmd_getval(cct
, cmdmap
, "dstname", dstname
)) {
9066 CrushWrapper newcrush
;
9067 _get_pending_crush(newcrush
);
9068 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
9069 // suppose this is a replay and return success
9070 // so command is idempotent
9071 ss
<< "already renamed to '" << dstname
<< "'";
9076 err
= newcrush
.rename_class(srcname
, dstname
);
9078 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
9079 << cpp_strerror(err
);
9083 pending_inc
.crush
.clear();
9084 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9085 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
9087 } else if (prefix
== "osd crush add-bucket") {
9088 // os crush add-bucket <name> <type>
9089 string name
, typestr
;
9090 vector
<string
> argvec
;
9091 cmd_getval(cct
, cmdmap
, "name", name
);
9092 cmd_getval(cct
, cmdmap
, "type", typestr
);
9093 cmd_getval(cct
, cmdmap
, "args", argvec
);
9094 map
<string
,string
> loc
;
9095 if (!argvec
.empty()) {
9096 CrushWrapper::parse_loc_map(argvec
, &loc
);
9097 dout(0) << "will create and move bucket '" << name
9098 << "' to location " << loc
<< dendl
;
9101 if (!_have_pending_crush() &&
9102 _get_stable_crush().name_exists(name
)) {
9103 ss
<< "bucket '" << name
<< "' already exists";
9107 CrushWrapper newcrush
;
9108 _get_pending_crush(newcrush
);
9110 if (newcrush
.name_exists(name
)) {
9111 ss
<< "bucket '" << name
<< "' already exists";
9114 int type
= newcrush
.get_type_id(typestr
);
9116 ss
<< "type '" << typestr
<< "' does not exist";
9121 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
9126 err
= newcrush
.add_bucket(0, 0,
9127 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
9130 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
9133 err
= newcrush
.set_item_name(bucketno
, name
);
9135 ss
<< "error setting bucket name to '" << name
<< "'";
9140 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
9142 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
9144 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
9148 ss
<< "no need to move item id " << bucketno
<< " name '" << name
9149 << "' to location " << loc
<< " in crush map";
9153 pending_inc
.crush
.clear();
9154 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9156 ss
<< "added bucket " << name
<< " type " << typestr
9159 ss
<< "added bucket " << name
<< " type " << typestr
9160 << " to location " << loc
;
9163 } else if (prefix
== "osd crush rename-bucket") {
9164 string srcname
, dstname
;
9165 cmd_getval(cct
, cmdmap
, "srcname", srcname
);
9166 cmd_getval(cct
, cmdmap
, "dstname", dstname
);
9168 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
9169 if (err
== -EALREADY
) // equivalent to success for idempotency
9175 } else if (prefix
== "osd crush weight-set create" ||
9176 prefix
== "osd crush weight-set create-compat") {
9177 CrushWrapper newcrush
;
9178 _get_pending_crush(newcrush
);
9181 if (newcrush
.has_non_straw2_buckets()) {
9182 ss
<< "crush map contains one or more bucket(s) that are not straw2";
9186 if (prefix
== "osd crush weight-set create") {
9187 if (osdmap
.require_min_compat_client
> 0 &&
9188 osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
9189 ss
<< "require_min_compat_client "
9190 << ceph_release_name(osdmap
.require_min_compat_client
)
9191 << " < luminous, which is required for per-pool weight-sets. "
9192 << "Try 'ceph osd set-require-min-compat-client luminous' "
9193 << "before using the new interface";
9197 string poolname
, mode
;
9198 cmd_getval(cct
, cmdmap
, "pool", poolname
);
9199 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
9201 ss
<< "pool '" << poolname
<< "' not found";
9205 cmd_getval(cct
, cmdmap
, "mode", mode
);
9206 if (mode
!= "flat" && mode
!= "positional") {
9207 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
9211 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
9213 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
9216 if (!newcrush
.create_choose_args(pool
, positions
)) {
9217 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
9218 ss
<< "compat weight-set already created";
9220 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
9221 << "' already created";
9225 pending_inc
.crush
.clear();
9226 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9229 } else if (prefix
== "osd crush weight-set rm" ||
9230 prefix
== "osd crush weight-set rm-compat") {
9231 CrushWrapper newcrush
;
9232 _get_pending_crush(newcrush
);
9234 if (prefix
== "osd crush weight-set rm") {
9236 cmd_getval(cct
, cmdmap
, "pool", poolname
);
9237 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
9239 ss
<< "pool '" << poolname
<< "' not found";
9244 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
9246 newcrush
.rm_choose_args(pool
);
9247 pending_inc
.crush
.clear();
9248 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9251 } else if (prefix
== "osd crush weight-set reweight" ||
9252 prefix
== "osd crush weight-set reweight-compat") {
9253 string poolname
, item
;
9254 vector
<double> weight
;
9255 cmd_getval(cct
, cmdmap
, "pool", poolname
);
9256 cmd_getval(cct
, cmdmap
, "item", item
);
9257 cmd_getval(cct
, cmdmap
, "weight", weight
);
9258 CrushWrapper newcrush
;
9259 _get_pending_crush(newcrush
);
9261 if (prefix
== "osd crush weight-set reweight") {
9262 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
9264 ss
<< "pool '" << poolname
<< "' not found";
9268 if (!newcrush
.have_choose_args(pool
)) {
9269 ss
<< "no weight-set for pool '" << poolname
<< "'";
9273 auto arg_map
= newcrush
.choose_args_get(pool
);
9274 int positions
= newcrush
.get_choose_args_positions(arg_map
);
9275 if (weight
.size() != (size_t)positions
) {
9276 ss
<< "must specify exact " << positions
<< " weight values";
9281 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
9282 if (!newcrush
.have_choose_args(pool
)) {
9283 ss
<< "no backward-compatible weight-set";
9288 if (!newcrush
.name_exists(item
)) {
9289 ss
<< "item '" << item
<< "' does not exist";
9293 err
= newcrush
.choose_args_adjust_item_weightf(
9295 newcrush
.choose_args_get(pool
),
9296 newcrush
.get_item_id(item
),
9303 pending_inc
.crush
.clear();
9304 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9306 } else if (osdid_present
&&
9307 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
9308 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
9309 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
9310 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
9312 if (!osdmap
.exists(osdid
)) {
9315 << " does not exist. Create it before updating the crush map";
9320 if (!cmd_getval(cct
, cmdmap
, "weight", weight
)) {
9321 ss
<< "unable to parse weight value '"
9322 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9328 vector
<string
> argvec
;
9329 cmd_getval(cct
, cmdmap
, "args", argvec
);
9330 map
<string
,string
> loc
;
9331 CrushWrapper::parse_loc_map(argvec
, &loc
);
9333 if (prefix
== "osd crush set"
9334 && !_get_stable_crush().item_exists(osdid
)) {
9336 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
9337 << "' weight " << weight
<< " at location " << loc
9338 << ": does not exist";
9342 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
9343 << osd_name
<< "' weight " << weight
<< " at location "
9345 CrushWrapper newcrush
;
9346 _get_pending_crush(newcrush
);
9349 if (prefix
== "osd crush set" ||
9350 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
9352 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
9355 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
9363 if (err
== 0 && !_have_pending_crush()) {
9364 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
9365 << "' weight " << weight
<< " at location " << loc
<< ": no change";
9369 pending_inc
.crush
.clear();
9370 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9371 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
9372 << weight
<< " at location " << loc
<< " to crush map";
9374 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9375 get_last_committed() + 1));
9378 } else if (prefix
== "osd crush create-or-move") {
9380 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
9381 if (!osdmap
.exists(osdid
)) {
9384 << " does not exist. create it before updating the crush map";
9389 if (!cmd_getval(cct
, cmdmap
, "weight", weight
)) {
9390 ss
<< "unable to parse weight value '"
9391 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9397 vector
<string
> argvec
;
9398 cmd_getval(cct
, cmdmap
, "args", argvec
);
9399 map
<string
,string
> loc
;
9400 CrushWrapper::parse_loc_map(argvec
, &loc
);
9402 dout(0) << "create-or-move crush item name '" << osd_name
9403 << "' initial_weight " << weight
<< " at location " << loc
9406 CrushWrapper newcrush
;
9407 _get_pending_crush(newcrush
);
9409 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
9410 g_conf()->osd_crush_update_weight_set
);
9412 ss
<< "create-or-move updated item name '" << osd_name
9413 << "' weight " << weight
9414 << " at location " << loc
<< " to crush map";
9418 pending_inc
.crush
.clear();
9419 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9420 ss
<< "create-or-move updating item name '" << osd_name
9421 << "' weight " << weight
9422 << " at location " << loc
<< " to crush map";
9424 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9425 get_last_committed() + 1));
9430 } else if (prefix
== "osd crush move") {
9432 // osd crush move <name> <loc1> [<loc2> ...]
9434 vector
<string
> argvec
;
9435 cmd_getval(cct
, cmdmap
, "name", name
);
9436 cmd_getval(cct
, cmdmap
, "args", argvec
);
9437 map
<string
,string
> loc
;
9438 CrushWrapper::parse_loc_map(argvec
, &loc
);
9440 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
9441 CrushWrapper newcrush
;
9442 _get_pending_crush(newcrush
);
9444 if (!newcrush
.name_exists(name
)) {
9446 ss
<< "item " << name
<< " does not exist";
9449 int id
= newcrush
.get_item_id(name
);
9451 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
9453 err
= newcrush
.create_or_move_item(
9454 cct
, id
, 0, name
, loc
,
9455 g_conf()->osd_crush_update_weight_set
);
9457 err
= newcrush
.move_bucket(cct
, id
, loc
);
9460 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
9461 pending_inc
.crush
.clear();
9462 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9464 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9465 get_last_committed() + 1));
9469 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
9473 } else if (prefix
== "osd crush swap-bucket") {
9474 string source
, dest
;
9475 cmd_getval(cct
, cmdmap
, "source", source
);
9476 cmd_getval(cct
, cmdmap
, "dest", dest
);
9479 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
9481 CrushWrapper newcrush
;
9482 _get_pending_crush(newcrush
);
9483 if (!newcrush
.name_exists(source
)) {
9484 ss
<< "source item " << source
<< " does not exist";
9488 if (!newcrush
.name_exists(dest
)) {
9489 ss
<< "dest item " << dest
<< " does not exist";
9493 int sid
= newcrush
.get_item_id(source
);
9494 int did
= newcrush
.get_item_id(dest
);
9496 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
9497 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
9501 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
9503 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
9504 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
9505 << "; pass --yes-i-really-mean-it to proceed anyway";
9509 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
9511 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
9515 ss
<< "swapped bucket of " << source
<< " to " << dest
;
9516 pending_inc
.crush
.clear();
9517 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9518 wait_for_finished_proposal(op
,
9519 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
9520 get_last_committed() + 1));
9522 } else if (prefix
== "osd crush link") {
9523 // osd crush link <name> <loc1> [<loc2> ...]
9525 cmd_getval(cct
, cmdmap
, "name", name
);
9526 vector
<string
> argvec
;
9527 cmd_getval(cct
, cmdmap
, "args", argvec
);
9528 map
<string
,string
> loc
;
9529 CrushWrapper::parse_loc_map(argvec
, &loc
);
9531 // Need an explicit check for name_exists because get_item_id returns
9533 int id
= osdmap
.crush
->get_item_id(name
);
9534 if (!osdmap
.crush
->name_exists(name
)) {
9536 ss
<< "item " << name
<< " does not exist";
9539 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
9541 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
9542 ss
<< "no need to move item id " << id
<< " name '" << name
9543 << "' to location " << loc
<< " in crush map";
9548 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
9549 CrushWrapper newcrush
;
9550 _get_pending_crush(newcrush
);
9552 if (!newcrush
.name_exists(name
)) {
9554 ss
<< "item " << name
<< " does not exist";
9557 int id
= newcrush
.get_item_id(name
);
9558 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
9559 err
= newcrush
.link_bucket(cct
, id
, loc
);
9561 ss
<< "linked item id " << id
<< " name '" << name
9562 << "' to location " << loc
<< " in crush map";
9563 pending_inc
.crush
.clear();
9564 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9566 ss
<< "cannot link item id " << id
<< " name '" << name
9567 << "' to location " << loc
;
9571 ss
<< "no need to move item id " << id
<< " name '" << name
9572 << "' to location " << loc
<< " in crush map";
9576 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
9577 get_last_committed() + 1));
9579 } else if (prefix
== "osd crush rm" ||
9580 prefix
== "osd crush remove" ||
9581 prefix
== "osd crush unlink") {
9583 // osd crush rm <id> [ancestor]
9584 CrushWrapper newcrush
;
9585 _get_pending_crush(newcrush
);
9588 cmd_getval(cct
, cmdmap
, "name", name
);
9590 if (!osdmap
.crush
->name_exists(name
)) {
9592 ss
<< "device '" << name
<< "' does not appear in the crush map";
9595 if (!newcrush
.name_exists(name
)) {
9597 ss
<< "device '" << name
<< "' does not appear in the crush map";
9599 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9600 get_last_committed() + 1));
9603 int id
= newcrush
.get_item_id(name
);
9606 bool unlink_only
= prefix
== "osd crush unlink";
9607 string ancestor_str
;
9608 if (cmd_getval(cct
, cmdmap
, "ancestor", ancestor_str
)) {
9609 if (!newcrush
.name_exists(ancestor_str
)) {
9611 ss
<< "ancestor item '" << ancestor_str
9612 << "' does not appear in the crush map";
9615 ancestor
= newcrush
.get_item_id(ancestor_str
);
9618 err
= prepare_command_osd_crush_remove(
9621 (ancestor
< 0), unlink_only
);
9623 if (err
== -ENOENT
) {
9624 ss
<< "item " << id
<< " does not appear in that position";
9630 pending_inc
.new_crush_node_flags
[id
] = 0;
9631 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
9633 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9634 get_last_committed() + 1));
9639 } else if (prefix
== "osd crush reweight-all") {
9640 CrushWrapper newcrush
;
9641 _get_pending_crush(newcrush
);
9643 newcrush
.reweight(cct
);
9644 pending_inc
.crush
.clear();
9645 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9646 ss
<< "reweighted crush hierarchy";
9648 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9649 get_last_committed() + 1));
9651 } else if (prefix
== "osd crush reweight") {
9652 // osd crush reweight <name> <weight>
9653 CrushWrapper newcrush
;
9654 _get_pending_crush(newcrush
);
9657 cmd_getval(cct
, cmdmap
, "name", name
);
9658 if (!newcrush
.name_exists(name
)) {
9660 ss
<< "device '" << name
<< "' does not appear in the crush map";
9664 int id
= newcrush
.get_item_id(name
);
9666 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
9671 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
9672 ss
<< "unable to parse weight value '"
9673 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9678 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
9679 g_conf()->osd_crush_update_weight_set
);
9682 pending_inc
.crush
.clear();
9683 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9684 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
9687 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9688 get_last_committed() + 1));
9690 } else if (prefix
== "osd crush reweight-subtree") {
9691 // osd crush reweight <name> <weight>
9692 CrushWrapper newcrush
;
9693 _get_pending_crush(newcrush
);
9696 cmd_getval(cct
, cmdmap
, "name", name
);
9697 if (!newcrush
.name_exists(name
)) {
9699 ss
<< "device '" << name
<< "' does not appear in the crush map";
9703 int id
= newcrush
.get_item_id(name
);
9705 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
9710 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
9711 ss
<< "unable to parse weight value '"
9712 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9717 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
9718 g_conf()->osd_crush_update_weight_set
);
9721 pending_inc
.crush
.clear();
9722 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9723 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
9726 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9727 get_last_committed() + 1));
9729 } else if (prefix
== "osd crush tunables") {
9730 CrushWrapper newcrush
;
9731 _get_pending_crush(newcrush
);
9735 cmd_getval(cct
, cmdmap
, "profile", profile
);
9736 if (profile
== "legacy" || profile
== "argonaut") {
9737 newcrush
.set_tunables_legacy();
9738 } else if (profile
== "bobtail") {
9739 newcrush
.set_tunables_bobtail();
9740 } else if (profile
== "firefly") {
9741 newcrush
.set_tunables_firefly();
9742 } else if (profile
== "hammer") {
9743 newcrush
.set_tunables_hammer();
9744 } else if (profile
== "jewel") {
9745 newcrush
.set_tunables_jewel();
9746 } else if (profile
== "optimal") {
9747 newcrush
.set_tunables_optimal();
9748 } else if (profile
== "default") {
9749 newcrush
.set_tunables_default();
9751 ss
<< "unrecognized profile '" << profile
<< "'";
9756 if (!validate_crush_against_features(&newcrush
, ss
)) {
9761 pending_inc
.crush
.clear();
9762 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9763 ss
<< "adjusted tunables profile to " << profile
;
9765 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9766 get_last_committed() + 1));
9768 } else if (prefix
== "osd crush set-tunable") {
9769 CrushWrapper newcrush
;
9770 _get_pending_crush(newcrush
);
9774 cmd_getval(cct
, cmdmap
, "tunable", tunable
);
9777 if (!cmd_getval(cct
, cmdmap
, "value", value
)) {
9779 ss
<< "failed to parse integer value "
9780 << cmd_vartype_stringify(cmdmap
.at("value"));
9784 if (tunable
== "straw_calc_version") {
9785 if (value
!= 0 && value
!= 1) {
9786 ss
<< "value must be 0 or 1; got " << value
;
9790 newcrush
.set_straw_calc_version(value
);
9792 ss
<< "unrecognized tunable '" << tunable
<< "'";
9797 if (!validate_crush_against_features(&newcrush
, ss
)) {
9802 pending_inc
.crush
.clear();
9803 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9804 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
9806 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9807 get_last_committed() + 1));
9810 } else if (prefix
== "osd crush rule create-simple") {
9811 string name
, root
, type
, mode
;
9812 cmd_getval(cct
, cmdmap
, "name", name
);
9813 cmd_getval(cct
, cmdmap
, "root", root
);
9814 cmd_getval(cct
, cmdmap
, "type", type
);
9815 cmd_getval(cct
, cmdmap
, "mode", mode
);
9819 if (osdmap
.crush
->rule_exists(name
)) {
9820 // The name is uniquely associated to a ruleid and the rule it contains
9821 // From the user point of view, the rule is more meaningfull.
9822 ss
<< "rule " << name
<< " already exists";
9827 CrushWrapper newcrush
;
9828 _get_pending_crush(newcrush
);
9830 if (newcrush
.rule_exists(name
)) {
9831 // The name is uniquely associated to a ruleid and the rule it contains
9832 // From the user point of view, the rule is more meaningfull.
9833 ss
<< "rule " << name
<< " already exists";
9836 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
9837 pg_pool_t::TYPE_REPLICATED
, &ss
);
9843 pending_inc
.crush
.clear();
9844 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9847 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9848 get_last_committed() + 1));
9851 } else if (prefix
== "osd crush rule create-replicated") {
9852 string name
, root
, type
, device_class
;
9853 cmd_getval(cct
, cmdmap
, "name", name
);
9854 cmd_getval(cct
, cmdmap
, "root", root
);
9855 cmd_getval(cct
, cmdmap
, "type", type
);
9856 cmd_getval(cct
, cmdmap
, "class", device_class
);
9858 if (osdmap
.crush
->rule_exists(name
)) {
9859 // The name is uniquely associated to a ruleid and the rule it contains
9860 // From the user point of view, the rule is more meaningfull.
9861 ss
<< "rule " << name
<< " already exists";
9866 CrushWrapper newcrush
;
9867 _get_pending_crush(newcrush
);
9869 if (newcrush
.rule_exists(name
)) {
9870 // The name is uniquely associated to a ruleid and the rule it contains
9871 // From the user point of view, the rule is more meaningfull.
9872 ss
<< "rule " << name
<< " already exists";
9875 int ruleno
= newcrush
.add_simple_rule(
9876 name
, root
, type
, device_class
,
9877 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
9883 pending_inc
.crush
.clear();
9884 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9887 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9888 get_last_committed() + 1));
9891 } else if (prefix
== "osd erasure-code-profile rm") {
9893 cmd_getval(cct
, cmdmap
, "name", name
);
9895 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
9898 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
9903 if (osdmap
.has_erasure_code_profile(name
) ||
9904 pending_inc
.new_erasure_code_profiles
.count(name
)) {
9905 if (osdmap
.has_erasure_code_profile(name
)) {
9906 pending_inc
.old_erasure_code_profiles
.push_back(name
);
9908 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
9909 pending_inc
.new_erasure_code_profiles
.erase(name
);
9913 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9914 get_last_committed() + 1));
9917 ss
<< "erasure-code-profile " << name
<< " does not exist";
9922 } else if (prefix
== "osd erasure-code-profile set") {
9924 cmd_getval(cct
, cmdmap
, "name", name
);
9925 vector
<string
> profile
;
9926 cmd_getval(cct
, cmdmap
, "profile", profile
);
9929 cmd_getval(cct
, cmdmap
, "force", force
);
9931 map
<string
,string
> profile_map
;
9932 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
9935 if (profile_map
.find("plugin") == profile_map
.end()) {
9936 ss
<< "erasure-code-profile " << profile_map
9937 << " must contain a plugin entry" << std::endl
;
9941 string plugin
= profile_map
["plugin"];
9943 if (pending_inc
.has_erasure_code_profile(name
)) {
9944 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
9947 err
= normalize_profile(name
, profile_map
, force
, &ss
);
9951 if (osdmap
.has_erasure_code_profile(name
)) {
9952 ErasureCodeProfile existing_profile_map
=
9953 osdmap
.get_erasure_code_profile(name
);
9954 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
9958 if (existing_profile_map
== profile_map
) {
9964 ss
<< "will not override erasure code profile " << name
9965 << " because the existing profile "
9966 << existing_profile_map
9967 << " is different from the proposed profile "
9973 dout(20) << "erasure code profile set " << name
<< "="
9974 << profile_map
<< dendl
;
9975 pending_inc
.set_erasure_code_profile(name
, profile_map
);
9979 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9980 get_last_committed() + 1));
9983 } else if (prefix
== "osd crush rule create-erasure") {
9984 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
9989 string name
, poolstr
;
9990 cmd_getval(cct
, cmdmap
, "name", name
);
9992 cmd_getval(cct
, cmdmap
, "profile", profile
);
9994 profile
= "default";
9995 if (profile
== "default") {
9996 if (!osdmap
.has_erasure_code_profile(profile
)) {
9997 if (pending_inc
.has_erasure_code_profile(profile
)) {
9998 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
10002 map
<string
,string
> profile_map
;
10003 err
= osdmap
.get_erasure_code_profile_default(cct
,
10008 err
= normalize_profile(name
, profile_map
, true, &ss
);
10011 dout(20) << "erasure code profile set " << profile
<< "="
10012 << profile_map
<< dendl
;
10013 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
10019 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
10022 case -EEXIST
: // return immediately
10023 ss
<< "rule " << name
<< " already exists";
10027 case -EALREADY
: // wait for pending to be proposed
10028 ss
<< "rule " << name
<< " already exists";
10031 default: // non recoverable error
10036 ss
<< "created rule " << name
<< " at " << rule
;
10040 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10041 get_last_committed() + 1));
10044 } else if (prefix
== "osd crush rule rm") {
10046 cmd_getval(cct
, cmdmap
, "name", name
);
10048 if (!osdmap
.crush
->rule_exists(name
)) {
10049 ss
<< "rule " << name
<< " does not exist";
10054 CrushWrapper newcrush
;
10055 _get_pending_crush(newcrush
);
10057 if (!newcrush
.rule_exists(name
)) {
10058 ss
<< "rule " << name
<< " does not exist";
10061 int ruleno
= newcrush
.get_rule_id(name
);
10062 ceph_assert(ruleno
>= 0);
10064 // make sure it is not in use.
10065 // FIXME: this is ok in some situations, but let's not bother with that
10067 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
10068 if (osdmap
.crush_rule_in_use(ruleset
)) {
10069 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
10074 err
= newcrush
.remove_rule(ruleno
);
10079 pending_inc
.crush
.clear();
10080 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10083 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10084 get_last_committed() + 1));
10087 } else if (prefix
== "osd crush rule rename") {
10090 cmd_getval(cct
, cmdmap
, "srcname", srcname
);
10091 cmd_getval(cct
, cmdmap
, "dstname", dstname
);
10092 if (srcname
.empty() || dstname
.empty()) {
10093 ss
<< "must specify both source rule name and destination rule name";
10097 if (srcname
== dstname
) {
10098 ss
<< "destination rule name is equal to source rule name";
10103 CrushWrapper newcrush
;
10104 _get_pending_crush(newcrush
);
10105 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
10106 // srcname does not exist and dstname already exists
10107 // suppose this is a replay and return success
10108 // (so this command is idempotent)
10109 ss
<< "already renamed to '" << dstname
<< "'";
10114 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
10116 // ss has reason for failure
10119 pending_inc
.crush
.clear();
10120 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10122 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10123 get_last_committed() + 1));
10126 } else if (prefix
== "osd setmaxosd") {
10128 if (!cmd_getval(cct
, cmdmap
, "newmax", newmax
)) {
10129 ss
<< "unable to parse 'newmax' value '"
10130 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
10135 if (newmax
> g_conf()->mon_max_osd
) {
10137 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
10138 << g_conf()->mon_max_osd
<< ")";
10142 // Don't allow shrinking OSD number as this will cause data loss
10143 // and may cause kernel crashes.
10144 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10145 if (newmax
< osdmap
.get_max_osd()) {
10146 // Check if the OSDs exist between current max and new value.
10147 // If there are any OSDs exist, then don't allow shrinking number
10149 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
10150 if (osdmap
.exists(i
)) {
10152 ss
<< "cannot shrink max_osd to " << newmax
10153 << " because osd." << i
<< " (and possibly others) still in use";
10159 pending_inc
.new_max_osd
= newmax
;
10160 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
10162 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10163 get_last_committed() + 1));
10166 } else if (prefix
== "osd set-full-ratio" ||
10167 prefix
== "osd set-backfillfull-ratio" ||
10168 prefix
== "osd set-nearfull-ratio") {
10170 if (!cmd_getval(cct
, cmdmap
, "ratio", n
)) {
10171 ss
<< "unable to parse 'ratio' value '"
10172 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
10176 if (prefix
== "osd set-full-ratio")
10177 pending_inc
.new_full_ratio
= n
;
10178 else if (prefix
== "osd set-backfillfull-ratio")
10179 pending_inc
.new_backfillfull_ratio
= n
;
10180 else if (prefix
== "osd set-nearfull-ratio")
10181 pending_inc
.new_nearfull_ratio
= n
;
10182 ss
<< prefix
<< " " << n
;
10184 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10185 get_last_committed() + 1));
10187 } else if (prefix
== "osd set-require-min-compat-client") {
10189 cmd_getval(cct
, cmdmap
, "version", v
);
10190 int vno
= ceph_release_from_name(v
.c_str());
10192 ss
<< "version " << v
<< " is not recognized";
10197 newmap
.deepish_copy_from(osdmap
);
10198 newmap
.apply_incremental(pending_inc
);
10199 newmap
.require_min_compat_client
= vno
;
10200 auto mvno
= newmap
.get_min_compat_client();
10202 ss
<< "osdmap current utilizes features that require "
10203 << ceph_release_name(mvno
)
10204 << "; cannot set require_min_compat_client below that to "
10205 << ceph_release_name(vno
);
10210 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
10213 mon
->get_combined_feature_map(&m
);
10214 uint64_t features
= ceph_release_features(vno
);
10218 CEPH_ENTITY_TYPE_CLIENT
,
10219 CEPH_ENTITY_TYPE_MDS
,
10220 CEPH_ENTITY_TYPE_MGR
}) {
10221 auto p
= m
.m
.find(type
);
10222 if (p
== m
.m
.end()) {
10225 for (auto& q
: p
->second
) {
10226 uint64_t missing
= ~q
.first
& features
;
10229 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
10234 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
10235 << "(s) look like " << ceph_release_name(
10236 ceph_release_from_features(q
.first
))
10237 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
10243 ss
<< "; add --yes-i-really-mean-it to do it anyway";
10248 ss
<< "set require_min_compat_client to " << ceph_release_name(vno
);
10249 pending_inc
.new_require_min_compat_client
= vno
;
10251 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10252 get_last_committed() + 1));
10254 } else if (prefix
== "osd pause") {
10255 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10257 } else if (prefix
== "osd unpause") {
10258 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10260 } else if (prefix
== "osd set") {
10262 cmd_getval(g_ceph_context
, cmdmap
, "yes_i_really_mean_it", sure
);
10265 cmd_getval(cct
, cmdmap
, "key", key
);
10267 return prepare_set_flag(op
, CEPH_OSDMAP_FULL
);
10268 else if (key
== "pause")
10269 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10270 else if (key
== "noup")
10271 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
10272 else if (key
== "nodown")
10273 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
10274 else if (key
== "noout")
10275 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
10276 else if (key
== "noin")
10277 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
10278 else if (key
== "nobackfill")
10279 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
10280 else if (key
== "norebalance")
10281 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
10282 else if (key
== "norecover")
10283 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
10284 else if (key
== "noscrub")
10285 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
10286 else if (key
== "nodeep-scrub")
10287 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
10288 else if (key
== "notieragent")
10289 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
10290 else if (key
== "nosnaptrim")
10291 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
10292 else if (key
== "pglog_hardlimit") {
10293 if (!osdmap
.get_num_up_osds() && !sure
) {
10294 ss
<< "Not advisable to continue since no OSDs are up. Pass "
10295 << "--yes-i-really-mean-it if you really wish to continue.";
10299 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
10300 // we are reusing a jewel feature bit that was retired in luminous.
10301 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
10302 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
10304 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
10306 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
10311 ss
<< "unrecognized flag '" << key
<< "'";
10315 } else if (prefix
== "osd unset") {
10317 cmd_getval(cct
, cmdmap
, "key", key
);
10319 return prepare_unset_flag(op
, CEPH_OSDMAP_FULL
);
10320 else if (key
== "pause")
10321 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10322 else if (key
== "noup")
10323 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
10324 else if (key
== "nodown")
10325 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
10326 else if (key
== "noout")
10327 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
10328 else if (key
== "noin")
10329 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
10330 else if (key
== "nobackfill")
10331 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
10332 else if (key
== "norebalance")
10333 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
10334 else if (key
== "norecover")
10335 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
10336 else if (key
== "noscrub")
10337 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
10338 else if (key
== "nodeep-scrub")
10339 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
10340 else if (key
== "notieragent")
10341 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
10342 else if (key
== "nosnaptrim")
10343 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
10345 ss
<< "unrecognized flag '" << key
<< "'";
10349 } else if (prefix
== "osd require-osd-release") {
10351 cmd_getval(cct
, cmdmap
, "release", release
);
10353 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
10354 int rel
= ceph_release_from_name(release
.c_str());
10356 ss
<< "unrecognized release " << release
;
10360 if (rel
== osdmap
.require_osd_release
) {
10365 ceph_assert(osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
10366 if (!osdmap
.get_num_up_osds() && !sure
) {
10367 ss
<< "Not advisable to continue since no OSDs are up. Pass "
10368 << "--yes-i-really-mean-it if you really wish to continue.";
10372 if (rel
== CEPH_RELEASE_MIMIC
) {
10373 if (!mon
->monmap
->get_required_features().contains_all(
10374 ceph::features::mon::FEATURE_MIMIC
)) {
10375 ss
<< "not all mons are mimic";
10379 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
10381 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
10385 } else if (rel
== CEPH_RELEASE_NAUTILUS
) {
10386 if (!mon
->monmap
->get_required_features().contains_all(
10387 ceph::features::mon::FEATURE_NAUTILUS
)) {
10388 ss
<< "not all mons are nautilus";
10392 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
10394 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
10399 ss
<< "not supported for this release yet";
10403 if (rel
< osdmap
.require_osd_release
) {
10404 ss
<< "require_osd_release cannot be lowered once it has been set";
10408 pending_inc
.new_require_osd_release
= rel
;
10410 } else if (prefix
== "osd down" ||
10411 prefix
== "osd out" ||
10412 prefix
== "osd in" ||
10413 prefix
== "osd rm") {
10417 bool verbose
= true;
10419 vector
<string
> idvec
;
10420 cmd_getval(cct
, cmdmap
, "ids", idvec
);
10421 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10426 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10427 if (prefix
== "osd in") {
10428 // touch out osds only
10429 osdmap
.get_out_existing_osds(osds
);
10431 osdmap
.get_all_osds(osds
);
10434 verbose
= false; // so the output is less noisy.
10436 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10438 ss
<< "invalid osd id" << osd
;
10441 } else if (!osdmap
.exists(osd
)) {
10442 ss
<< "osd." << osd
<< " does not exist. ";
10449 for (auto &osd
: osds
) {
10450 if (prefix
== "osd down") {
10451 if (osdmap
.is_down(osd
)) {
10453 ss
<< "osd." << osd
<< " is already down. ";
10455 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
10456 ss
<< "marked down osd." << osd
<< ". ";
10459 } else if (prefix
== "osd out") {
10460 if (osdmap
.is_out(osd
)) {
10462 ss
<< "osd." << osd
<< " is already out. ";
10464 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
10465 if (osdmap
.osd_weight
[osd
]) {
10466 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
10467 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
10469 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
10471 ss
<< "marked out osd." << osd
<< ". ";
10472 std::ostringstream msg
;
10473 msg
<< "Client " << op
->get_session()->entity_name
10474 << " marked osd." << osd
<< " out";
10475 if (osdmap
.is_up(osd
)) {
10476 msg
<< ", while it was still marked up";
10478 auto period
= ceph_clock_now() - down_pending_out
[osd
];
10479 msg
<< ", after it was down for " << int(period
.sec())
10483 mon
->clog
->info() << msg
.str();
10486 } else if (prefix
== "osd in") {
10487 if (osdmap
.is_in(osd
)) {
10489 ss
<< "osd." << osd
<< " is already in. ";
10491 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
10492 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
10493 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
10494 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
10496 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
10498 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
10500 ss
<< "marked in osd." << osd
<< ". ";
10503 } else if (prefix
== "osd rm") {
10504 err
= prepare_command_osd_remove(osd
);
10506 if (err
== -EBUSY
) {
10509 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
10511 ceph_assert(err
== 0);
10513 ss
<< ", osd." << osd
;
10515 ss
<< "removed osd." << osd
;
10524 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
10525 get_last_committed() + 1));
10528 } else if (prefix
== "osd set-group" ||
10529 prefix
== "osd unset-group" ||
10530 prefix
== "osd add-noup" ||
10531 prefix
== "osd add-nodown" ||
10532 prefix
== "osd add-noin" ||
10533 prefix
== "osd add-noout" ||
10534 prefix
== "osd rm-noup" ||
10535 prefix
== "osd rm-nodown" ||
10536 prefix
== "osd rm-noin" ||
10537 prefix
== "osd rm-noout") {
10538 bool do_set
= prefix
== "osd set-group" ||
10539 prefix
.find("add") != string::npos
;
10541 unsigned flags
= 0;
10542 vector
<string
> who
;
10543 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
10544 cmd_getval(cct
, cmdmap
, "flags", flag_str
);
10545 cmd_getval(cct
, cmdmap
, "who", who
);
10546 vector
<string
> raw_flags
;
10547 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
10548 for (auto& f
: raw_flags
) {
10550 flags
|= CEPH_OSD_NOUP
;
10551 else if (f
== "nodown")
10552 flags
|= CEPH_OSD_NODOWN
;
10553 else if (f
== "noin")
10554 flags
|= CEPH_OSD_NOIN
;
10555 else if (f
== "noout")
10556 flags
|= CEPH_OSD_NOOUT
;
10558 ss
<< "unrecognized flag '" << f
<< "', must be one of "
10559 << "{noup,nodown,noin,noout}";
10565 cmd_getval(cct
, cmdmap
, "ids", who
);
10566 if (prefix
.find("noup") != string::npos
)
10567 flags
= CEPH_OSD_NOUP
;
10568 else if (prefix
.find("nodown") != string::npos
)
10569 flags
= CEPH_OSD_NODOWN
;
10570 else if (prefix
.find("noin") != string::npos
)
10571 flags
= CEPH_OSD_NOIN
;
10572 else if (prefix
.find("noout") != string::npos
)
10573 flags
= CEPH_OSD_NOOUT
;
10575 ceph_assert(0 == "Unreachable!");
10578 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
10583 ss
<< "must specify at least one or more targets to set/unset";
10588 set
<int> crush_nodes
;
10589 set
<int> device_classes
;
10590 for (auto& w
: who
) {
10591 if (w
== "any" || w
== "all" || w
== "*") {
10592 osdmap
.get_all_osds(osds
);
10595 std::stringstream ts
;
10596 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
10598 } else if (osdmap
.crush
->name_exists(w
)) {
10599 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
10600 } else if (osdmap
.crush
->class_exists(w
)) {
10601 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
10603 ss
<< "unable to parse osd id or crush node or device class: "
10604 << "\"" << w
<< "\". ";
10607 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
10608 // ss has reason for failure
10613 for (auto osd
: osds
) {
10614 if (!osdmap
.exists(osd
)) {
10615 ss
<< "osd." << osd
<< " does not exist. ";
10619 if (flags
& CEPH_OSD_NOUP
) {
10620 any
|= osdmap
.is_noup_by_osd(osd
) ?
10621 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
10622 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
10624 if (flags
& CEPH_OSD_NODOWN
) {
10625 any
|= osdmap
.is_nodown_by_osd(osd
) ?
10626 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
10627 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
10629 if (flags
& CEPH_OSD_NOIN
) {
10630 any
|= osdmap
.is_noin_by_osd(osd
) ?
10631 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
10632 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
10634 if (flags
& CEPH_OSD_NOOUT
) {
10635 any
|= osdmap
.is_noout_by_osd(osd
) ?
10636 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
10637 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
10640 if (flags
& CEPH_OSD_NOUP
) {
10641 any
|= osdmap
.is_noup_by_osd(osd
) ?
10642 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
10643 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
10645 if (flags
& CEPH_OSD_NODOWN
) {
10646 any
|= osdmap
.is_nodown_by_osd(osd
) ?
10647 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
10648 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
10650 if (flags
& CEPH_OSD_NOIN
) {
10651 any
|= osdmap
.is_noin_by_osd(osd
) ?
10652 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
10653 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
10655 if (flags
& CEPH_OSD_NOOUT
) {
10656 any
|= osdmap
.is_noout_by_osd(osd
) ?
10657 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
10658 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
10662 for (auto& id
: crush_nodes
) {
10663 auto old_flags
= osdmap
.get_crush_node_flags(id
);
10664 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
10665 pending_flags
|= old_flags
; // adopt existing flags first!
10667 pending_flags
|= flags
;
10669 pending_flags
&= ~flags
;
10673 for (auto& id
: device_classes
) {
10674 auto old_flags
= osdmap
.get_device_class_flags(id
);
10675 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
10676 pending_flags
|= old_flags
;
10678 pending_flags
|= flags
;
10680 pending_flags
&= ~flags
;
10686 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
10687 get_last_committed() + 1));
10690 } else if (prefix
== "osd pg-temp") {
10692 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
10693 ss
<< "unable to parse 'pgid' value '"
10694 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
10699 if (!pgid
.parse(pgidstr
.c_str())) {
10700 ss
<< "invalid pgid '" << pgidstr
<< "'";
10704 if (!osdmap
.pg_exists(pgid
)) {
10705 ss
<< "pg " << pgid
<< " does not exist";
10709 if (pending_inc
.new_pg_temp
.count(pgid
)) {
10710 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
10711 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10715 vector
<int64_t> id_vec
;
10716 vector
<int32_t> new_pg_temp
;
10717 cmd_getval(cct
, cmdmap
, "id", id_vec
);
10718 if (id_vec
.empty()) {
10719 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
10720 ss
<< "done cleaning up pg_temp of " << pgid
;
10723 for (auto osd
: id_vec
) {
10724 if (!osdmap
.exists(osd
)) {
10725 ss
<< "osd." << osd
<< " does not exist";
10729 new_pg_temp
.push_back(osd
);
10732 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
10733 if ((int)new_pg_temp
.size() < pool_min_size
) {
10734 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
10735 << pool_min_size
<< ")";
10740 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10741 if ((int)new_pg_temp
.size() > pool_size
) {
10742 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
10743 << pool_size
<< ")";
10748 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
10749 new_pg_temp
.begin(), new_pg_temp
.end());
10750 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
10752 } else if (prefix
== "osd primary-temp") {
10754 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
10755 ss
<< "unable to parse 'pgid' value '"
10756 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
10761 if (!pgid
.parse(pgidstr
.c_str())) {
10762 ss
<< "invalid pgid '" << pgidstr
<< "'";
10766 if (!osdmap
.pg_exists(pgid
)) {
10767 ss
<< "pg " << pgid
<< " does not exist";
10773 if (!cmd_getval(cct
, cmdmap
, "id", osd
)) {
10774 ss
<< "unable to parse 'id' value '"
10775 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
10779 if (osd
!= -1 && !osdmap
.exists(osd
)) {
10780 ss
<< "osd." << osd
<< " does not exist";
10785 if (osdmap
.require_min_compat_client
> 0 &&
10786 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
10787 ss
<< "require_min_compat_client "
10788 << ceph_release_name(osdmap
.require_min_compat_client
)
10789 << " < firefly, which is required for primary-temp";
10794 pending_inc
.new_primary_temp
[pgid
] = osd
;
10795 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
10797 } else if (prefix
== "pg repeer") {
10800 cmd_getval(cct
, cmdmap
, "pgid", pgidstr
);
10801 if (!pgid
.parse(pgidstr
.c_str())) {
10802 ss
<< "invalid pgid '" << pgidstr
<< "'";
10806 if (!osdmap
.pg_exists(pgid
)) {
10807 ss
<< "pg '" << pgidstr
<< "' does not exist";
10811 vector
<int> acting
;
10813 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
10816 ss
<< "pg currently has no primary";
10819 if (acting
.size() > 1) {
10820 // map to just primary; it will map back to what it wants
10821 pending_inc
.new_pg_temp
[pgid
] = { primary
};
10823 // hmm, pick another arbitrary osd to induce a change. Note
10824 // that this won't work if there is only one suitable OSD in the cluster.
10827 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
10828 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
10831 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
10837 ss
<< "not enough up OSDs in the cluster to force repeer";
10842 } else if (prefix
== "osd pg-upmap" ||
10843 prefix
== "osd rm-pg-upmap" ||
10844 prefix
== "osd pg-upmap-items" ||
10845 prefix
== "osd rm-pg-upmap-items") {
10846 if (osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
10847 ss
<< "min_compat_client "
10848 << ceph_release_name(osdmap
.require_min_compat_client
)
10849 << " < luminous, which is required for pg-upmap. "
10850 << "Try 'ceph osd set-require-min-compat-client luminous' "
10851 << "before using the new interface";
10855 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
10856 if (err
== -EAGAIN
)
10861 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
10862 ss
<< "unable to parse 'pgid' value '"
10863 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
10868 if (!pgid
.parse(pgidstr
.c_str())) {
10869 ss
<< "invalid pgid '" << pgidstr
<< "'";
10873 if (!osdmap
.pg_exists(pgid
)) {
10874 ss
<< "pg " << pgid
<< " does not exist";
10878 if (pending_inc
.old_pools
.count(pgid
.pool())) {
10879 ss
<< "pool of " << pgid
<< " is pending removal";
10882 wait_for_finished_proposal(op
,
10883 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
10891 OP_RM_PG_UPMAP_ITEMS
,
10894 if (prefix
== "osd pg-upmap") {
10895 option
= OP_PG_UPMAP
;
10896 } else if (prefix
== "osd rm-pg-upmap") {
10897 option
= OP_RM_PG_UPMAP
;
10898 } else if (prefix
== "osd pg-upmap-items") {
10899 option
= OP_PG_UPMAP_ITEMS
;
10901 option
= OP_RM_PG_UPMAP_ITEMS
;
10904 // check pending upmap changes
10906 case OP_PG_UPMAP
: // fall through
10907 case OP_RM_PG_UPMAP
:
10908 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
10909 pending_inc
.old_pg_upmap
.count(pgid
)) {
10910 dout(10) << __func__
<< " waiting for pending update on "
10912 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10917 case OP_PG_UPMAP_ITEMS
: // fall through
10918 case OP_RM_PG_UPMAP_ITEMS
:
10919 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
10920 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
10921 dout(10) << __func__
<< " waiting for pending update on "
10923 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10929 ceph_abort_msg("invalid option");
10935 vector
<int64_t> id_vec
;
10936 if (!cmd_getval(cct
, cmdmap
, "id", id_vec
)) {
10937 ss
<< "unable to parse 'id' value(s) '"
10938 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
10943 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
10944 if ((int)id_vec
.size() < pool_min_size
) {
10945 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
10946 << pool_min_size
<< ")";
10951 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10952 if ((int)id_vec
.size() > pool_size
) {
10953 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
10954 << pool_size
<< ")";
10959 vector
<int32_t> new_pg_upmap
;
10960 for (auto osd
: id_vec
) {
10961 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
10962 ss
<< "osd." << osd
<< " does not exist";
10966 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
10967 if (it
!= new_pg_upmap
.end()) {
10968 ss
<< "osd." << osd
<< " already exists, ";
10971 new_pg_upmap
.push_back(osd
);
10974 if (new_pg_upmap
.empty()) {
10975 ss
<< "no valid upmap items(pairs) is specified";
10980 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
10981 new_pg_upmap
.begin(), new_pg_upmap
.end());
10982 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
10986 case OP_RM_PG_UPMAP
:
10988 pending_inc
.old_pg_upmap
.insert(pgid
);
10989 ss
<< "clear " << pgid
<< " pg_upmap mapping";
10993 case OP_PG_UPMAP_ITEMS
:
10995 vector
<int64_t> id_vec
;
10996 if (!cmd_getval(cct
, cmdmap
, "id", id_vec
)) {
10997 ss
<< "unable to parse 'id' value(s) '"
10998 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11003 if (id_vec
.size() % 2) {
11004 ss
<< "you must specify pairs of osd ids to be remapped";
11009 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11010 if ((int)(id_vec
.size() / 2) > pool_size
) {
11011 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
11012 << pool_size
<< ")";
11017 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
11018 ostringstream items
;
11020 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
11024 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
11027 if (!osdmap
.exists(from
)) {
11028 ss
<< "osd." << from
<< " does not exist";
11032 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
11033 ss
<< "osd." << to
<< " does not exist";
11037 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
11038 auto it
= std::find(new_pg_upmap_items
.begin(),
11039 new_pg_upmap_items
.end(), entry
);
11040 if (it
!= new_pg_upmap_items
.end()) {
11041 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
11044 new_pg_upmap_items
.push_back(entry
);
11045 items
<< from
<< "->" << to
<< ",";
11047 string
out(items
.str());
11048 out
.resize(out
.size() - 1); // drop last ','
11051 if (new_pg_upmap_items
.empty()) {
11052 ss
<< "no valid upmap items(pairs) is specified";
11057 pending_inc
.new_pg_upmap_items
[pgid
] =
11058 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
11059 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
11060 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
11064 case OP_RM_PG_UPMAP_ITEMS
:
11066 pending_inc
.old_pg_upmap_items
.insert(pgid
);
11067 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
11072 ceph_abort_msg("invalid option");
11076 } else if (prefix
== "osd primary-affinity") {
11078 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11079 ss
<< "invalid osd id value '"
11080 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11085 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
11086 ss
<< "unable to parse 'weight' value '"
11087 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11091 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
11093 ss
<< "weight must be >= 0";
11097 if (osdmap
.require_min_compat_client
> 0 &&
11098 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
11099 ss
<< "require_min_compat_client "
11100 << ceph_release_name(osdmap
.require_min_compat_client
)
11101 << " < firefly, which is required for primary-affinity";
11105 if (osdmap
.exists(id
)) {
11106 pending_inc
.new_primary_affinity
[id
] = ww
;
11107 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
11109 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11110 get_last_committed() + 1));
11113 ss
<< "osd." << id
<< " does not exist";
11117 } else if (prefix
== "osd reweight") {
11119 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11120 ss
<< "unable to parse osd id value '"
11121 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11126 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
11127 ss
<< "unable to parse weight value '"
11128 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11132 long ww
= (int)((double)CEPH_OSD_IN
*w
);
11134 ss
<< "weight must be >= 0";
11138 if (osdmap
.exists(id
)) {
11139 pending_inc
.new_weight
[id
] = ww
;
11140 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
11142 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11143 get_last_committed() + 1));
11146 ss
<< "osd." << id
<< " does not exist";
11150 } else if (prefix
== "osd reweightn") {
11151 map
<int32_t, uint32_t> weights
;
11152 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
11154 ss
<< "unable to parse 'weights' value '"
11155 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
11158 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
11159 wait_for_finished_proposal(
11161 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
11163 } else if (prefix
== "osd lost") {
11165 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11166 ss
<< "unable to parse osd id value '"
11167 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11172 cmd_getval(g_ceph_context
, cmdmap
, "yes_i_really_mean_it", sure
);
11174 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
11175 "--yes-i-really-mean-it if you really do.";
11178 } else if (!osdmap
.exists(id
)) {
11179 ss
<< "osd." << id
<< " does not exist";
11182 } else if (!osdmap
.is_down(id
)) {
11183 ss
<< "osd." << id
<< " is not down";
11187 epoch_t e
= osdmap
.get_info(id
).down_at
;
11188 pending_inc
.new_lost
[id
] = e
;
11189 ss
<< "marked osd lost in epoch " << e
;
11191 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11192 get_last_committed() + 1));
11196 } else if (prefix
== "osd destroy-actual" ||
11197 prefix
== "osd purge-actual" ||
11198 prefix
== "osd purge-new") {
11199 /* Destroying an OSD means that we don't expect to further make use of
11200 * the OSDs data (which may even become unreadable after this operation),
11201 * and that we are okay with scrubbing all its cephx keys and config-key
11202 * data (which may include lockbox keys, thus rendering the osd's data
11205 * The OSD will not be removed. Instead, we will mark it as destroyed,
11206 * such that a subsequent call to `create` will not reuse the osd id.
11207 * This will play into being able to recreate the OSD, at the same
11208 * crush location, with minimal data movement.
11211 // make sure authmon is writeable.
11212 if (!mon
->authmon()->is_writeable()) {
11213 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
11214 << "osd destroy" << dendl
;
11215 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
11220 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11221 auto p
= cmdmap
.find("id");
11222 if (p
== cmdmap
.end()) {
11223 ss
<< "no osd id specified";
11225 ss
<< "unable to parse osd id value '"
11226 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
11232 bool is_destroy
= (prefix
== "osd destroy-actual");
11234 ceph_assert("osd purge-actual" == prefix
||
11235 "osd purge-new" == prefix
);
11239 cmd_getval(g_ceph_context
, cmdmap
, "yes_i_really_mean_it", sure
);
11241 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
11242 << "This will mean real, permanent data loss, as well "
11243 << "as deletion of cephx and lockbox keys. "
11244 << "Pass --yes-i-really-mean-it if you really do.";
11247 } else if (!osdmap
.exists(id
)) {
11248 ss
<< "osd." << id
<< " does not exist";
11249 err
= 0; // idempotent
11251 } else if (osdmap
.is_up(id
)) {
11252 ss
<< "osd." << id
<< " is not `down`.";
11255 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
11256 ss
<< "destroyed osd." << id
;
11261 if (prefix
== "osd purge-new" &&
11262 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
11263 ss
<< "osd." << id
<< " is not new";
11268 bool goto_reply
= false;
11272 err
= prepare_command_osd_destroy(id
, ss
);
11273 // we checked above that it should exist.
11274 ceph_assert(err
!= -ENOENT
);
11276 err
= prepare_command_osd_purge(id
, ss
);
11277 if (err
== -ENOENT
) {
11279 ss
<< "osd." << id
<< " does not exist.";
11285 if (err
< 0 || goto_reply
) {
11290 ss
<< "destroyed osd." << id
;
11292 ss
<< "purged osd." << id
;
11296 wait_for_finished_proposal(op
,
11297 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
11298 force_immediate_propose();
11301 } else if (prefix
== "osd new") {
11303 // make sure authmon is writeable.
11304 if (!mon
->authmon()->is_writeable()) {
11305 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
11306 << "osd new" << dendl
;
11307 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
11311 map
<string
,string
> param_map
;
11313 bufferlist bl
= m
->get_data();
11314 string param_json
= bl
.to_str();
11315 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
11317 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
11321 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
11324 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
11337 if (err
== EEXIST
) {
11338 // idempotent operation
11343 wait_for_finished_proposal(op
,
11344 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
11345 get_last_committed() + 1));
11346 force_immediate_propose();
11349 } else if (prefix
== "osd create") {
11351 // optional id provided?
11352 int64_t id
= -1, cmd_id
= -1;
11353 if (cmd_getval(cct
, cmdmap
, "id", cmd_id
)) {
11355 ss
<< "invalid osd id value '" << cmd_id
<< "'";
11359 dout(10) << " osd create got id " << cmd_id
<< dendl
;
11364 if (cmd_getval(cct
, cmdmap
, "uuid", uuidstr
)) {
11365 if (!uuid
.parse(uuidstr
.c_str())) {
11366 ss
<< "invalid uuid value '" << uuidstr
<< "'";
11370 // we only care about the id if we also have the uuid, to
11371 // ensure the operation's idempotency.
11375 int32_t new_id
= -1;
11376 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
11378 if (err
== -EAGAIN
) {
11379 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11382 // a check has failed; reply to the user.
11385 } else if (err
== EEXIST
) {
11386 // this is an idempotent operation; we can go ahead and reply.
11388 f
->open_object_section("created_osd");
11389 f
->dump_int("osdid", new_id
);
11390 f
->close_section();
11400 string empty_device_class
;
11401 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
11404 f
->open_object_section("created_osd");
11405 f
->dump_int("osdid", new_id
);
11406 f
->close_section();
11412 wait_for_finished_proposal(op
,
11413 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
11414 get_last_committed() + 1));
11417 } else if (prefix
== "osd blacklist clear") {
11418 pending_inc
.new_blacklist
.clear();
11419 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
11420 osdmap
.get_blacklist(&blacklist
);
11421 for (const auto &entry
: blacklist
) {
11422 pending_inc
.old_blacklist
.push_back(entry
.first
);
11424 ss
<< " removed all blacklist entries";
11426 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11427 get_last_committed() + 1));
11429 } else if (prefix
== "osd blacklist") {
11431 cmd_getval(cct
, cmdmap
, "addr", addrstr
);
11432 entity_addr_t addr
;
11433 if (!addr
.parse(addrstr
.c_str(), 0)) {
11434 ss
<< "unable to parse address " << addrstr
;
11439 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
11440 // always blacklist type ANY
11441 addr
.set_type(entity_addr_t::TYPE_ANY
);
11443 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
11446 string blacklistop
;
11447 cmd_getval(cct
, cmdmap
, "blacklistop", blacklistop
);
11448 if (blacklistop
== "add") {
11449 utime_t expires
= ceph_clock_now();
11451 // default one hour
11452 cmd_getval(cct
, cmdmap
, "expire", d
,
11453 g_conf()->mon_osd_blacklist_default_expire
);
11456 pending_inc
.new_blacklist
[addr
] = expires
;
11459 // cancel any pending un-blacklisting request too
11460 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
11461 pending_inc
.old_blacklist
.end(), addr
);
11462 if (it
!= pending_inc
.old_blacklist
.end()) {
11463 pending_inc
.old_blacklist
.erase(it
);
11467 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
11469 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11470 get_last_committed() + 1));
11472 } else if (blacklistop
== "rm") {
11473 if (osdmap
.is_blacklisted(addr
) ||
11474 pending_inc
.new_blacklist
.count(addr
)) {
11475 if (osdmap
.is_blacklisted(addr
))
11476 pending_inc
.old_blacklist
.push_back(addr
);
11478 pending_inc
.new_blacklist
.erase(addr
);
11479 ss
<< "un-blacklisting " << addr
;
11481 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11482 get_last_committed() + 1));
11485 ss
<< addr
<< " isn't blacklisted";
11490 } else if (prefix
== "osd pool mksnap") {
11492 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11493 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
11495 ss
<< "unrecognized pool '" << poolstr
<< "'";
11500 cmd_getval(cct
, cmdmap
, "snap", snapname
);
11501 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
11502 if (p
->is_unmanaged_snaps_mode()) {
11503 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
11506 } else if (p
->snap_exists(snapname
.c_str())) {
11507 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
11510 } else if (p
->is_tier()) {
11511 ss
<< "pool " << poolstr
<< " is a cache tier";
11516 if (pending_inc
.new_pools
.count(pool
))
11517 pp
= &pending_inc
.new_pools
[pool
];
11519 pp
= &pending_inc
.new_pools
[pool
];
11522 if (pp
->snap_exists(snapname
.c_str())) {
11523 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
11525 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
11526 pp
->set_snap_epoch(pending_inc
.epoch
);
11527 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
11530 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11531 get_last_committed() + 1));
11533 } else if (prefix
== "osd pool rmsnap") {
11535 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11536 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
11538 ss
<< "unrecognized pool '" << poolstr
<< "'";
11543 cmd_getval(cct
, cmdmap
, "snap", snapname
);
11544 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
11545 if (p
->is_unmanaged_snaps_mode()) {
11546 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
11549 } else if (!p
->snap_exists(snapname
.c_str())) {
11550 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
11555 if (pending_inc
.new_pools
.count(pool
))
11556 pp
= &pending_inc
.new_pools
[pool
];
11558 pp
= &pending_inc
.new_pools
[pool
];
11561 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
11563 pp
->remove_snap(sn
);
11564 pp
->set_snap_epoch(pending_inc
.epoch
);
11565 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
11567 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
11570 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11571 get_last_committed() + 1));
11573 } else if (prefix
== "osd pool create") {
11574 int64_t pg_num
, pg_num_min
;
11576 cmd_getval(cct
, cmdmap
, "pg_num", pg_num
, int64_t(0));
11577 cmd_getval(cct
, cmdmap
, "pgp_num", pgp_num
, pg_num
);
11578 cmd_getval(cct
, cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
11580 string pool_type_str
;
11581 cmd_getval(cct
, cmdmap
, "pool_type", pool_type_str
);
11582 if (pool_type_str
.empty())
11583 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
11586 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11587 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11588 if (pool_id
>= 0) {
11589 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11590 if (pool_type_str
!= p
->get_type_name()) {
11591 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
11594 ss
<< "pool '" << poolstr
<< "' already exists";
11601 if (pool_type_str
== "replicated") {
11602 pool_type
= pg_pool_t::TYPE_REPLICATED
;
11603 } else if (pool_type_str
== "erasure") {
11604 pool_type
= pg_pool_t::TYPE_ERASURE
;
11606 ss
<< "unknown pool type '" << pool_type_str
<< "'";
11611 bool implicit_rule_creation
= false;
11612 int64_t expected_num_objects
= 0;
11614 cmd_getval(cct
, cmdmap
, "rule", rule_name
);
11615 string erasure_code_profile
;
11616 cmd_getval(cct
, cmdmap
, "erasure_code_profile", erasure_code_profile
);
11618 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
11619 if (erasure_code_profile
== "")
11620 erasure_code_profile
= "default";
11621 //handle the erasure code profile
11622 if (erasure_code_profile
== "default") {
11623 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
11624 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
11625 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
11629 map
<string
,string
> profile_map
;
11630 err
= osdmap
.get_erasure_code_profile_default(cct
,
11635 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
11636 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
11640 if (rule_name
== "") {
11641 implicit_rule_creation
= true;
11642 if (erasure_code_profile
== "default") {
11643 rule_name
= "erasure-code";
11645 dout(1) << "implicitly use rule named after the pool: "
11646 << poolstr
<< dendl
;
11647 rule_name
= poolstr
;
11650 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects",
11651 expected_num_objects
, int64_t(0));
11653 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
11654 // and put expected_num_objects to rule field
11655 if (erasure_code_profile
!= "") { // cmd is from CLI
11656 if (rule_name
!= "") {
11658 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
11659 if (interr
.length()) {
11660 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
11665 rule_name
= erasure_code_profile
;
11666 } else { // cmd is well-formed
11667 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects",
11668 expected_num_objects
, int64_t(0));
11672 if (!implicit_rule_creation
&& rule_name
!= "") {
11674 err
= get_crush_rule(rule_name
, &rule
, &ss
);
11675 if (err
== -EAGAIN
) {
11676 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11683 if (expected_num_objects
< 0) {
11684 ss
<< "'expected_num_objects' must be non-negative";
11689 if (expected_num_objects
> 0 &&
11690 cct
->_conf
->osd_objectstore
== "filestore" &&
11691 cct
->_conf
->filestore_merge_threshold
> 0) {
11692 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
11697 if (expected_num_objects
== 0 &&
11698 cct
->_conf
->osd_objectstore
== "filestore" &&
11699 cct
->_conf
->filestore_merge_threshold
< 0) {
11700 int osds
= osdmap
.get_num_osds();
11701 if (osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
11702 ss
<< "For better initial performance on pools expected to store a "
11703 << "large number of objects, consider supplying the "
11704 << "expected_num_objects parameter when creating the pool.\n";
11708 int64_t fast_read_param
;
11709 cmd_getval(cct
, cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
11710 FastReadType fast_read
= FAST_READ_DEFAULT
;
11711 if (fast_read_param
== 0)
11712 fast_read
= FAST_READ_OFF
;
11713 else if (fast_read_param
> 0)
11714 fast_read
= FAST_READ_ON
;
11716 int64_t repl_size
= 0;
11717 cmd_getval(cct
, cmdmap
, "size", repl_size
);
11718 int64_t target_size_bytes
= 0;
11719 double target_size_ratio
= 0.0;
11720 cmd_getval(cct
, cmdmap
, "target_size_bytes", target_size_bytes
);
11721 cmd_getval(cct
, cmdmap
, "target_size_ratio", target_size_ratio
);
11723 err
= prepare_new_pool(poolstr
,
11724 -1, // default crush rule
11726 pg_num
, pgp_num
, pg_num_min
,
11727 repl_size
, target_size_bytes
, target_size_ratio
,
11728 erasure_code_profile
, pool_type
,
11729 (uint64_t)expected_num_objects
,
11735 ss
<< "pool '" << poolstr
<< "' already exists";
11738 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11747 ss
<< "pool '" << poolstr
<< "' created";
11750 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11751 get_last_committed() + 1));
11754 } else if (prefix
== "osd pool delete" ||
11755 prefix
== "osd pool rm") {
11756 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
11757 string poolstr
, poolstr2
, sure
;
11758 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11759 cmd_getval(cct
, cmdmap
, "pool2", poolstr2
);
11760 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
11762 ss
<< "pool '" << poolstr
<< "' does not exist";
11767 bool force_no_fake
= false;
11768 cmd_getval(cct
, cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
11769 bool force
= false;
11770 cmd_getval(cct
, cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
11771 if (poolstr2
!= poolstr
||
11772 (!force
&& !force_no_fake
)) {
11773 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
11774 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
11775 << "followed by --yes-i-really-really-mean-it.";
11779 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
11780 if (err
== -EAGAIN
) {
11781 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11787 } else if (prefix
== "osd pool rename") {
11788 string srcpoolstr
, destpoolstr
;
11789 cmd_getval(cct
, cmdmap
, "srcpool", srcpoolstr
);
11790 cmd_getval(cct
, cmdmap
, "destpool", destpoolstr
);
11791 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
11792 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
11794 if (pool_src
< 0) {
11795 if (pool_dst
>= 0) {
11796 // src pool doesn't exist, dst pool does exist: to ensure idempotency
11797 // of operations, assume this rename succeeded, as it is not changing
11798 // the current state. Make sure we output something understandable
11799 // for whoever is issuing the command, if they are paying attention,
11800 // in case it was not intentional; or to avoid a "wtf?" and a bug
11801 // report in case it was intentional, while expecting a failure.
11802 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
11803 << destpoolstr
<< "' does -- assuming successful rename";
11806 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
11810 } else if (pool_dst
>= 0) {
11811 // source pool exists and so does the destination pool
11812 ss
<< "pool '" << destpoolstr
<< "' already exists";
11817 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
11819 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
11821 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
11822 << cpp_strerror(ret
);
11825 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
11826 get_last_committed() + 1));
11829 } else if (prefix
== "osd pool set") {
11830 err
= prepare_command_pool_set(cmdmap
, ss
);
11831 if (err
== -EAGAIN
)
11837 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11838 get_last_committed() + 1));
11840 } else if (prefix
== "osd tier add") {
11841 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11842 if (err
== -EAGAIN
)
11847 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11848 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11850 ss
<< "unrecognized pool '" << poolstr
<< "'";
11854 string tierpoolstr
;
11855 cmd_getval(cct
, cmdmap
, "tierpool", tierpoolstr
);
11856 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11857 if (tierpool_id
< 0) {
11858 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11862 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11864 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11867 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
11871 // make sure new tier is empty
11872 string force_nonempty
;
11873 cmd_getval(cct
, cmdmap
, "force_nonempty", force_nonempty
);
11874 const pool_stat_t
*pstats
= mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
11875 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
11876 force_nonempty
!= "--force-nonempty") {
11877 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
11881 if (tp
->is_erasure()) {
11882 ss
<< "tier pool '" << tierpoolstr
11883 << "' is an ec pool, which cannot be a tier";
11887 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
11888 ((force_nonempty
!= "--force-nonempty") ||
11889 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
11890 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
11895 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11896 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11897 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
11898 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11901 np
->tiers
.insert(tierpool_id
);
11902 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
11903 ntp
->tier_of
= pool_id
;
11904 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
11905 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11906 get_last_committed() + 1));
11908 } else if (prefix
== "osd tier remove" ||
11909 prefix
== "osd tier rm") {
11911 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11912 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11914 ss
<< "unrecognized pool '" << poolstr
<< "'";
11918 string tierpoolstr
;
11919 cmd_getval(cct
, cmdmap
, "tierpool", tierpoolstr
);
11920 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11921 if (tierpool_id
< 0) {
11922 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11926 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11928 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11931 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
11935 if (p
->tiers
.count(tierpool_id
) == 0) {
11936 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
11940 if (tp
->tier_of
!= pool_id
) {
11941 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
11942 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
11943 // be scary about it; this is an inconsistency and bells must go off
11944 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11948 if (p
->read_tier
== tierpool_id
) {
11949 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
11954 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11955 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11956 if (np
->tiers
.count(tierpool_id
) == 0 ||
11957 ntp
->tier_of
!= pool_id
||
11958 np
->read_tier
== tierpool_id
) {
11959 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11962 np
->tiers
.erase(tierpool_id
);
11964 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
11965 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11966 get_last_committed() + 1));
11968 } else if (prefix
== "osd tier set-overlay") {
11969 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11970 if (err
== -EAGAIN
)
11975 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11976 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11978 ss
<< "unrecognized pool '" << poolstr
<< "'";
11982 string overlaypoolstr
;
11983 cmd_getval(cct
, cmdmap
, "overlaypool", overlaypoolstr
);
11984 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
11985 if (overlaypool_id
< 0) {
11986 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
11990 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11992 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
11993 ceph_assert(overlay_p
);
11994 if (p
->tiers
.count(overlaypool_id
) == 0) {
11995 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
11999 if (p
->read_tier
== overlaypool_id
) {
12001 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12004 if (p
->has_read_tier()) {
12005 ss
<< "pool '" << poolstr
<< "' has overlay '"
12006 << osdmap
.get_pool_name(p
->read_tier
)
12007 << "'; please remove-overlay first";
12013 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12014 np
->read_tier
= overlaypool_id
;
12015 np
->write_tier
= overlaypool_id
;
12016 np
->set_last_force_op_resend(pending_inc
.epoch
);
12017 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
12018 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
12019 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12020 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
12021 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
12022 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12023 get_last_committed() + 1));
12025 } else if (prefix
== "osd tier remove-overlay" ||
12026 prefix
== "osd tier rm-overlay") {
12028 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
12029 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12031 ss
<< "unrecognized pool '" << poolstr
<< "'";
12035 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12037 if (!p
->has_read_tier()) {
12039 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12043 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
12048 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12049 if (np
->has_read_tier()) {
12050 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
12051 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
12052 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12054 if (np
->has_write_tier()) {
12055 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
12056 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
12057 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12059 np
->clear_read_tier();
12060 np
->clear_write_tier();
12061 np
->set_last_force_op_resend(pending_inc
.epoch
);
12062 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12063 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12064 get_last_committed() + 1));
12066 } else if (prefix
== "osd tier cache-mode") {
12067 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12068 if (err
== -EAGAIN
)
12073 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
12074 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12076 ss
<< "unrecognized pool '" << poolstr
<< "'";
12080 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12082 if (!p
->is_tier()) {
12083 ss
<< "pool '" << poolstr
<< "' is not a tier";
12088 cmd_getval(cct
, cmdmap
, "mode", modestr
);
12089 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
12091 ss
<< "'" << modestr
<< "' is not a valid cache mode";
12097 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
12099 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12100 mode
!= pg_pool_t::CACHEMODE_NONE
&&
12101 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12102 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
12104 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
12105 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12110 // pool already has this cache-mode set and there are no pending changes
12111 if (p
->cache_mode
== mode
&&
12112 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
12113 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
12114 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
12115 << " to " << pg_pool_t::get_cache_mode_name(mode
);
12120 /* Mode description:
12122 * none: No cache-mode defined
12123 * forward: Forward all reads and writes to base pool
12124 * writeback: Cache writes, promote reads from base pool
12125 * readonly: Forward writes to base pool
12126 * readforward: Writes are in writeback mode, Reads are in forward mode
12127 * proxy: Proxy all reads and writes to base pool
12128 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12130 * Hence, these are the allowed transitions:
12133 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12134 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12135 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12136 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
12137 * writeback -> readforward || readproxy || forward || proxy
12141 // We check if the transition is valid against the current pool mode, as
12142 // it is the only committed state thus far. We will blantly squash
12143 // whatever mode is on the pending state.
12145 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
12146 (mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12147 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12148 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12149 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
12150 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
12151 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
12152 << "' pool; only '"
12153 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD
)
12155 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
12157 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD
)
12159 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
12164 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
12165 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12166 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12167 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12168 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
12170 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
12171 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12172 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12173 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12174 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
12176 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
12177 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12178 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12179 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12180 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
12182 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
12183 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12184 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12185 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12186 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
12188 const pool_stat_t
* pstats
=
12189 mon
->mgrstatmon()->get_pool_stat(pool_id
);
12191 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
12192 ss
<< "unable to set cache-mode '"
12193 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
12194 << "': dirty objects found";
12200 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12201 np
->cache_mode
= mode
;
12202 // set this both when moving to and from cache_mode NONE. this is to
12203 // capture legacy pools that were set up before this flag existed.
12204 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
12205 ss
<< "set cache-mode for pool '" << poolstr
12206 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
12207 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
12208 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
12209 ceph_assert(base_pool
);
12210 if (base_pool
->read_tier
== pool_id
||
12211 base_pool
->write_tier
== pool_id
)
12212 ss
<<" (WARNING: pool is still configured as read or write tier)";
12214 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12215 get_last_committed() + 1));
12217 } else if (prefix
== "osd tier add-cache") {
12218 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12219 if (err
== -EAGAIN
)
12224 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
12225 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12227 ss
<< "unrecognized pool '" << poolstr
<< "'";
12231 string tierpoolstr
;
12232 cmd_getval(cct
, cmdmap
, "tierpool", tierpoolstr
);
12233 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12234 if (tierpool_id
< 0) {
12235 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12239 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12241 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12244 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
12249 if (!cmd_getval(cct
, cmdmap
, "size", size
)) {
12250 ss
<< "unable to parse 'size' value '"
12251 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
12255 // make sure new tier is empty
12256 const pool_stat_t
*pstats
=
12257 mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
12258 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
12259 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
12263 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
12264 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
12266 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
12270 HitSet::Params hsp
;
12271 auto& cache_hit_set_type
=
12272 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
12273 if (cache_hit_set_type
== "bloom") {
12274 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
12275 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
12276 hsp
= HitSet::Params(bsp
);
12277 } else if (cache_hit_set_type
== "explicit_hash") {
12278 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
12279 } else if (cache_hit_set_type
== "explicit_object") {
12280 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
12282 ss
<< "osd tier cache default hit set type '"
12283 << cache_hit_set_type
<< "' is not a known type";
12288 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12289 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12290 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
12291 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12294 np
->tiers
.insert(tierpool_id
);
12295 np
->read_tier
= np
->write_tier
= tierpool_id
;
12296 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
12297 np
->set_last_force_op_resend(pending_inc
.epoch
);
12298 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
12299 ntp
->tier_of
= pool_id
;
12300 ntp
->cache_mode
= mode
;
12301 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
12302 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
12303 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
12304 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
12305 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
12306 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
12307 ntp
->hit_set_params
= hsp
;
12308 ntp
->target_max_bytes
= size
;
12309 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
12310 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12311 get_last_committed() + 1));
12313 } else if (prefix
== "osd pool set-quota") {
12315 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
12316 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12318 ss
<< "unrecognized pool '" << poolstr
<< "'";
12324 cmd_getval(cct
, cmdmap
, "field", field
);
12325 if (field
!= "max_objects" && field
!= "max_bytes") {
12326 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
12331 // val could contain unit designations, so we treat as a string
12333 cmd_getval(cct
, cmdmap
, "val", val
);
12336 if (field
== "max_objects") {
12337 value
= strict_sistrtoll(val
.c_str(), &tss
);
12338 } else if (field
== "max_bytes") {
12339 value
= strict_iecstrtoll(val
.c_str(), &tss
);
12341 ceph_abort_msg("unrecognized option");
12343 if (!tss
.empty()) {
12344 ss
<< "error parsing value '" << val
<< "': " << tss
;
12349 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
12350 if (field
== "max_objects") {
12351 pi
->quota_max_objects
= value
;
12352 } else if (field
== "max_bytes") {
12353 pi
->quota_max_bytes
= value
;
12355 ceph_abort_msg("unrecognized option");
12357 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
12359 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12360 get_last_committed() + 1));
12362 } else if (prefix
== "osd pool application enable" ||
12363 prefix
== "osd pool application disable" ||
12364 prefix
== "osd pool application set" ||
12365 prefix
== "osd pool application rm") {
12366 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
12367 if (err
== -EAGAIN
) {
12369 } else if (err
< 0) {
12374 } else if (prefix
== "osd force-create-pg") {
12377 cmd_getval(cct
, cmdmap
, "pgid", pgidstr
);
12378 if (!pgid
.parse(pgidstr
.c_str())) {
12379 ss
<< "invalid pgid '" << pgidstr
<< "'";
12383 if (!osdmap
.pg_exists(pgid
)) {
12384 ss
<< "pg " << pgid
<< " should not exist";
12389 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
12391 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
12392 << "that the cluster will give up ever trying to recover the lost data. Do this "
12393 << "only if you are certain that all copies of the PG are in fact lost and you are "
12394 << "willing to accept that the data is permanently destroyed. Pass "
12395 << "--yes-i-really-mean-it to proceed.";
12401 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
12402 auto emplaced
= creating_pgs
.pgs
.emplace(pgid
,
12403 make_pair(osdmap
.get_epoch(),
12404 ceph_clock_now()));
12405 creating_now
= emplaced
.second
;
12407 if (creating_now
) {
12408 ss
<< "pg " << pgidstr
<< " now creating, ok";
12409 // set the pool's CREATING flag so that (1) the osd won't ignore our
12410 // create message and (2) we won't propose any future pg_num changes
12411 // until after the PG has been instantiated.
12412 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
12413 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
12415 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
12419 ss
<< "pg " << pgid
<< " already creating";
12429 if (err
< 0 && rs
.length() == 0)
12430 rs
= cpp_strerror(err
);
12431 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
12436 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12437 get_last_committed() + 1));
12441 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// Enforce monitor capability requirements for an incoming MPoolOp.
// Unmanaged-snapshot create/delete ops go through a dedicated permission
// check (is_unmanaged_snap_op_permitted, fed the session's entity name,
// caps, peer address and the target pool's name); every other pool op is
// gated on plain 'osd' MON_CAP_W. A failed check is answered immediately
// with -EPERM at the current osdmap epoch.
// NOTE(review): this chunk is a partial extraction — several original
// lines are missing (e.g. the null-session guard near the top, the
// switch(m->op) header, and the return statements), so the control flow
// shown here is incomplete; confirm against the full OSDMonitor.cc.
12445 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
12447 op
->mark_osdmon_event(__func__
);
// The raw pool-op request and the authenticated session it arrived on.
12449 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12450 MonSession
*session
= op
->get_session();
// presumably guarded by a missing 'if (!session)' check — reply -EPERM
// when there is no session to authorize against. TODO confirm.
12452 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Unmanaged snapshot ops get a per-pool permission check rather than the
// generic MON_CAP_W test below.
12457 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12458 case POOL_OP_DELETE_UNMANAGED_SNAP
:
// Resolve the pool name only if the pool still exists; pool_name stays
// nullptr otherwise.
12460 const std::string
* pool_name
= nullptr;
12461 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
12462 if (pg_pool
!= nullptr) {
12463 pool_name
= &osdmap
.get_pool_name(m
->pool
);
12466 if (!is_unmanaged_snap_op_permitted(cct
, mon
->key_server
,
12467 session
->entity_name
, session
->caps
,
12468 session
->get_peer_socket_addr(),
// Log at level 0 so operator-visible: caps were insufficient.
12470 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
12471 << "privileges. message: " << *m
<< std::endl
12472 << "caps: " << session
->caps
<< dendl
;
12473 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Default path: any other pool op requires 'osd' write capability.
12479 if (!session
->is_capable("osd", MON_CAP_W
)) {
12480 dout(0) << "got pool op from entity with insufficient privileges. "
12481 << "message: " << *m
<< std::endl
12482 << "caps: " << session
->caps
<< dendl
;
12483 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Read-only preprocessing of an MPoolOp: answer requests that can be
// decided against the *committed* osdmap without proposing a map change
// (wrong fsid, nonexistent pool, idempotent no-ops, invalid snap-mode
// combinations). Requests that need a map update fall through to
// prepare_pool_op().
// NOTE(review): partial extraction — the switch(m->op) header, the
// 'return true/false' statements and several closing braces are missing
// from this chunk; verify branch structure against the full file.
12492 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
12494 op
->mark_osdmon_event(__func__
);
12495 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
// Capability check first; enforce_pool_op_caps() replies -EPERM itself
// when it fails.
12497 if (enforce_pool_op_caps(op
)) {
// Drop (with -EINVAL) messages addressed to a different cluster fsid.
12501 if (m
->fsid
!= mon
->monmap
->fsid
) {
12502 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
12503 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
12504 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Pool creation has its own idempotency check.
12508 if (m
->op
== POOL_OP_CREATE
)
12509 return preprocess_pool_op_create(op
);
12511 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
// Nonexistent pool: deleting it is treated as already-done (reply 0),
// anything else is -ENOENT.
12512 if (p
== nullptr) {
12513 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
12514 if (m
->op
== POOL_OP_DELETE
) {
12515 _pool_op_reply(op
, 0, osdmap
.get_epoch());
12517 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
12522 // check if the snap and snapname exist
12523 bool snap_exists
= false;
12524 if (p
->snap_exists(m
->name
.c_str()))
12525 snap_exists
= true;
// Pool snapshots cannot be created on unmanaged-snaps pools or on cache
// tiers; creating an already-existing snap is an idempotent success.
12528 case POOL_OP_CREATE_SNAP
:
12529 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
12530 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12534 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snaps are mutually exclusive with pool-snaps mode.
12538 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12539 if (p
->is_pool_snaps_mode()) {
12540 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Deleting a pool snap: invalid on unmanaged-snaps pools; deleting a
// snap that does not exist is an idempotent success.
12544 case POOL_OP_DELETE_SNAP
:
12545 if (p
->is_unmanaged_snaps_mode()) {
12546 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12549 if (!snap_exists
) {
12550 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Deleting an unmanaged snap: invalid in pool-snaps mode; deleting an
// already-removed snapid is an idempotent success.
12554 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12555 if (p
->is_pool_snaps_mode()) {
12556 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12559 if (p
->is_removed_snap(m
->snapid
)) {
12560 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Pool delete by name: hedge — this looks inverted at first glance, but
// the original replies 0 only in a branch whose surrounding lines are
// missing here; confirm the full condition in OSDMonitor.cc.
12564 case POOL_OP_DELETE
:
12565 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
12566 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// AUID changes are no longer supported (handled below / in prepare).
12570 case POOL_OP_AUID_CHANGE
:
// Read-only preprocessing for POOL_OP_CREATE: if a pool with the
// requested name already exists, creation is an idempotent success and
// we reply 0 without proposing anything.
// NOTE(review): the 'if (pool >= 0)' guard around the reply and the
// trailing 'return false' are missing from this extraction — TODO
// confirm against the full file.
12580 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
12582 op
->mark_osdmon_event(__func__
);
12583 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
// Look up the requested pool name in the committed osdmap.
12584 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
// presumably under a missing 'if (pool >= 0)': name taken -> success.
12586 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Apply a pool op to the pending osdmap increment. CREATE/DELETE are
// delegated to their own prepare_* helpers; the remaining ops mutate a
// projected copy ('pp') of the pool, record it in pending_inc.new_pools,
// and schedule a C_PoolOp reply for when the proposal commits.
// NOTE(review): partial extraction — both switch(m->op) headers, the
// 'break'/'return' statements, the 'ret' assignments and several braces
// are missing from this chunk; the ordering of statements below is
// faithful but the branch structure must be confirmed in the full file.
12593 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
12595 op
->mark_osdmon_event(__func__
);
12596 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12597 dout(10) << "prepare_pool_op " << *m
<< dendl
;
// Create/delete have dedicated prepare paths.
12598 if (m
->op
== POOL_OP_CREATE
) {
12599 return prepare_pool_op_create(op
);
12600 } else if (m
->op
== POOL_OP_DELETE
) {
12601 return prepare_pool_op_delete(op
);
12605 bool changed
= false;
// Pool must still exist in the committed map.
12607 if (!osdmap
.have_pg_pool(m
->pool
)) {
12608 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
12612 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// First pass: validate the op against the *committed* pool state.
// Pool snaps on a cache tier are rejected (ret set on a missing line).
12615 case POOL_OP_CREATE_SNAP
:
12616 if (pool
->is_tier()) {
12618 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
12620 } // else, fall through
// Idempotency: creating an existing snap / deleting a missing snap is
// answered immediately (ret presumably 0 — set on a missing line).
12621 case POOL_OP_DELETE_SNAP
:
12622 if (!pool
->is_unmanaged_snaps_mode()) {
12623 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
12624 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
12625 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
12633 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
12636 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12637 // we won't allow removal of an unmanaged snapshot from a pool
12638 // not in unmanaged snaps mode.
12639 if (!pool
->is_unmanaged_snaps_mode()) {
12640 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
12644 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12645 // but we will allow creating an unmanaged snapshot on any pool
12646 // as long as it is not in 'pool' snaps mode.
12647 if (pool
->is_pool_snaps_mode()) {
12648 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12653 // projected pool info
// Start 'pp' from the already-pending version of the pool if one exists,
// otherwise from the committed map, so edits stack within one proposal.
12655 if (pending_inc
.new_pools
.count(m
->pool
))
12656 pp
= pending_inc
.new_pools
[m
->pool
];
12658 pp
= *osdmap
.get_pg_pool(m
->pool
);
12660 bufferlist reply_data
;
12662 // pool snaps vs unmanaged snaps are mutually exclusive
// Re-check snap-mode exclusivity against the *projected* pool, since a
// pending change may have switched modes within this proposal.
12664 case POOL_OP_CREATE_SNAP
:
12665 case POOL_OP_DELETE_SNAP
:
12666 if (pp
.is_unmanaged_snaps_mode()) {
12672 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12673 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12674 if (pp
.is_pool_snaps_mode()) {
// Second pass: actually apply the op to 'pp'.
12681 case POOL_OP_CREATE_SNAP
:
12682 if (!pp
.snap_exists(m
->name
.c_str())) {
12683 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
12684 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
12685 << " seq " << pp
.get_snap_epoch() << dendl
;
12690 case POOL_OP_DELETE_SNAP
:
// Resolve the named snap to its snapid and queue it for removal.
12692 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
12695 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
// Allocate a fresh unmanaged snapid (on a missing line) and return it
// to the client in the reply payload.
12701 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12704 pp
.add_unmanaged_snap(snapid
);
12705 encode(snapid
, reply_data
);
// Removing an unmanaged snapid beyond the pool's snap_seq is -ENOENT.
12710 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12711 if (!pp
.is_removed_snap(m
->snapid
)) {
12712 if (m
->snapid
> pp
.get_snap_seq()) {
12713 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
12716 pp
.remove_unmanaged_snap(m
->snapid
);
12717 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
// AUID changes are no longer supported.
12722 case POOL_OP_AUID_CHANGE
:
12723 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
// Commit the projected pool into the pending increment (presumably only
// when 'changed' — the guard line is missing here; TODO confirm).
12732 pp
.set_snap_epoch(pending_inc
.epoch
);
12733 pending_inc
.new_pools
[m
->pool
] = pp
;
// Reply (with any encoded snapid) once the proposal commits.
12737 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// Prepare path for POOL_OP_CREATE: delegate the actual pool creation to
// prepare_new_pool(op) and reply with its result (err) at the pending
// epoch once the proposal commits.
// NOTE(review): the surrounding braces and trailing 'return true' are
// missing from this extraction.
12741 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
12743 op
->mark_osdmon_event(__func__
);
12744 int err
= prepare_new_pool(op
);
12745 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// Validate that a pool may be removed. Rejects removal when the pool is
// in use by CephFS, is a cache tier, still has tiers of its own, when
// mon_allow_pool_delete is false, or when the pool carries the NODELETE
// flag. On each rejection a human-readable reason is written to *ss.
// NOTE(review): partial extraction — the parameter list tail (the
// 'ostream *ss' argument), the numeric return values (presumably -EBUSY
// / -EPERM / 0) and the closing braces are missing; confirm against the
// full OSDMonitor.cc.
12749 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
12752 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
12754 // If the Pool is in use by CephFS, refuse to delete it
// Checked against the *pending* fsmap so an in-flight FS change counts.
12755 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12756 if (pending_fsmap
.pool_in_use(pool_id
)) {
12757 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// A pool that is itself a tier of another pool cannot be removed.
12761 if (pool
.tier_of
>= 0) {
12762 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
12763 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// A pool that still has cache tiers attached cannot be removed; list
// the tier names in the error message.
12766 if (!pool
.tiers
.empty()) {
12767 *ss
<< "pool '" << poolstr
<< "' has tiers";
12768 for(auto tier
: pool
.tiers
) {
12769 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Cluster-wide safety switch: deletion must be explicitly enabled.
12774 if (!g_conf()->mon_allow_pool_delete
) {
12775 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
// Per-pool safety flag.
12779 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
12780 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// All checks passed: success message (return 0 on a missing line).
12784 *ss
<< "pool '" << poolstr
<< "' removed";
12789 * Check if it is safe to add a tier to a base pool
12792 * True if the operation should proceed, false if we should abort here
12793 * (abort doesn't necessarily mean error, could be idempotency)
12795 bool OSDMonitor::_check_become_tier(
12796 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
12797 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
12801 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
12802 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
12804 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12805 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
12806 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
12811 if (base_pool
->tiers
.count(tier_pool_id
)) {
12812 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
12814 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
12815 << base_pool_name
<< "'";
12819 if (base_pool
->is_tier()) {
12820 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
12821 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
12822 << "multiple tiers are not yet supported.";
12827 if (tier_pool
->has_tiers()) {
12828 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
12829 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
12830 it
!= tier_pool
->tiers
.end(); ++it
)
12831 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
12832 *ss
<< " multiple tiers are not yet supported.";
12837 if (tier_pool
->is_tier()) {
12838 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
12839 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
12850 * Check if it is safe to remove a tier from this base pool
12853 * True if the operation should proceed, false if we should abort here
12854 * (abort doesn't necessarily mean error, could be idempotency)
12856 bool OSDMonitor::_check_remove_tier(
12857 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
12858 const pg_pool_t
*tier_pool
,
12859 int *err
, ostream
*ss
) const
12861 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
12863 // Apply CephFS-specific checks
12864 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12865 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
12866 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
12867 // If the underlying pool is erasure coded and does not allow EC
12868 // overwrites, we can't permit the removal of the replicated tier that
12869 // CephFS relies on to access it
12870 *ss
<< "pool '" << base_pool_name
<<
12871 "' does not allow EC overwrites and is in use by CephFS"
12877 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
12878 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
12879 "tier is still in use as a writeback cache. Change the cache "
12880 "mode and flush the cache before removing it";
12890 int OSDMonitor::_prepare_remove_pool(
12891 int64_t pool
, ostream
*ss
, bool no_fake
)
12893 dout(10) << __func__
<< " " << pool
<< dendl
;
12894 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12895 int r
= _check_remove_pool(pool
, *p
, ss
);
12899 auto new_pool
= pending_inc
.new_pools
.find(pool
);
12900 if (new_pool
!= pending_inc
.new_pools
.end()) {
12901 // if there is a problem with the pending info, wait and retry
12903 const auto& p
= new_pool
->second
;
12904 int r
= _check_remove_pool(pool
, p
, ss
);
12909 if (pending_inc
.old_pools
.count(pool
)) {
12910 dout(10) << __func__
<< " " << pool
<< " already pending removal"
12915 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
12916 string old_name
= osdmap
.get_pool_name(pool
);
12917 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
12918 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
12919 << old_name
<< " -> " << new_name
<< dendl
;
12920 pending_inc
.new_pool_names
[pool
] = new_name
;
12925 pending_inc
.old_pools
.insert(pool
);
12927 // remove any pg_temp mappings for this pool
12928 for (auto p
= osdmap
.pg_temp
->begin();
12929 p
!= osdmap
.pg_temp
->end();
12931 if (p
->first
.pool() == pool
) {
12932 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
12933 << p
->first
<< dendl
;
12934 pending_inc
.new_pg_temp
[p
->first
].clear();
12937 // remove any primary_temp mappings for this pool
12938 for (auto p
= osdmap
.primary_temp
->begin();
12939 p
!= osdmap
.primary_temp
->end();
12941 if (p
->first
.pool() == pool
) {
12942 dout(10) << __func__
<< " " << pool
12943 << " removing obsolete primary_temp" << p
->first
<< dendl
;
12944 pending_inc
.new_primary_temp
[p
->first
] = -1;
12947 // remove any pg_upmap mappings for this pool
12948 for (auto& p
: osdmap
.pg_upmap
) {
12949 if (p
.first
.pool() == pool
) {
12950 dout(10) << __func__
<< " " << pool
12951 << " removing obsolete pg_upmap "
12952 << p
.first
<< dendl
;
12953 pending_inc
.old_pg_upmap
.insert(p
.first
);
12956 // remove any pending pg_upmap mappings for this pool
12958 auto it
= pending_inc
.new_pg_upmap
.begin();
12959 while (it
!= pending_inc
.new_pg_upmap
.end()) {
12960 if (it
->first
.pool() == pool
) {
12961 dout(10) << __func__
<< " " << pool
12962 << " removing pending pg_upmap "
12963 << it
->first
<< dendl
;
12964 it
= pending_inc
.new_pg_upmap
.erase(it
);
12970 // remove any pg_upmap_items mappings for this pool
12971 for (auto& p
: osdmap
.pg_upmap_items
) {
12972 if (p
.first
.pool() == pool
) {
12973 dout(10) << __func__
<< " " << pool
12974 << " removing obsolete pg_upmap_items " << p
.first
12976 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
12979 // remove any pending pg_upmap mappings for this pool
12981 auto it
= pending_inc
.new_pg_upmap_items
.begin();
12982 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
12983 if (it
->first
.pool() == pool
) {
12984 dout(10) << __func__
<< " " << pool
12985 << " removing pending pg_upmap_items "
12986 << it
->first
<< dendl
;
12987 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
12994 // remove any choose_args for this pool
12995 CrushWrapper newcrush
;
12996 _get_pending_crush(newcrush
);
12997 if (newcrush
.have_choose_args(pool
)) {
12998 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
12999 newcrush
.rm_choose_args(pool
);
13000 pending_inc
.crush
.clear();
13001 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
13006 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
13008 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
13009 if (pending_inc
.old_pools
.count(pool
)) {
13010 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
13013 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
13014 p
!= pending_inc
.new_pool_names
.end();
13016 if (p
->second
== newname
&& p
->first
!= pool
) {
13021 pending_inc
.new_pool_names
[pool
] = newname
;
13025 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
13027 op
->mark_osdmon_event(__func__
);
13028 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
13030 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
13031 if (ret
== -EAGAIN
) {
13032 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13036 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
13037 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
13038 pending_inc
.epoch
));
13042 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
13043 int ret
, epoch_t epoch
, bufferlist
*blp
)
13045 op
->mark_osdmon_event(__func__
);
13046 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
13047 dout(20) << "_pool_op_reply " << ret
<< dendl
;
13048 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
13049 ret
, epoch
, get_last_committed(), blp
);
13050 mon
->send_reply(op
, reply
);
13053 void OSDMonitor::convert_pool_priorities(void)
13055 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
13056 int64_t max_prio
= 0;
13057 int64_t min_prio
= 0;
13058 for (const auto &i
: osdmap
.get_pools()) {
13059 const auto &pool
= i
.second
;
13061 if (pool
.opts
.is_set(key
)) {
13063 pool
.opts
.get(key
, &prio
);
13064 if (prio
> max_prio
)
13066 if (prio
< min_prio
)
13070 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
13071 dout(20) << __func__
<< " nothing to fix" << dendl
;
13074 // Current pool priorities exceeds new maximum
13075 for (const auto &i
: osdmap
.get_pools()) {
13076 const auto pool_id
= i
.first
;
13077 pg_pool_t pool
= i
.second
;
13080 pool
.opts
.get(key
, &prio
);
13083 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
13084 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13085 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
13086 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
13087 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13088 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
13093 pool
.opts
.unset(key
);
13095 pool
.opts
.set(key
, static_cast<int64_t>(n
));
13097 dout(10) << __func__
<< " pool " << pool_id
13098 << " recovery_priority adjusted "
13099 << prio
<< " to " << n
<< dendl
;
13100 pool
.last_change
= pending_inc
.epoch
;
13101 pending_inc
.new_pools
[pool_id
] = pool
;