1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Loic Dachary <loic@dachary.org>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MOSDScrub.h"
57 #include "messages/MRoute.h"
59 #include "common/TextTable.h"
60 #include "common/Timer.h"
61 #include "common/ceph_argparse.h"
62 #include "common/perf_counters.h"
63 #include "common/strtol.h"
64 #include "common/numa.h"
66 #include "common/config.h"
67 #include "common/errno.h"
69 #include "erasure-code/ErasureCodePlugin.h"
70 #include "compressor/Compressor.h"
71 #include "common/Checksummer.h"
73 #include "include/compat.h"
74 #include "include/ceph_assert.h"
75 #include "include/stringify.h"
76 #include "include/util.h"
77 #include "common/cmdparse.h"
78 #include "include/str_list.h"
79 #include "include/str_map.h"
80 #include "include/scope_guard.h"
82 #include "auth/cephx/CephxKeyServer.h"
83 #include "osd/OSDCap.h"
85 #include "json_spirit/json_spirit_reader.h"
87 #include <boost/algorithm/string/predicate.hpp>
89 #define dout_subsys ceph_subsys_mon
90 static const string
OSD_PG_CREATING_PREFIX("osd_pg_creating");
91 static const string
OSD_METADATA_PREFIX("osd_metadata");
92 static const string
OSD_SNAP_PREFIX("osd_snap");
96 const uint32_t MAX_POOL_APPLICATIONS
= 4;
97 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
98 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
100 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
101 // Note: this doesn't include support for the application tag match
102 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
103 auto& match
= grant
.match
;
104 if (match
.is_match_all()) {
106 } else if (pool_name
!= nullptr &&
107 !match
.pool_namespace
.pool_name
.empty() &&
108 match
.pool_namespace
.pool_name
== *pool_name
) {
115 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
116 const KeyServer
& key_server
,
117 const EntityName
& entity_name
,
118 const MonCap
& mon_caps
,
119 const entity_addr_t
& peer_socket_addr
,
120 const std::string
* pool_name
)
122 typedef std::map
<std::string
, std::string
> CommandArgs
;
124 if (mon_caps
.is_capable(
125 cct
, CEPH_ENTITY_TYPE_MON
,
127 "osd pool op unmanaged-snap",
128 (pool_name
== nullptr ?
129 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
130 CommandArgs
{{"poolname", *pool_name
}}),
136 AuthCapsInfo caps_info
;
137 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
139 dout(10) << "unable to locate OSD cap data for " << entity_name
140 << " in auth db" << dendl
;
145 if (caps_info
.caps
.length() > 0) {
146 auto p
= caps_info
.caps
.cbegin();
149 } catch (const buffer::error
&err
) {
150 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
157 if (!osd_cap
.parse(caps_str
, nullptr)) {
158 dout(10) << "unable to parse OSD cap data for " << entity_name
159 << " in auth db" << dendl
;
163 // if the entity has write permissions in one or all pools, permit
164 // usage of unmanaged-snapshots
165 if (osd_cap
.allow_all()) {
169 for (auto& grant
: osd_cap
.grants
) {
170 if (grant
.profile
.is_valid()) {
171 for (auto& profile_grant
: grant
.profile_grants
) {
172 if (is_osd_writable(profile_grant
, pool_name
)) {
176 } else if (is_osd_writable(grant
, pool_name
)) {
184 } // anonymous namespace
186 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
188 if (epoch_by_pg
.size() <= ps
) {
189 epoch_by_pg
.resize(ps
+ 1, 0);
191 const auto old_lec
= epoch_by_pg
[ps
];
192 if (old_lec
>= last_epoch_clean
) {
196 epoch_by_pg
[ps
] = last_epoch_clean
;
197 if (last_epoch_clean
< floor
) {
198 floor
= last_epoch_clean
;
199 } else if (last_epoch_clean
> floor
) {
200 if (old_lec
== floor
) {
201 // probably should increase floor?
202 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
203 std::end(epoch_by_pg
));
207 if (ps
!= next_missing
) {
210 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
211 if (epoch_by_pg
[next_missing
] == 0) {
217 void LastEpochClean::remove_pool(uint64_t pool
)
219 report_by_pool
.erase(pool
);
222 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
224 auto& lec
= report_by_pool
[pg
.pool()];
225 return lec
.report(pg
.ps(), last_epoch_clean
);
228 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
230 auto floor
= latest
.get_epoch();
231 for (auto& pool
: latest
.get_pools()) {
232 auto reported
= report_by_pool
.find(pool
.first
);
233 if (reported
== report_by_pool
.end()) {
236 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
239 if (reported
->second
.floor
< floor
) {
240 floor
= reported
->second
.floor
;
247 class C_UpdateCreatingPGs
: public Context
{
252 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
253 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
254 void finish(int r
) override
{
256 utime_t end
= ceph_clock_now();
257 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
258 << (end
- start
) << " seconds" << dendl
;
259 osdmon
->update_creating_pgs();
260 osdmon
->check_pg_creates_subs();
266 #define dout_prefix _prefix(_dout, mon, osdmap)
267 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
268 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
269 << "(" << mon
->get_state_name()
270 << ").osd e" << osdmap
.get_epoch() << " ";
273 OSDMonitor::OSDMonitor(
277 const string
& service_name
)
278 : PaxosService(mn
, p
, service_name
),
280 inc_osd_cache(g_conf()->mon_osd_cache_size
),
281 full_osd_cache(g_conf()->mon_osd_cache_size
),
282 has_osdmap_manifest(false),
283 mapper(mn
->cct
, &mn
->cpu_tp
)
286 bool OSDMonitor::_have_pending_crush()
288 return pending_inc
.crush
.length() > 0;
291 CrushWrapper
&OSDMonitor::_get_stable_crush()
293 return *osdmap
.crush
;
296 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
299 if (pending_inc
.crush
.length())
300 bl
= pending_inc
.crush
;
302 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
304 auto p
= bl
.cbegin();
308 void OSDMonitor::create_initial()
310 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
315 mon
->store
->get("mkfs", "osdmap", bl
);
319 newmap
.set_fsid(mon
->monmap
->fsid
);
321 newmap
.build_simple(cct
, 0, mon
->monmap
->fsid
, 0);
324 newmap
.created
= newmap
.modified
= ceph_clock_now();
326 // new clusters should sort bitwise by default.
327 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
330 CEPH_OSDMAP_RECOVERY_DELETES
|
331 CEPH_OSDMAP_PURGED_SNAPDIRS
|
332 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
333 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
334 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
335 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
336 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
337 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
338 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
340 // new cluster should require latest by default
341 if (g_conf().get_val
<bool>("mon_debug_no_require_nautilus")) {
342 if (g_conf()->mon_debug_no_require_mimic
) {
343 derr
<< __func__
<< " mon_debug_no_require_mimic=true and nautilus=true" << dendl
;
344 newmap
.require_osd_release
= CEPH_RELEASE_LUMINOUS
;
346 derr
<< __func__
<< " mon_debug_no_require_nautilus=true" << dendl
;
347 newmap
.require_osd_release
= CEPH_RELEASE_MIMIC
;
350 newmap
.require_osd_release
= CEPH_RELEASE_NAUTILUS
;
351 int r
= ceph_release_from_name(
352 g_conf()->mon_osd_initial_require_min_compat_client
.c_str());
354 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
356 newmap
.require_min_compat_client
= r
;
359 // encode into pending incremental
360 uint64_t features
= newmap
.get_encoding_features();
361 newmap
.encode(pending_inc
.fullmap
,
362 features
| CEPH_FEATURE_RESERVED
);
363 pending_inc
.full_crc
= newmap
.get_crc();
364 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
367 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
369 s
.insert(service_name
);
370 s
.insert(OSD_PG_CREATING_PREFIX
);
371 s
.insert(OSD_METADATA_PREFIX
);
372 s
.insert(OSD_SNAP_PREFIX
);
375 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
377 // we really don't care if the version has been updated, because we may
378 // have trimmed without having increased the last committed; yet, we may
379 // need to update the in-memory manifest.
380 load_osdmap_manifest();
382 version_t version
= get_last_committed();
383 if (version
== osdmap
.epoch
)
385 ceph_assert(version
> osdmap
.epoch
);
387 dout(15) << "update_from_paxos paxos e " << version
388 << ", my e " << osdmap
.epoch
<< dendl
;
391 if (!mapping_job
->is_done()) {
392 dout(1) << __func__
<< " mapping job "
393 << mapping_job
.get() << " did not complete, "
394 << mapping_job
->shards
<< " left, canceling" << dendl
;
395 mapping_job
->abort();
403 * We will possibly have a stashed latest that *we* wrote, and we will
404 * always be sure to have the oldest full map in the first..last range
405 * due to encode_trim_extra(), which includes the oldest full map in the trim
408 * encode_trim_extra() does not however write the full map's
409 * version to 'full_latest'. This is only done when we are building the
410 * full maps from the incremental versions. But don't panic! We make sure
411 * that the following conditions find whichever full map version is newer.
413 version_t latest_full
= get_version_latest_full();
414 if (latest_full
== 0 && get_first_committed() > 1)
415 latest_full
= get_first_committed();
417 if (get_first_committed() > 1 &&
418 latest_full
< get_first_committed()) {
419 // the monitor could be just sync'ed with its peer, and the latest_full key
420 // is not encoded in the paxos commits in encode_pending(), so we need to
421 // make sure we get it pointing to a proper version.
422 version_t lc
= get_last_committed();
423 version_t fc
= get_first_committed();
425 dout(10) << __func__
<< " looking for valid full map in interval"
426 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
429 for (version_t v
= lc
; v
>= fc
; v
--) {
430 string full_key
= "full_" + stringify(v
);
431 if (mon
->store
->exists(get_service_name(), full_key
)) {
432 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
438 ceph_assert(latest_full
> 0);
439 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
440 put_version_latest_full(t
, latest_full
);
441 mon
->store
->apply_transaction(t
);
442 dout(10) << __func__
<< " updated the on-disk full map version to "
443 << latest_full
<< dendl
;
446 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
447 bufferlist latest_bl
;
448 get_version_full(latest_full
, latest_bl
);
449 ceph_assert(latest_bl
.length() != 0);
450 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
452 osdmap
.decode(latest_bl
);
456 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
457 auto p
= bl
.cbegin();
458 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
459 creating_pgs
.decode(p
);
460 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
461 << creating_pgs
.last_scan_epoch
462 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
464 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
468 // walk through incrementals
469 MonitorDBStore::TransactionRef t
;
471 while (version
> osdmap
.epoch
) {
473 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
474 ceph_assert(err
== 0);
475 ceph_assert(inc_bl
.length());
477 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
479 OSDMap::Incremental
inc(inc_bl
);
480 err
= osdmap
.apply_incremental(inc
);
481 ceph_assert(err
== 0);
484 t
.reset(new MonitorDBStore::Transaction
);
486 // Write out the full map for all past epochs. Encode the full
487 // map with the same features as the incremental. If we don't
488 // know, use the quorum features. If we don't know those either,
489 // encode with all features.
490 uint64_t f
= inc
.encode_features
;
492 f
= mon
->get_quorum_con_features();
496 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
497 tx_size
+= full_bl
.length();
499 bufferlist orig_full_bl
;
500 get_version_full(osdmap
.epoch
, orig_full_bl
);
501 if (orig_full_bl
.length()) {
502 // the primary provided the full map
503 ceph_assert(inc
.have_crc
);
504 if (inc
.full_crc
!= osdmap
.crc
) {
505 // This will happen if the mons were running mixed versions in
506 // the past or some other circumstance made the full encoded
507 // maps divergent. Reloading here will bring us back into
508 // sync with the primary for this and all future maps. OSDs
509 // will also be brought back into sync when they discover the
510 // crc mismatch and request a full map from a mon.
511 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
514 dout(20) << __func__
<< " my (bad) full osdmap:\n";
515 JSONFormatter
jf(true);
516 jf
.dump_object("osdmap", osdmap
);
518 *_dout
<< "\nhexdump:\n";
519 full_bl
.hexdump(*_dout
);
523 osdmap
.decode(orig_full_bl
);
525 dout(20) << __func__
<< " canonical full osdmap:\n";
526 JSONFormatter
jf(true);
527 jf
.dump_object("osdmap", osdmap
);
529 *_dout
<< "\nhexdump:\n";
530 orig_full_bl
.hexdump(*_dout
);
534 ceph_assert(!inc
.have_crc
);
535 put_version_full(t
, osdmap
.epoch
, full_bl
);
537 put_version_latest_full(t
, osdmap
.epoch
);
540 dout(1) << osdmap
<< dendl
;
542 if (osdmap
.epoch
== 1) {
543 t
->erase("mkfs", "osdmap");
546 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
547 mon
->store
->apply_transaction(t
);
548 t
= MonitorDBStore::TransactionRef();
551 for (const auto &osd_state
: inc
.new_state
) {
552 if (osd_state
.second
& CEPH_OSD_UP
) {
553 // could be marked up *or* down, but we're too lazy to check which
554 last_osd_report
.erase(osd_state
.first
);
556 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
557 // could be created *or* destroyed, but we can safely drop it
558 osd_epochs
.erase(osd_state
.first
);
564 mon
->store
->apply_transaction(t
);
567 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
568 if (osdmap
.is_out(o
))
570 auto found
= down_pending_out
.find(o
);
571 if (osdmap
.is_down(o
)) {
572 // populate down -> out map
573 if (found
== down_pending_out
.end()) {
574 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
575 down_pending_out
[o
] = ceph_clock_now();
578 if (found
!= down_pending_out
.end()) {
579 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
580 down_pending_out
.erase(found
);
584 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
587 check_pg_creates_subs();
589 share_map_with_random_osd();
594 // make sure our feature bits reflect the latest map
595 update_msgr_features();
597 if (!mon
->is_leader()) {
598 // will be called by on_active() on the leader, avoid doing so twice
603 void OSDMonitor::start_mapping()
605 // initiate mapping job
607 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
609 mapping_job
->abort();
611 if (!osdmap
.get_pools().empty()) {
612 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
613 mapping_job
= mapping
.start_update(osdmap
, mapper
,
614 g_conf()->mon_osd_mapping_pgs_per_chunk
);
615 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
616 << " at " << fin
->start
<< dendl
;
617 mapping_job
->set_finish_event(fin
);
619 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
620 mapping_job
= nullptr;
624 void OSDMonitor::update_msgr_features()
627 types
.insert((int)entity_name_t::TYPE_OSD
);
628 types
.insert((int)entity_name_t::TYPE_CLIENT
);
629 types
.insert((int)entity_name_t::TYPE_MDS
);
630 types
.insert((int)entity_name_t::TYPE_MON
);
631 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
633 uint64_t features
= osdmap
.get_features(*q
, &mask
);
634 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
635 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
636 ceph::net::Policy p
= mon
->messenger
->get_policy(*q
);
637 p
.features_required
= (p
.features_required
& ~mask
) | features
;
638 mon
->messenger
->set_policy(*q
, p
);
643 void OSDMonitor::on_active()
647 if (mon
->is_leader()) {
648 mon
->clog
->debug() << "osdmap " << osdmap
;
649 if (!priority_convert
) {
650 // Only do this once at start-up
651 convert_pool_priorities();
652 priority_convert
= true;
655 list
<MonOpRequestRef
> ls
;
656 take_all_failures(ls
);
657 while (!ls
.empty()) {
658 MonOpRequestRef op
= ls
.front();
659 op
->mark_osdmon_event(__func__
);
667 void OSDMonitor::on_restart()
669 last_osd_report
.clear();
672 void OSDMonitor::on_shutdown()
674 dout(10) << __func__
<< dendl
;
676 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
678 mapping_job
->abort();
681 // discard failure info, waiters
682 list
<MonOpRequestRef
> ls
;
683 take_all_failures(ls
);
687 void OSDMonitor::update_logger()
689 dout(10) << "update_logger" << dendl
;
691 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
692 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
693 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
694 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
697 void OSDMonitor::create_pending()
699 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
700 pending_inc
.fsid
= mon
->monmap
->fsid
;
701 pending_metadata
.clear();
702 pending_metadata_rm
.clear();
704 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
706 // safety checks (this shouldn't really happen)
708 if (osdmap
.backfillfull_ratio
<= 0) {
709 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
710 if (pending_inc
.new_backfillfull_ratio
> 1.0)
711 pending_inc
.new_backfillfull_ratio
/= 100;
712 dout(1) << __func__
<< " setting backfillfull_ratio = "
713 << pending_inc
.new_backfillfull_ratio
<< dendl
;
715 if (osdmap
.full_ratio
<= 0) {
716 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
717 if (pending_inc
.new_full_ratio
> 1.0)
718 pending_inc
.new_full_ratio
/= 100;
719 dout(1) << __func__
<< " setting full_ratio = "
720 << pending_inc
.new_full_ratio
<< dendl
;
722 if (osdmap
.nearfull_ratio
<= 0) {
723 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
724 if (pending_inc
.new_nearfull_ratio
> 1.0)
725 pending_inc
.new_nearfull_ratio
/= 100;
726 dout(1) << __func__
<< " setting nearfull_ratio = "
727 << pending_inc
.new_nearfull_ratio
<< dendl
;
731 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
733 if (osdmap
.crush
->has_legacy_rule_ids()) {
734 CrushWrapper newcrush
;
735 _get_pending_crush(newcrush
);
737 // First, for all pools, work out which rule they really used
738 // by resolving ruleset to rule.
739 for (const auto &i
: osdmap
.get_pools()) {
740 const auto pool_id
= i
.first
;
741 const auto &pool
= i
.second
;
742 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
743 pool
.type
, pool
.size
);
745 dout(1) << __func__
<< " rewriting pool "
746 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
747 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
748 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
749 pending_inc
.new_pools
[pool_id
] = pool
;
751 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
754 // Now, go ahead and renumber all the rules so that their
755 // rule_id field corresponds to their position in the array
756 auto old_to_new
= newcrush
.renumber_rules();
757 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
758 for (const auto &i
: old_to_new
) {
759 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
761 pending_inc
.crush
.clear();
762 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
767 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
768 const OSDMap
& nextmap
)
770 dout(10) << __func__
<< dendl
;
771 creating_pgs_t pending_creatings
;
773 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
774 pending_creatings
= creating_pgs
;
776 // check for new or old pools
777 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
779 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
783 queued
+= scan_for_creating_pgs(inc
.new_pools
,
787 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
788 for (auto deleted_pool
: inc
.old_pools
) {
789 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
790 dout(10) << __func__
<< " " << removed
791 << " pg removed because containing pool deleted: "
792 << deleted_pool
<< dendl
;
793 last_epoch_clean
.remove_pool(deleted_pool
);
795 // pgmon updates its creating_pgs in check_osd_map() which is called by
796 // on_active() and check_osd_map() could be delayed if lease expires, so its
797 // creating_pgs could be stale in comparison with the one of osdmon. let's
798 // trim them here. otherwise, they will be added back after being erased.
799 unsigned removed
= 0;
800 for (auto& pg
: pending_created_pgs
) {
801 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
802 pending_creatings
.created_pools
.insert(pg
.pool());
803 removed
+= pending_creatings
.pgs
.erase(pg
);
805 pending_created_pgs
.clear();
806 dout(10) << __func__
<< " " << removed
807 << " pgs removed because they're created" << dendl
;
808 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
811 // filter out any pgs that shouldn't exist.
813 auto i
= pending_creatings
.pgs
.begin();
814 while (i
!= pending_creatings
.pgs
.end()) {
815 if (!nextmap
.pg_exists(i
->first
)) {
816 dout(10) << __func__
<< " removing pg " << i
->first
817 << " which should not exist" << dendl
;
818 i
= pending_creatings
.pgs
.erase(i
);
826 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
827 const auto total
= pending_creatings
.pgs
.size();
828 while (pending_creatings
.pgs
.size() < max
&&
829 !pending_creatings
.queue
.empty()) {
830 auto p
= pending_creatings
.queue
.begin();
831 int64_t poolid
= p
->first
;
832 dout(10) << __func__
<< " pool " << poolid
833 << " created " << p
->second
.created
834 << " modified " << p
->second
.modified
835 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
837 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
838 p
->second
.end
- p
->second
.start
);
839 ps_t first
= p
->second
.start
;
840 ps_t end
= first
+ n
;
841 for (ps_t ps
= first
; ps
< end
; ++ps
) {
842 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
843 // NOTE: use the *current* epoch as the PG creation epoch so that the
844 // OSD does not have to generate a long set of PastIntervals.
845 pending_creatings
.pgs
.emplace(pgid
, make_pair(inc
.epoch
,
846 p
->second
.modified
));
847 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
849 p
->second
.start
= end
;
850 if (p
->second
.done()) {
851 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
852 pending_creatings
.queue
.erase(p
);
854 dout(10) << __func__
<< " pool " << poolid
855 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
859 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
860 << " pools" << dendl
;
862 << " " << (pending_creatings
.pgs
.size() - total
)
863 << "/" << pending_creatings
.pgs
.size()
864 << " pgs added from queued pools" << dendl
;
865 return pending_creatings
;
868 void OSDMonitor::maybe_prime_pg_temp()
871 if (pending_inc
.crush
.length()) {
872 dout(10) << __func__
<< " new crush map, all" << dendl
;
876 if (!pending_inc
.new_up_client
.empty()) {
877 dout(10) << __func__
<< " new up osds, all" << dendl
;
881 // check for interesting OSDs
883 for (auto p
= pending_inc
.new_state
.begin();
884 !all
&& p
!= pending_inc
.new_state
.end();
886 if ((p
->second
& CEPH_OSD_UP
) &&
887 osdmap
.is_up(p
->first
)) {
888 osds
.insert(p
->first
);
891 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
892 !all
&& p
!= pending_inc
.new_weight
.end();
894 if (p
->second
< osdmap
.get_weight(p
->first
)) {
896 osds
.insert(p
->first
);
898 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
904 if (!all
&& osds
.empty())
909 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
910 if (estimate
> mapping
.get_num_pgs() *
911 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
912 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
913 << osds
.size() << " osds >= "
914 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
915 << mapping
.get_num_pgs() << " pgs, all"
919 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
920 << osds
.size() << " osds" << dendl
;
925 next
.deepish_copy_from(osdmap
);
926 next
.apply_incremental(pending_inc
);
928 if (next
.get_pools().empty()) {
929 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
931 PrimeTempJob
job(next
, this);
932 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
);
933 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
934 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
936 dout(10) << __func__
<< " did not finish in "
937 << g_conf()->mon_osd_prime_pg_temp_max_time
938 << ", stopping" << dendl
;
942 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
943 utime_t stop
= ceph_clock_now();
944 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
945 const int chunk
= 1000;
947 std::unordered_set
<pg_t
> did_pgs
;
948 for (auto osd
: osds
) {
949 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
950 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
951 for (auto pgid
: pgs
) {
952 if (!did_pgs
.insert(pgid
).second
) {
955 prime_pg_temp(next
, pgid
);
958 if (ceph_clock_now() > stop
) {
959 dout(10) << __func__
<< " consumed more than "
960 << g_conf()->mon_osd_prime_pg_temp_max_time
961 << " seconds, stopping"
971 void OSDMonitor::prime_pg_temp(
975 // TODO: remove this creating_pgs direct access?
976 if (creating_pgs
.pgs
.count(pgid
)) {
979 if (!osdmap
.pg_exists(pgid
)) {
983 vector
<int> up
, acting
;
984 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
986 vector
<int> next_up
, next_acting
;
987 int next_up_primary
, next_acting_primary
;
988 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
989 &next_acting
, &next_acting_primary
);
990 if (acting
== next_acting
&&
991 !(up
!= acting
&& next_up
== next_acting
))
992 return; // no change since last epoch
995 return; // if previously empty now we can be no worse off
996 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
997 if (pool
&& acting
.size() < pool
->min_size
)
998 return; // can be no worse off than before
1000 if (next_up
== next_acting
) {
1002 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1006 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1007 << " -> " << next_up
<< "/" << next_acting
1008 << ", priming " << acting
1011 std::lock_guard
l(prime_pg_temp_lock
);
1012 // do not touch a mapping if a change is pending
1013 pending_inc
.new_pg_temp
.emplace(
1015 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1020 * @note receiving a transaction in this function gives a fair amount of
1021 * freedom to the service implementation if it does need it. It shouldn't.
1023 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1025 dout(10) << "encode_pending e " << pending_inc
.epoch
1029 dout(1) << __func__
<< " osdmap full prune encoded e"
1030 << pending_inc
.epoch
<< dendl
;
1033 // finalize up pending_inc
1034 pending_inc
.modified
= ceph_clock_now();
1036 int r
= pending_inc
.propagate_snaps_to_tiers(cct
, osdmap
);
1037 ceph_assert(r
== 0);
1040 if (!mapping_job
->is_done()) {
1041 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1042 << mapping_job
.get() << " did not complete, "
1043 << mapping_job
->shards
<< " left" << dendl
;
1044 mapping_job
->abort();
1045 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1046 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1047 << mapping_job
.get() << " is prior epoch "
1048 << mapping
.get_epoch() << dendl
;
1050 if (g_conf()->mon_osd_prime_pg_temp
) {
1051 maybe_prime_pg_temp();
1054 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1055 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1058 mapping_job
.reset();
1060 // ensure we don't have blank new_state updates. these are interrpeted as
1061 // CEPH_OSD_UP (and almost certainly not what we want!).
1062 auto p
= pending_inc
.new_state
.begin();
1063 while (p
!= pending_inc
.new_state
.end()) {
1064 if (p
->second
== 0) {
1065 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1066 p
= pending_inc
.new_state
.erase(p
);
1068 if (p
->second
& CEPH_OSD_UP
) {
1069 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1074 if (!pending_inc
.new_up_client
.empty()) {
1075 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1077 for (auto& i
: pending_inc
.new_weight
) {
1078 if (i
.first
> osdmap
.max_osd
) {
1080 // new osd is already marked in
1081 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1083 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1084 // existing osd marked in or out
1085 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1091 tmp
.deepish_copy_from(osdmap
);
1092 tmp
.apply_incremental(pending_inc
);
1094 // clean pg_temp mappings
1095 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1097 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1098 osdmap
.maybe_remove_pg_upmaps(cct
, osdmap
, tmp
, &pending_inc
);
1100 // update creating pgs first so that we can remove the created pgid and
1101 // process the pool flag removal below in the same osdmap epoch.
1102 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1103 bufferlist creatings_bl
;
1104 encode(pending_creatings
, creatings_bl
);
1105 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1107 // remove any old (or incompat) POOL_CREATING flags
1108 for (auto& i
: tmp
.get_pools()) {
1109 if (tmp
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
1110 // pre-nautilus OSDMaps shouldn't get this flag.
1111 if (pending_inc
.new_pools
.count(i
.first
)) {
1112 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1115 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1116 !pending_creatings
.still_creating_pool(i
.first
)) {
1117 dout(10) << __func__
<< " done creating pool " << i
.first
1118 << ", clearing CREATING flag" << dendl
;
1119 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1120 pending_inc
.new_pools
[i
.first
] = i
.second
;
1122 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1126 // remove any legacy osdmap nearfull/full flags
1128 if (tmp
.test_flag(CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
1129 dout(10) << __func__
<< " clearing legacy osdmap nearfull/full flag"
1131 remove_flag(CEPH_OSDMAP_NEARFULL
);
1132 remove_flag(CEPH_OSDMAP_FULL
);
1135 // collect which pools are currently affected by
1136 // the near/backfill/full osd(s),
1137 // and set per-pool near/backfill/full flag instead
1138 set
<int64_t> full_pool_ids
;
1139 set
<int64_t> backfillfull_pool_ids
;
1140 set
<int64_t> nearfull_pool_ids
;
1141 tmp
.get_full_pools(cct
,
1143 &backfillfull_pool_ids
,
1144 &nearfull_pool_ids
);
1145 if (full_pool_ids
.empty() ||
1146 backfillfull_pool_ids
.empty() ||
1147 nearfull_pool_ids
.empty()) {
1148 // normal case - no nearfull, backfillfull or full osds
1149 // try cancel any improper nearfull/backfillfull/full pool
1151 for (auto &pool
: tmp
.get_pools()) {
1152 auto p
= pool
.first
;
1153 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1154 nearfull_pool_ids
.empty()) {
1155 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1156 << "'s nearfull flag" << dendl
;
1157 if (pending_inc
.new_pools
.count(p
) == 0) {
1158 // load original pool info first!
1159 pending_inc
.new_pools
[p
] = pool
.second
;
1161 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1163 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1164 backfillfull_pool_ids
.empty()) {
1165 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1166 << "'s backfillfull flag" << dendl
;
1167 if (pending_inc
.new_pools
.count(p
) == 0) {
1168 pending_inc
.new_pools
[p
] = pool
.second
;
1170 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1172 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1173 full_pool_ids
.empty()) {
1174 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1175 // set by EQUOTA, skipping
1178 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1179 << "'s full flag" << dendl
;
1180 if (pending_inc
.new_pools
.count(p
) == 0) {
1181 pending_inc
.new_pools
[p
] = pool
.second
;
1183 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1187 if (!full_pool_ids
.empty()) {
1188 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1189 << " as full" << dendl
;
1190 for (auto &p
: full_pool_ids
) {
1191 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1194 if (pending_inc
.new_pools
.count(p
) == 0) {
1195 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1197 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1198 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1199 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1201 // cancel FLAG_FULL for pools which are no longer full too
1202 for (auto &pool
: tmp
.get_pools()) {
1203 auto p
= pool
.first
;
1204 if (full_pool_ids
.count(p
)) {
1205 // skip pools we have just marked as full above
1208 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1209 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1210 // don't touch if currently is not full
1211 // or is running out of quota (and hence considered as full)
1214 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1215 << "'s full flag" << dendl
;
1216 if (pending_inc
.new_pools
.count(p
) == 0) {
1217 pending_inc
.new_pools
[p
] = pool
.second
;
1219 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1222 if (!backfillfull_pool_ids
.empty()) {
1223 for (auto &p
: backfillfull_pool_ids
) {
1224 if (full_pool_ids
.count(p
)) {
1225 // skip pools we have already considered as full above
1228 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1229 // make sure FLAG_FULL is truly set, so we are safe not
1230 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1231 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1234 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1235 // don't bother if pool is already marked as backfillfull
1238 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1239 << "'s as backfillfull" << dendl
;
1240 if (pending_inc
.new_pools
.count(p
) == 0) {
1241 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1243 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1244 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1246 // cancel FLAG_BACKFILLFULL for pools
1247 // which are no longer backfillfull too
1248 for (auto &pool
: tmp
.get_pools()) {
1249 auto p
= pool
.first
;
1250 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1251 // skip pools we have just marked as backfillfull/full above
1254 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1255 // and don't touch if currently is not backfillfull
1258 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1259 << "'s backfillfull flag" << dendl
;
1260 if (pending_inc
.new_pools
.count(p
) == 0) {
1261 pending_inc
.new_pools
[p
] = pool
.second
;
1263 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1266 if (!nearfull_pool_ids
.empty()) {
1267 for (auto &p
: nearfull_pool_ids
) {
1268 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1271 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1272 // make sure FLAG_FULL is truly set, so we are safe not
1273 // to set a extra (redundant) FLAG_NEARFULL flag
1274 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1277 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1278 // don't bother if pool is already marked as nearfull
1281 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1282 << "'s as nearfull" << dendl
;
1283 if (pending_inc
.new_pools
.count(p
) == 0) {
1284 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1286 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1288 // cancel FLAG_NEARFULL for pools
1289 // which are no longer nearfull too
1290 for (auto &pool
: tmp
.get_pools()) {
1291 auto p
= pool
.first
;
1292 if (full_pool_ids
.count(p
) ||
1293 backfillfull_pool_ids
.count(p
) ||
1294 nearfull_pool_ids
.count(p
)) {
1295 // skip pools we have just marked as
1296 // nearfull/backfillfull/full above
1299 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1300 // and don't touch if currently is not nearfull
1303 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1304 << "'s nearfull flag" << dendl
;
1305 if (pending_inc
.new_pools
.count(p
) == 0) {
1306 pending_inc
.new_pools
[p
] = pool
.second
;
1308 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1312 // min_compat_client?
1313 if (tmp
.require_min_compat_client
== 0) {
1314 auto mv
= tmp
.get_min_compat_client();
1315 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1316 << "required " << ceph_release_name(mv
) << dendl
;
1317 mon
->clog
->info() << "setting require_min_compat_client to currently "
1318 << "required " << ceph_release_name(mv
);
1319 pending_inc
.new_require_min_compat_client
= mv
;
1322 // upgrade to mimic?
1323 if (osdmap
.require_osd_release
< CEPH_RELEASE_MIMIC
&&
1324 tmp
.require_osd_release
>= CEPH_RELEASE_MIMIC
) {
1325 dout(10) << __func__
<< " first mimic+ epoch" << dendl
;
1326 // record this epoch as the deletion for all legacy removed_snaps
1327 for (auto& p
: tmp
.get_pools()) {
1328 // update every pool
1329 if (pending_inc
.new_pools
.count(p
.first
) == 0) {
1330 pending_inc
.new_pools
[p
.first
] = p
.second
;
1332 auto& pi
= pending_inc
.new_pools
[p
.first
];
1333 if (pi
.snap_seq
== 0) {
1334 // no snaps on this pool
1337 if ((pi
.flags
& (pg_pool_t::FLAG_SELFMANAGED_SNAPS
|
1338 pg_pool_t::FLAG_POOL_SNAPS
)) == 0) {
1339 if (!pi
.removed_snaps
.empty()) {
1340 pi
.flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
1342 pi
.flags
|= pg_pool_t::FLAG_POOL_SNAPS
;
1346 // Make all previously removed snaps appear to be removed in this
1347 // epoch. this populates removed_snaps_queue. The OSD will subtract
1348 // off its purged_snaps, as before, and this set will shrink over the
1349 // following epochs as the purged snaps are reported back through the
1351 OSDMap::snap_interval_set_t removed
;
1352 if (!p
.second
.removed_snaps
.empty()) {
1353 // different flavor of interval_set :(
1354 for (auto q
= p
.second
.removed_snaps
.begin();
1355 q
!= p
.second
.removed_snaps
.end();
1357 removed
.insert(q
.get_start(), q
.get_len());
1360 for (snapid_t s
= 1; s
<= pi
.get_snap_seq(); s
= s
+ 1) {
1361 if (pi
.snaps
.count(s
) == 0) {
1366 pending_inc
.new_removed_snaps
[p
.first
].union_of(removed
);
1368 dout(10) << __func__
<< " converting pool " << p
.first
1369 << " with " << p
.second
.removed_snaps
.size()
1370 << " legacy removed_snaps" << dendl
;
1371 string k
= make_snap_epoch_key(p
.first
, pending_inc
.epoch
);
1373 encode(p
.second
.removed_snaps
, v
);
1374 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1375 for (auto q
= p
.second
.removed_snaps
.begin();
1376 q
!= p
.second
.removed_snaps
.end();
1379 string k
= make_snap_key_value(p
.first
, q
.get_start(),
1380 q
.get_len(), pending_inc
.epoch
, &v
);
1381 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1385 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
&&
1386 tmp
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
1387 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1388 // add creating flags?
1389 for (auto& i
: tmp
.get_pools()) {
1390 if (pending_creatings
.still_creating_pool(i
.first
)) {
1391 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1393 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1394 pending_inc
.new_pools
[i
.first
] = i
.second
;
1396 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1399 // adjust blacklist items to all be TYPE_ANY
1400 for (auto& i
: tmp
.blacklist
) {
1402 a
.set_type(entity_addr_t::TYPE_ANY
);
1403 pending_inc
.new_blacklist
[a
] = i
.second
;
1404 pending_inc
.old_blacklist
.push_back(i
.first
);
1410 for (auto i
= pending_inc
.new_state
.begin();
1411 i
!= pending_inc
.new_state
.end();
1413 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1414 if (s
& CEPH_OSD_UP
)
1415 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1416 if (s
& CEPH_OSD_EXISTS
)
1417 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1419 for (auto i
= pending_inc
.new_up_client
.begin();
1420 i
!= pending_inc
.new_up_client
.end();
1422 //FIXME: insert cluster addresses too
1423 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1425 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1426 i
!= pending_inc
.new_weight
.end();
1428 if (i
->second
== CEPH_OSD_OUT
) {
1429 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1430 } else if (i
->second
== CEPH_OSD_IN
) {
1431 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1433 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1437 // features for osdmap and its incremental
1440 // encode full map and determine its crc
1443 tmp
.deepish_copy_from(osdmap
);
1444 tmp
.apply_incremental(pending_inc
);
1446 // determine appropriate features
1447 features
= tmp
.get_encoding_features();
1448 dout(10) << __func__
<< " encoding full map with "
1449 << ceph_release_name(tmp
.require_osd_release
)
1450 << " features " << features
<< dendl
;
1452 // the features should be a subset of the mon quorum's features!
1453 ceph_assert((features
& ~mon
->get_quorum_con_features()) == 0);
1456 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1457 pending_inc
.full_crc
= tmp
.get_crc();
1459 // include full map in the txn. note that old monitors will
1460 // overwrite this. new ones will now skip the local full map
1461 // encode and reload from this.
1462 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1466 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
1468 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1470 dout(20) << " full_crc " << tmp
.get_crc()
1471 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1473 /* put everything in the transaction */
1474 put_version(t
, pending_inc
.epoch
, bl
);
1475 put_last_committed(t
, pending_inc
.epoch
);
1478 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1479 p
!= pending_metadata
.end();
1481 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1482 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1483 p
!= pending_metadata_rm
.end();
1485 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1486 pending_metadata
.clear();
1487 pending_metadata_rm
.clear();
1490 if (tmp
.require_osd_release
>= CEPH_RELEASE_MIMIC
) {
1491 for (auto& i
: pending_inc
.new_removed_snaps
) {
1493 // all snaps removed this epoch
1494 string k
= make_snap_epoch_key(i
.first
, pending_inc
.epoch
);
1496 encode(i
.second
, v
);
1497 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1499 for (auto q
= i
.second
.begin();
1500 q
!= i
.second
.end();
1503 string k
= make_snap_key_value(i
.first
, q
.get_start(),
1504 q
.get_len(), pending_inc
.epoch
, &v
);
1505 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1508 for (auto& i
: pending_inc
.new_purged_snaps
) {
1509 for (auto q
= i
.second
.begin();
1510 q
!= i
.second
.end();
1513 string k
= make_snap_purged_key_value(i
.first
, q
.get_start(),
1514 q
.get_len(), pending_inc
.epoch
,
1516 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1522 health_check_map_t next
;
1523 tmp
.check_health(&next
);
1524 encode_health(next
, t
);
1527 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
1530 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
1534 auto p
= bl
.cbegin();
1537 catch (buffer::error
& e
) {
1539 *err
<< "osd." << osd
<< " metadata is corrupt";
1545 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
1547 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
1548 if (osdmap
.is_up(osd
)) {
1549 map
<string
,string
> meta
;
1550 load_metadata(osd
, meta
, nullptr);
1551 auto p
= meta
.find(field
);
1552 if (p
== meta
.end()) {
1553 (*out
)["unknown"]++;
1555 (*out
)[p
->second
]++;
1561 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
1563 map
<string
,int> by_val
;
1564 count_metadata(field
, &by_val
);
1565 f
->open_object_section(field
.c_str());
1566 for (auto& p
: by_val
) {
1567 f
->dump_int(p
.first
.c_str(), p
.second
);
1572 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
1574 map
<string
, string
> metadata
;
1575 int r
= load_metadata(osd
, metadata
, nullptr);
1579 auto it
= metadata
.find("osd_objectstore");
1580 if (it
== metadata
.end())
1586 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
1587 const pg_pool_t
&pool
,
1590 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1591 // since filestore osds could always join the pool later
1592 set
<int> checked_osds
;
1593 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
1594 vector
<int> up
, acting
;
1595 pg_t
pgid(ps
, pool_id
);
1596 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
1597 for (int osd
: up
) {
1598 if (checked_osds
.find(osd
) != checked_osds
.end())
1600 string objectstore_type
;
1601 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
1602 // allow with missing metadata, e.g. due to an osd never booting yet
1603 if (r
< 0 || objectstore_type
== "bluestore") {
1604 checked_osds
.insert(osd
);
1607 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
1614 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
1616 map
<string
,string
> m
;
1617 if (int r
= load_metadata(osd
, m
, err
))
1619 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
1620 f
->dump_string(p
->first
.c_str(), p
->second
);
1624 void OSDMonitor::print_nodes(Formatter
*f
)
1626 // group OSDs by their hosts
1627 map
<string
, list
<int> > osds
; // hostname => osd
1628 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
1629 map
<string
, string
> m
;
1630 if (load_metadata(osd
, m
, NULL
)) {
1633 map
<string
, string
>::iterator hostname
= m
.find("hostname");
1634 if (hostname
== m
.end()) {
1635 // not likely though
1638 osds
[hostname
->second
].push_back(osd
);
1641 dump_services(f
, osds
, "osd");
1644 void OSDMonitor::share_map_with_random_osd()
1646 if (osdmap
.get_num_up_osds() == 0) {
1647 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
1651 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
1653 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
1657 dout(10) << "committed, telling random " << s
->name
1658 << " all about it" << dendl
;
1660 // get feature of the peer
1661 // use quorum_con_features, if it's an anonymous connection.
1662 uint64_t features
= s
->con_features
? s
->con_features
:
1663 mon
->get_quorum_con_features();
1664 // whatev, they'll request more if they need it
1665 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
1666 s
->con
->send_message(m
);
1667 // NOTE: do *not* record osd has up to this epoch (as we do
1668 // elsewhere) as they may still need to request older values.
1671 version_t
OSDMonitor::get_trim_to() const
1673 if (mon
->get_quorum().empty()) {
1674 dout(10) << __func__
<< ": quorum not formed" << dendl
;
1679 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1680 if (!creating_pgs
.pgs
.empty()) {
1685 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
1687 << " blocking osdmap trim"
1688 " ('mon_debug_block_osdmap_trim' set to 'true')"
1694 epoch_t floor
= get_min_last_epoch_clean();
1695 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
1696 if (g_conf()->mon_osd_force_trim_to
> 0 &&
1697 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
1698 floor
= g_conf()->mon_osd_force_trim_to
;
1699 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
1701 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
1702 if (floor
+ min
> get_last_committed()) {
1703 if (min
< get_last_committed())
1704 floor
= get_last_committed() - min
;
1708 if (floor
> get_first_committed())
1714 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
1716 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
1717 // also scan osd epochs
1718 // don't trim past the oldest reported osd epoch
1719 for (auto& osd_epoch
: osd_epochs
) {
1720 if (osd_epoch
.second
< floor
) {
1721 floor
= osd_epoch
.second
;
1727 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
1730 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
1732 get_version_full(first
, bl
);
1733 put_version_full(tx
, first
, bl
);
1735 if (has_osdmap_manifest
&&
1736 first
> osdmap_manifest
.get_first_pinned()) {
1737 _prune_update_trimmed(tx
, first
);
1742 /* full osdmap prune
1744 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
1747 void OSDMonitor::load_osdmap_manifest()
1749 bool store_has_manifest
=
1750 mon
->store
->exists(get_service_name(), "osdmap_manifest");
1752 if (!store_has_manifest
) {
1753 if (!has_osdmap_manifest
) {
1757 dout(20) << __func__
1758 << " dropping osdmap manifest from memory." << dendl
;
1759 osdmap_manifest
= osdmap_manifest_t();
1760 has_osdmap_manifest
= false;
1764 dout(20) << __func__
1765 << " osdmap manifest detected in store; reload." << dendl
;
1767 bufferlist manifest_bl
;
1768 int r
= get_value("osdmap_manifest", manifest_bl
);
1770 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
1771 ceph_abort_msg("error reading manifest");
1773 osdmap_manifest
.decode(manifest_bl
);
1774 has_osdmap_manifest
= true;
1776 dout(10) << __func__
<< " store osdmap manifest pinned ("
1777 << osdmap_manifest
.get_first_pinned()
1779 << osdmap_manifest
.get_last_pinned()
1784 bool OSDMonitor::should_prune() const
1786 version_t first
= get_first_committed();
1787 version_t last
= get_last_committed();
1788 version_t min_osdmap_epochs
=
1789 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
1790 version_t prune_min
=
1791 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
1792 version_t prune_interval
=
1793 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
1794 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
1795 version_t last_to_pin
= last
- min_osdmap_epochs
;
1797 // Make it or break it constraints.
1799 // If any of these conditions fails, we will not prune, regardless of
1800 // whether we have an on-disk manifest with an on-going pruning state.
1802 if ((last
- first
) <= min_osdmap_epochs
) {
1803 // between the first and last committed epochs, we don't have
1804 // enough epochs to trim, much less to prune.
1805 dout(10) << __func__
1806 << " currently holding only " << (last
- first
)
1807 << " epochs (min osdmap epochs: " << min_osdmap_epochs
1808 << "); do not prune."
1812 } else if ((last_to_pin
- first
) < prune_min
) {
1813 // between the first committed epoch and the last epoch we would prune,
1814 // we simply don't have enough versions over the minimum to prune maps.
1815 dout(10) << __func__
1816 << " could only prune " << (last_to_pin
- first
)
1817 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
1818 " is less than the required minimum (" << prune_min
<< ")"
1822 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
1823 dout(10) << __func__
1824 << " we have pruned as far as we can; do not prune."
1828 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
1829 dout(10) << __func__
1830 << " not enough epochs to form an interval (last pinned: "
1831 << last_pinned
<< ", last to pin: "
1832 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
1837 dout(15) << __func__
1838 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
1839 << " lc (" << first
<< ".." << last
<< ")"
1844 void OSDMonitor::_prune_update_trimmed(
1845 MonitorDBStore::TransactionRef tx
,
1848 dout(10) << __func__
1849 << " first " << first
1850 << " last_pinned " << osdmap_manifest
.get_last_pinned()
1851 << " last_pinned " << osdmap_manifest
.get_last_pinned()
1854 osdmap_manifest_t manifest
= osdmap_manifest
;
1856 if (!manifest
.is_pinned(first
)) {
1857 manifest
.pin(first
);
1860 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
1861 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
1862 manifest
.pinned
.erase(p
, p_end
);
1863 ceph_assert(manifest
.get_first_pinned() == first
);
1865 if (manifest
.get_last_pinned() == first
+1 ||
1866 manifest
.pinned
.size() == 1) {
1867 // we reached the end of the line, as pinned maps go; clean up our
1868 // manifest, and let `should_prune()` decide whether we should prune
1870 tx
->erase(get_service_name(), "osdmap_manifest");
1875 manifest
.encode(bl
);
1876 tx
->put(get_service_name(), "osdmap_manifest", bl
);
1879 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
1881 dout(1) << __func__
<< dendl
;
1883 version_t pin_first
;
1885 // verify constrainsts on stable in-memory state
1886 if (!has_osdmap_manifest
) {
1887 // we must have never pruned, OR if we pruned the state must no longer
1888 // be relevant (i.e., the state must have been removed alongside with
1889 // the trim that *must* have removed past the last pinned map in a
1891 ceph_assert(osdmap_manifest
.pinned
.empty());
1892 ceph_assert(!mon
->store
->exists(get_service_name(), "osdmap_manifest"));
1893 pin_first
= get_first_committed();
1896 // we must have pruned in the past AND its state is still relevant
1897 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
1898 // and thus we still hold a manifest in the store).
1899 ceph_assert(!osdmap_manifest
.pinned
.empty());
1900 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
1901 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
1903 dout(10) << __func__
1904 << " first_pinned " << osdmap_manifest
.get_first_pinned()
1905 << " last_pinned " << osdmap_manifest
.get_last_pinned()
1908 pin_first
= osdmap_manifest
.get_last_pinned();
1911 manifest
.pin(pin_first
);
1914 bool OSDMonitor::_prune_sanitize_options() const
1916 uint64_t prune_interval
=
1917 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
1918 uint64_t prune_min
=
1919 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
1921 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
1925 if (prune_interval
== 0) {
1927 << " prune is enabled BUT prune interval is zero; abort."
1930 } else if (prune_interval
== 1) {
1932 << " prune interval is equal to one, which essentially means"
1933 " no pruning; abort."
1937 if (prune_min
== 0) {
1939 << " prune is enabled BUT prune min is zero; abort."
1943 if (prune_interval
> prune_min
) {
1945 << " impossible to ascertain proper prune interval because"
1946 << " it is greater than the minimum prune epochs"
1947 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
1952 if (txsize
< prune_interval
- 1) {
1954 << "'mon_osdmap_full_prune_txsize' (" << txsize
1955 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
1956 << "); abort." << dendl
;
1962 bool OSDMonitor::is_prune_enabled() const {
1963 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
1966 bool OSDMonitor::is_prune_supported() const {
1967 return mon
->get_required_mon_features().contains_any(
1968 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
1973 * @returns true if has side-effects; false otherwise.
1975 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
1977 bool enabled
= is_prune_enabled();
1979 dout(1) << __func__
<< " osdmap full prune "
1980 << ( enabled
? "enabled" : "disabled")
1983 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
1987 // we are beyond the minimum prune versions, we need to remove maps because
1988 // otherwise the store will grow unbounded and we may end up having issues
1989 // with available disk space or store hangs.
1991 // we will not pin all versions. We will leave a buffer number of versions.
1992 // this allows us the monitor to trim maps without caring too much about
1993 // pinned maps, and then allow us to use another ceph-mon without these
1994 // capabilities, without having to repair the store.
1996 osdmap_manifest_t manifest
= osdmap_manifest
;
1998 version_t first
= get_first_committed();
1999 version_t last
= get_last_committed();
2001 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2002 version_t last_pinned
= manifest
.get_last_pinned();
2003 uint64_t prune_interval
=
2004 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2006 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2008 prune_init(manifest
);
2010 // we need to get rid of some osdmaps
2013 << " lc (" << first
<< " .. " << last
<< ")"
2014 << " last_pinned " << last_pinned
2015 << " interval " << prune_interval
2016 << " last_to_pin " << last_to_pin
2019 // We will be erasing maps as we go.
2021 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2023 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2024 // we stop pruning. We could prune the maps between `next_to_pin` and
2025 // `last_to_pin`, but by not doing it we end up with neater pruned
2026 // intervals, aligned with `prune_interval`. Besides, this should not be a
2027 // problem as long as `prune_interval` is set to a sane value, instead of
2028 // hundreds or thousands of maps.
2030 auto map_exists
= [this](version_t v
) {
2031 string k
= mon
->store
->combine_strings("full", v
);
2032 return mon
->store
->exists(get_service_name(), k
);
2035 // 'interval' represents the number of maps from the last pinned
2036 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2037 // version 11 next; all intermediate versions will be removed.
2039 // 'txsize' represents the maximum number of versions we'll be removing in
2040 // this iteration. If 'txsize' is large enough to perform multiple passes
2041 // pinning and removing maps, we will do so; if not, we'll do at least one
2042 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2043 // ensure that we never go *over* the maximum.
2045 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2046 uint64_t removal_interval
= prune_interval
- 1;
2048 if (txsize
< removal_interval
) {
2050 << " setting txsize to removal interval size ("
2051 << removal_interval
<< " versions"
2053 txsize
= removal_interval
;
2055 ceph_assert(removal_interval
> 0);
2057 uint64_t num_pruned
= 0;
2058 while (num_pruned
+ removal_interval
<= txsize
) {
2059 last_pinned
= manifest
.get_last_pinned();
2061 if (last_pinned
+ prune_interval
> last_to_pin
) {
2064 ceph_assert(last_pinned
< last_to_pin
);
2066 version_t next_pinned
= last_pinned
+ prune_interval
;
2067 ceph_assert(next_pinned
<= last_to_pin
);
2068 manifest
.pin(next_pinned
);
2070 dout(20) << __func__
2071 << " last_pinned " << last_pinned
2072 << " next_pinned " << next_pinned
2073 << " num_pruned " << num_pruned
2074 << " removal interval (" << (last_pinned
+1)
2075 << ".." << (next_pinned
-1) << ")"
2076 << " txsize " << txsize
<< dendl
;
2078 ceph_assert(map_exists(last_pinned
));
2079 ceph_assert(map_exists(next_pinned
));
2081 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2082 ceph_assert(!manifest
.is_pinned(v
));
2084 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2085 string full_key
= mon
->store
->combine_strings("full", v
);
2086 tx
->erase(get_service_name(), full_key
);
2091 ceph_assert(num_pruned
> 0);
2094 manifest
.encode(bl
);
2095 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2103 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2105 op
->mark_osdmon_event(__func__
);
2106 Message
*m
= op
->get_req();
2107 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2109 switch (m
->get_type()) {
2111 case MSG_MON_COMMAND
:
2113 return preprocess_command(op
);
2114 } catch (const bad_cmd_get
& e
) {
2116 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2119 case CEPH_MSG_MON_GET_OSDMAP
:
2120 return preprocess_get_osdmap(op
);
2123 case MSG_OSD_MARK_ME_DOWN
:
2124 return preprocess_mark_me_down(op
);
2126 return preprocess_full(op
);
2127 case MSG_OSD_FAILURE
:
2128 return preprocess_failure(op
);
2130 return preprocess_boot(op
);
2132 return preprocess_alive(op
);
2133 case MSG_OSD_PG_CREATED
:
2134 return preprocess_pg_created(op
);
2135 case MSG_OSD_PG_READY_TO_MERGE
:
2136 return preprocess_pg_ready_to_merge(op
);
2137 case MSG_OSD_PGTEMP
:
2138 return preprocess_pgtemp(op
);
2139 case MSG_OSD_BEACON
:
2140 return preprocess_beacon(op
);
2142 case CEPH_MSG_POOLOP
:
2143 return preprocess_pool_op(op
);
2145 case MSG_REMOVE_SNAPS
:
2146 return preprocess_remove_snaps(op
);
2154 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2156 op
->mark_osdmon_event(__func__
);
2157 Message
*m
= op
->get_req();
2158 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2160 switch (m
->get_type()) {
2162 case MSG_OSD_MARK_ME_DOWN
:
2163 return prepare_mark_me_down(op
);
2165 return prepare_full(op
);
2166 case MSG_OSD_FAILURE
:
2167 return prepare_failure(op
);
2169 return prepare_boot(op
);
2171 return prepare_alive(op
);
2172 case MSG_OSD_PG_CREATED
:
2173 return prepare_pg_created(op
);
2174 case MSG_OSD_PGTEMP
:
2175 return prepare_pgtemp(op
);
2176 case MSG_OSD_PG_READY_TO_MERGE
:
2177 return prepare_pg_ready_to_merge(op
);
2178 case MSG_OSD_BEACON
:
2179 return prepare_beacon(op
);
2181 case MSG_MON_COMMAND
:
2183 return prepare_command(op
);
2184 } catch (const bad_cmd_get
& e
) {
2186 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2190 case CEPH_MSG_POOLOP
:
2191 return prepare_pool_op(op
);
2193 case MSG_REMOVE_SNAPS
:
2194 return prepare_remove_snaps(op
);
2204 bool OSDMonitor::should_propose(double& delay
)
2206 dout(10) << "should_propose" << dendl
;
2208 // if full map, propose immediately! any subsequent changes will be clobbered.
2209 if (pending_inc
.fullmap
.length())
2212 // adjust osd weights?
2213 if (!osd_weight
.empty() &&
2214 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2215 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2216 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2222 return PaxosService::should_propose(delay
);
2227 // ---------------------------
2230 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2232 op
->mark_osdmon_event(__func__
);
2233 MMonGetOSDMap
*m
= static_cast<MMonGetOSDMap
*>(op
->get_req());
2235 uint64_t features
= mon
->get_quorum_con_features();
2236 if (op
->get_session() && op
->get_session()->con_features
)
2237 features
= op
->get_session()->con_features
;
2239 dout(10) << __func__
<< " " << *m
<< dendl
;
2240 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
, features
);
2241 epoch_t first
= get_first_committed();
2242 epoch_t last
= osdmap
.get_epoch();
2243 int max
= g_conf()->osd_map_message_max
;
2244 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2245 for (epoch_t e
= std::max(first
, m
->get_full_first());
2246 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2248 bufferlist
& bl
= reply
->maps
[e
];
2249 int r
= get_version_full(e
, features
, bl
);
2250 ceph_assert(r
>= 0);
2251 max_bytes
-= bl
.length();
2253 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2254 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2256 bufferlist
& bl
= reply
->incremental_maps
[e
];
2257 int r
= get_version(e
, features
, bl
);
2258 ceph_assert(r
>= 0);
2259 max_bytes
-= bl
.length();
2261 reply
->oldest_map
= first
;
2262 reply
->newest_map
= last
;
2263 mon
->send_reply(op
, reply
);
2268 // ---------------------------
2273 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2274 // check permissions
2275 MonSession
*session
= op
->get_session();
2278 if (!session
->is_capable("osd", MON_CAP_X
)) {
2279 dout(0) << "got MOSDFailure from entity with insufficient caps "
2280 << session
->caps
<< dendl
;
2283 if (fsid
!= mon
->monmap
->fsid
) {
2284 dout(0) << "check_source: on fsid " << fsid
2285 << " != " << mon
->monmap
->fsid
<< dendl
;
2292 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2294 op
->mark_osdmon_event(__func__
);
2295 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
2296 // who is target_osd
2297 int badboy
= m
->get_target_osd();
2299 // check permissions
2300 if (check_source(op
, m
->fsid
))
2303 // first, verify the reporting host is valid
2304 if (m
->get_orig_source().is_osd()) {
2305 int from
= m
->get_orig_source().num();
2306 if (!osdmap
.exists(from
) ||
2307 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2308 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2309 dout(5) << "preprocess_failure from dead osd." << from
2310 << ", ignoring" << dendl
;
2311 send_incremental(op
, m
->get_epoch()+1);
2318 if (osdmap
.is_down(badboy
)) {
2319 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2320 << " " << m
->get_target_addrs()
2321 << ", from " << m
->get_orig_source() << dendl
;
2322 if (m
->get_epoch() < osdmap
.get_epoch())
2323 send_incremental(op
, m
->get_epoch()+1);
2326 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2327 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2328 << " " << m
->get_target_addrs()
2329 << " != map's " << osdmap
.get_addrs(badboy
)
2330 << ", from " << m
->get_orig_source() << dendl
;
2331 if (m
->get_epoch() < osdmap
.get_epoch())
2332 send_incremental(op
, m
->get_epoch()+1);
2336 // already reported?
2337 if (osdmap
.is_down(badboy
) ||
2338 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2339 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2340 << " " << m
->get_target_addrs()
2341 << ", from " << m
->get_orig_source() << dendl
;
2342 if (m
->get_epoch() < osdmap
.get_epoch())
2343 send_incremental(op
, m
->get_epoch()+1);
2347 if (!can_mark_down(badboy
)) {
2348 dout(5) << "preprocess_failure ignoring report of osd."
2349 << m
->get_target_osd() << " " << m
->get_target_addrs()
2350 << " from " << m
->get_orig_source() << dendl
;
2354 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2355 << " " << m
->get_target_addrs()
2356 << ", from " << m
->get_orig_source() << dendl
;
2364 class C_AckMarkedDown
: public C_MonOp
{
2370 : C_MonOp(op
), osdmon(osdmon
) {}
2372 void _finish(int) override
{
2373 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
2374 osdmon
->mon
->send_reply(
2381 false)); // ACK itself does not request an ack
2383 ~C_AckMarkedDown() override
{
2387 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
2389 op
->mark_osdmon_event(__func__
);
2390 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
2391 int from
= m
->target_osd
;
2393 // check permissions
2394 if (check_source(op
, m
->fsid
))
2397 // first, verify the reporting host is valid
2398 if (!m
->get_orig_source().is_osd())
2401 if (!osdmap
.exists(from
) ||
2402 osdmap
.is_down(from
) ||
2403 osdmap
.get_addrs(from
) != m
->target_addrs
) {
2404 dout(5) << "preprocess_mark_me_down from dead osd."
2405 << from
<< ", ignoring" << dendl
;
2406 send_incremental(op
, m
->get_epoch()+1);
2410 // no down might be set
2411 if (!can_mark_down(from
))
2414 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
2415 << " " << m
->target_addrs
<< dendl
;
2419 if (m
->request_ack
) {
2420 Context
*c(new C_AckMarkedDown(this, op
));
2426 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
2428 op
->mark_osdmon_event(__func__
);
2429 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
2430 int target_osd
= m
->target_osd
;
2432 ceph_assert(osdmap
.is_up(target_osd
));
2433 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
2435 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
2436 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2438 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
2442 bool OSDMonitor::can_mark_down(int i
)
2444 if (osdmap
.is_nodown(i
)) {
2445 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
2446 << "will not mark it down" << dendl
;
2450 int num_osds
= osdmap
.get_num_osds();
2451 if (num_osds
== 0) {
2452 dout(5) << __func__
<< " no osds" << dendl
;
2455 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
2456 float up_ratio
= (float)up
/ (float)num_osds
;
2457 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
2458 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
2459 << g_conf()->mon_osd_min_up_ratio
2460 << ", will not mark osd." << i
<< " down" << dendl
;
2466 bool OSDMonitor::can_mark_up(int i
)
2468 if (osdmap
.is_noup(i
)) {
2469 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
2470 << "will not mark it up" << dendl
;
2478 * @note the parameter @p i apparently only exists here so we can output the
2479 * osd's id on messages.
2481 bool OSDMonitor::can_mark_out(int i
)
2483 if (osdmap
.is_noout(i
)) {
2484 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
2485 << "will not mark it out" << dendl
;
2489 int num_osds
= osdmap
.get_num_osds();
2490 if (num_osds
== 0) {
2491 dout(5) << __func__
<< " no osds" << dendl
;
2494 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
2495 float in_ratio
= (float)in
/ (float)num_osds
;
2496 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
2498 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
2499 << g_conf()->mon_osd_min_in_ratio
2500 << ", will not mark osd." << i
<< " out" << dendl
;
2502 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
2503 << g_conf()->mon_osd_min_in_ratio
2504 << ", will not mark osds out" << dendl
;
2511 bool OSDMonitor::can_mark_in(int i
)
2513 if (osdmap
.is_noin(i
)) {
2514 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
2515 << "will not mark it in" << dendl
;
2522 bool OSDMonitor::check_failures(utime_t now
)
2524 bool found_failure
= false;
2525 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2526 p
!= failure_info
.end();
2528 if (can_mark_down(p
->first
)) {
2529 found_failure
|= check_failure(now
, p
->first
, p
->second
);
2532 return found_failure
;
2535 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
2537 // already pending failure?
2538 if (pending_inc
.new_state
.count(target_osd
) &&
2539 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
2540 dout(10) << " already pending failure" << dendl
;
2544 set
<string
> reporters_by_subtree
;
2545 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
2546 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
2547 utime_t max_failed_since
= fi
.get_failed_since();
2548 utime_t failed_for
= now
- max_failed_since
;
2550 utime_t grace
= orig_grace
;
2551 double my_grace
= 0, peer_grace
= 0;
2553 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
2554 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
2555 decay_k
= ::log(.5) / halflife
;
2557 // scale grace period based on historical probability of 'lagginess'
2558 // (false positive failures due to slowness).
2559 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
2560 double decay
= exp((double)failed_for
* decay_k
);
2561 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
2562 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
2563 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
2567 // consider the peers reporting a failure a proxy for a potential
2568 // 'subcluster' over the overall cluster that is similarly
2569 // laggy. this is clearly not true in all cases, but will sometimes
2570 // help us localize the grace correction to a subset of the system
2571 // (say, a rack with a bad switch) that is unhappy.
2572 ceph_assert(fi
.reporters
.size());
2573 for (map
<int,failure_reporter_t
>::iterator p
= fi
.reporters
.begin();
2574 p
!= fi
.reporters
.end();
2576 // get the parent bucket whose type matches with "reporter_subtree_level".
2577 // fall back to OSD if the level doesn't exist.
2578 map
<string
, string
> reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
2579 map
<string
, string
>::iterator iter
= reporter_loc
.find(reporter_subtree_level
);
2580 if (iter
== reporter_loc
.end()) {
2581 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
2583 reporters_by_subtree
.insert(iter
->second
);
2585 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
2586 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
2587 utime_t elapsed
= now
- xi
.down_stamp
;
2588 double decay
= exp((double)elapsed
* decay_k
);
2589 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
2593 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
2594 peer_grace
/= (double)fi
.reporters
.size();
2595 grace
+= peer_grace
;
2598 dout(10) << " osd." << target_osd
<< " has "
2599 << fi
.reporters
.size() << " reporters, "
2600 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
2601 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
2604 if (failed_for
>= grace
&&
2605 reporters_by_subtree
.size() >= g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
2606 dout(1) << " we have enough reporters to mark osd." << target_osd
2607 << " down" << dendl
;
2608 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2610 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
2611 << osdmap
.crush
->get_full_location_ordered_string(
2614 << (int)reporters_by_subtree
.size()
2615 << " reporters from different "
2616 << reporter_subtree_level
<< " after "
2617 << failed_for
<< " >= grace " << grace
<< ")";
2623 void OSDMonitor::force_failure(int target_osd
, int by
)
2625 // already pending failure?
2626 if (pending_inc
.new_state
.count(target_osd
) &&
2627 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
2628 dout(10) << " already pending failure" << dendl
;
2632 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
2633 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2635 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
2636 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
2637 << ") (connection refused reported by osd." << by
<< ")";
2641 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
2643 op
->mark_osdmon_event(__func__
);
2644 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
2645 dout(1) << "prepare_failure osd." << m
->get_target_osd()
2646 << " " << m
->get_target_addrs()
2647 << " from " << m
->get_orig_source()
2648 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
2650 int target_osd
= m
->get_target_osd();
2651 int reporter
= m
->get_orig_source().num();
2652 ceph_assert(osdmap
.is_up(target_osd
));
2653 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
2655 if (m
->if_osd_failed()) {
2656 // calculate failure time
2657 utime_t now
= ceph_clock_now();
2658 utime_t failed_since
=
2659 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
2662 if (m
->is_immediate()) {
2663 mon
->clog
->debug() << "osd." << m
->get_target_osd()
2664 << " reported immediately failed by "
2665 << m
->get_orig_source();
2666 force_failure(target_osd
, reporter
);
2670 mon
->clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
2671 << m
->get_orig_source();
2673 failure_info_t
& fi
= failure_info
[target_osd
];
2674 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
2676 mon
->no_reply(old_op
);
2679 return check_failure(now
, target_osd
, fi
);
2681 // remove the report
2682 mon
->clog
->debug() << "osd." << m
->get_target_osd()
2683 << " failure report canceled by "
2684 << m
->get_orig_source();
2685 if (failure_info
.count(target_osd
)) {
2686 failure_info_t
& fi
= failure_info
[target_osd
];
2687 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
2689 mon
->no_reply(report_op
);
2691 if (fi
.reporters
.empty()) {
2692 dout(10) << " removing last failure_info for osd." << target_osd
2694 failure_info
.erase(target_osd
);
2696 dout(10) << " failure_info for osd." << target_osd
<< " now "
2697 << fi
.reporters
.size() << " reporters" << dendl
;
2700 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
2708 void OSDMonitor::process_failures()
2710 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2711 while (p
!= failure_info
.end()) {
2712 if (osdmap
.is_up(p
->first
)) {
2715 dout(10) << "process_failures osd." << p
->first
<< dendl
;
2716 list
<MonOpRequestRef
> ls
;
2717 p
->second
.take_report_messages(ls
);
2718 failure_info
.erase(p
++);
2720 while (!ls
.empty()) {
2721 MonOpRequestRef o
= ls
.front();
2723 o
->mark_event(__func__
);
2724 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
2725 send_latest(o
, m
->get_epoch());
2734 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
2736 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
2738 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2739 p
!= failure_info
.end();
2741 p
->second
.take_report_messages(ls
);
2743 failure_info
.clear();
2749 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
2751 op
->mark_osdmon_event(__func__
);
2752 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2753 int from
= m
->get_orig_source_inst().name
.num();
2755 // check permissions, ignore if failed (no response expected)
2756 MonSession
*session
= op
->get_session();
2759 if (!session
->is_capable("osd", MON_CAP_X
)) {
2760 dout(0) << "got preprocess_boot message from entity with insufficient caps"
2761 << session
->caps
<< dendl
;
2765 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
2766 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
2767 << " != " << mon
->monmap
->fsid
<< dendl
;
2771 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
2772 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
2776 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
2778 // force all osds to have gone through luminous prior to upgrade to nautilus
2780 vector
<string
> missing
;
2781 if (!HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
2782 missing
.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
2784 if (!HAVE_FEATURE(m
->osd_features
, SERVER_JEWEL
)) {
2785 missing
.push_back("CEPH_FEATURE_SERVER_JEWEL");
2787 if (!HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
2788 missing
.push_back("CEPH_FEATURE_SERVER_KRAKEN");
2790 if (!HAVE_FEATURE(m
->osd_features
, OSD_RECOVERY_DELETES
)) {
2791 missing
.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
2794 if (!missing
.empty()) {
2795 using std::experimental::make_ostream_joiner
;
2798 copy(begin(missing
), end(missing
), make_ostream_joiner(ss
, ";"));
2800 mon
->clog
->info() << "disallowing boot of OSD "
2801 << m
->get_orig_source_inst()
2802 << " because the osd lacks " << ss
.str();
2807 // make sure upgrades stop at nautilus
2808 if (HAVE_FEATURE(m
->osd_features
, SERVER_O
) &&
2809 osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
2810 mon
->clog
->info() << "disallowing boot of post-nautilus OSD "
2811 << m
->get_orig_source_inst()
2812 << " because require_osd_release < nautilus";
2816 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
2817 // we are reusing a jewel feature bit that was retired in luminous.
2818 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
2819 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
2820 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
2821 mon
->clog
->info() << "disallowing boot of OSD "
2822 << m
->get_orig_source_inst()
2823 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
2828 if (osdmap
.is_up(from
) &&
2829 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
2830 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
2832 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
2833 << " " << m
->get_orig_source_addrs()
2834 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
2839 if (osdmap
.exists(from
) &&
2840 !osdmap
.get_uuid(from
).is_zero() &&
2841 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2842 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
2843 << " clashes with existing osd: different fsid"
2844 << " (ours: " << osdmap
.get_uuid(from
)
2845 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
2849 if (osdmap
.exists(from
) &&
2850 osdmap
.get_info(from
).up_from
> m
->version
&&
2851 osdmap
.get_most_recent_addrs(from
).legacy_equals(
2852 m
->get_orig_source_addrs())) {
2853 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
2854 send_latest(op
, m
->sb
.current_epoch
+1);
2859 if (!can_mark_up(from
)) {
2860 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
2861 send_latest(op
, m
->sb
.current_epoch
+1);
2865 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
2872 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
2874 op
->mark_osdmon_event(__func__
);
2875 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2876 dout(7) << __func__
<< " from " << m
->get_source()
2878 << " client_addrs" << m
->get_connection()->get_peer_addrs()
2879 << " cluster_addrs " << m
->cluster_addrs
2880 << " hb_back_addrs " << m
->hb_back_addrs
2881 << " hb_front_addrs " << m
->hb_front_addrs
2884 ceph_assert(m
->get_orig_source().is_osd());
2885 int from
= m
->get_orig_source().num();
2887 // does this osd exist?
2888 if (from
>= osdmap
.get_max_osd()) {
2889 dout(1) << "boot from osd." << from
<< " >= max_osd "
2890 << osdmap
.get_max_osd() << dendl
;
2894 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
2895 if (pending_inc
.new_state
.count(from
))
2896 oldstate
^= pending_inc
.new_state
[from
];
2898 // already up? mark down first?
2899 if (osdmap
.is_up(from
)) {
2900 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
2901 << osdmap
.get_addrs(from
) << dendl
;
2902 // preprocess should have caught these; if not, assert.
2903 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
2904 m
->get_orig_source_addrs()) ||
2905 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
2906 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
2908 if (pending_inc
.new_state
.count(from
) == 0 ||
2909 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
2910 // mark previous guy down
2911 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
2913 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2914 } else if (pending_inc
.new_up_client
.count(from
)) {
2915 // already prepared, just wait
2916 dout(7) << __func__
<< " already prepared, waiting on "
2917 << m
->get_orig_source_addr() << dendl
;
2918 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2921 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
2922 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
2923 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
2924 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
2926 down_pending_out
.erase(from
); // if any
2929 osd_weight
[from
] = m
->sb
.weight
;
2932 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
2934 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2935 // preprocess should have caught this; if not, assert.
2936 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
2937 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
2941 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
2942 const osd_info_t
& i
= osdmap
.get_info(from
);
2943 if (i
.up_from
> i
.lost_at
) {
2944 dout(10) << " fresh osd; marking lost_at too" << dendl
;
2945 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
2950 bufferlist osd_metadata
;
2951 encode(m
->metadata
, osd_metadata
);
2952 pending_metadata
[from
] = osd_metadata
;
2953 pending_metadata_rm
.erase(from
);
2955 // adjust last clean unmount epoch?
2956 const osd_info_t
& info
= osdmap
.get_info(from
);
2957 dout(10) << " old osd_info: " << info
<< dendl
;
2958 if (m
->sb
.mounted
> info
.last_clean_begin
||
2959 (m
->sb
.mounted
== info
.last_clean_begin
&&
2960 m
->sb
.clean_thru
> info
.last_clean_end
)) {
2961 epoch_t begin
= m
->sb
.mounted
;
2962 epoch_t end
= m
->sb
.clean_thru
;
2964 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
2965 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
2966 << ") -> [" << begin
<< "-" << end
<< ")"
2968 pending_inc
.new_last_clean_interval
[from
] =
2969 pair
<epoch_t
,epoch_t
>(begin
, end
);
2972 osd_xinfo_t xi
= osdmap
.get_xinfo(from
);
2973 if (m
->boot_epoch
== 0) {
2974 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
2975 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
2976 dout(10) << " not laggy, new xi " << xi
<< dendl
;
2978 if (xi
.down_stamp
.sec()) {
2979 int interval
= ceph_clock_now().sec() -
2980 xi
.down_stamp
.sec();
2981 if (g_conf()->mon_osd_laggy_max_interval
&&
2982 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
2983 interval
= g_conf()->mon_osd_laggy_max_interval
;
2986 interval
* g_conf()->mon_osd_laggy_weight
+
2987 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
2989 xi
.laggy_probability
=
2990 g_conf()->mon_osd_laggy_weight
+
2991 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
2992 dout(10) << " laggy, now xi " << xi
<< dendl
;
2995 // set features shared by the osd
2996 if (m
->osd_features
)
2997 xi
.features
= m
->osd_features
;
2999 xi
.features
= m
->get_connection()->get_features();
3002 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3003 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3004 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3005 (g_conf()->mon_osd_auto_mark_in
)) {
3006 if (can_mark_in(from
)) {
3007 if (osdmap
.osd_xinfo
[from
].old_weight
> 0) {
3008 pending_inc
.new_weight
[from
] = osdmap
.osd_xinfo
[from
].old_weight
;
3011 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3014 dout(7) << __func__
<< " NOIN set, will not mark in "
3015 << m
->get_orig_source_addr() << dendl
;
3019 pending_inc
.new_xinfo
[from
] = xi
;
3022 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3027 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3029 op
->mark_osdmon_event(__func__
);
3030 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
3031 dout(7) << "_booted " << m
->get_orig_source_inst()
3032 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3035 mon
->clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3039 send_latest(op
, m
->sb
.current_epoch
+1);
3046 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3048 op
->mark_osdmon_event(__func__
);
3049 MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
3050 int from
= m
->get_orig_source().num();
3052 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3054 // check permissions, ignore if failed
3055 MonSession
*session
= op
->get_session();
3058 if (!session
->is_capable("osd", MON_CAP_X
)) {
3059 dout(0) << "MOSDFull from entity with insufficient privileges:"
3060 << session
->caps
<< dendl
;
3064 // ignore a full message from the osd instance that already went down
3065 if (!osdmap
.exists(from
)) {
3066 dout(7) << __func__
<< " ignoring full message from nonexistent "
3067 << m
->get_orig_source_inst() << dendl
;
3070 if ((!osdmap
.is_up(from
) &&
3071 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3072 m
->get_orig_source_addrs())) ||
3073 (osdmap
.is_up(from
) &&
3074 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3075 dout(7) << __func__
<< " ignoring full message from down "
3076 << m
->get_orig_source_inst() << dendl
;
3080 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3082 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3083 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3084 << " " << m
->get_orig_source_inst() << dendl
;
3085 _reply_map(op
, m
->version
);
3089 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3090 << " " << m
->get_orig_source_inst() << dendl
;
3097 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3099 op
->mark_osdmon_event(__func__
);
3100 const MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
3101 const int from
= m
->get_orig_source().num();
3103 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3104 const unsigned want_state
= m
->state
& mask
; // safety first
3106 unsigned cur_state
= osdmap
.get_state(from
);
3107 auto p
= pending_inc
.new_state
.find(from
);
3108 if (p
!= pending_inc
.new_state
.end()) {
3109 cur_state
^= p
->second
;
3113 set
<string
> want_state_set
, cur_state_set
;
3114 OSDMap::calc_state_set(want_state
, want_state_set
);
3115 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3117 if (cur_state
!= want_state
) {
3118 if (p
!= pending_inc
.new_state
.end()) {
3121 pending_inc
.new_state
[from
] = 0;
3123 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3124 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3125 << " -> " << want_state_set
<< dendl
;
3127 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3128 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3131 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3138 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3140 op
->mark_osdmon_event(__func__
);
3141 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
3142 int from
= m
->get_orig_source().num();
3144 // check permissions, ignore if failed
3145 MonSession
*session
= op
->get_session();
3148 if (!session
->is_capable("osd", MON_CAP_X
)) {
3149 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3150 << session
->caps
<< dendl
;
3154 if (!osdmap
.is_up(from
) ||
3155 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3156 dout(7) << "preprocess_alive ignoring alive message from down "
3157 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3162 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3164 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3165 _reply_map(op
, m
->version
);
3169 dout(10) << "preprocess_alive want up_thru " << m
->want
3170 << " from " << m
->get_orig_source_inst() << dendl
;
3177 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3179 op
->mark_osdmon_event(__func__
);
3180 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
3181 int from
= m
->get_orig_source().num();
3183 if (0) { // we probably don't care much about these
3184 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
3187 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3188 << " from " << m
->get_orig_source_inst() << dendl
;
3190 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3191 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3195 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3197 op
->mark_osdmon_event(__func__
);
3198 dout(7) << "_reply_map " << e
3199 << " from " << op
->get_req()->get_orig_source_inst()
3205 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3207 op
->mark_osdmon_event(__func__
);
3208 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
3209 dout(10) << __func__
<< " " << *m
<< dendl
;
3210 auto session
= op
->get_session();
3213 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3216 if (!session
->is_capable("osd", MON_CAP_X
)) {
3217 derr
<< __func__
<< " received from entity "
3218 << "with insufficient privileges " << session
->caps
<< dendl
;
3221 // always forward the "created!" to the leader
3225 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3227 op
->mark_osdmon_event(__func__
);
3228 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
3229 dout(10) << __func__
<< " " << *m
<< dendl
;
3230 auto src
= m
->get_orig_source();
3231 auto from
= src
.num();
3232 if (!src
.is_osd() ||
3233 !mon
->osdmon()->osdmap
.is_up(from
) ||
3234 !mon
->osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3235 m
->get_orig_source_addrs())) {
3236 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3239 pending_created_pgs
.push_back(m
->pgid
);
3243 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3245 op
->mark_osdmon_event(__func__
);
3246 auto m
= static_cast<MOSDPGReadyToMerge
*>(op
->get_req());
3247 dout(10) << __func__
<< " " << *m
<< dendl
;
3248 const pg_pool_t
*pi
;
3249 auto session
= op
->get_session();
3251 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3254 if (!session
->is_capable("osd", MON_CAP_X
)) {
3255 derr
<< __func__
<< " received from entity "
3256 << "with insufficient privileges " << session
->caps
<< dendl
;
3259 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3261 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3264 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3265 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3268 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3269 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3272 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3273 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3283 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3285 op
->mark_osdmon_event(__func__
);
3286 auto m
= static_cast<MOSDPGReadyToMerge
*>(op
->get_req());
3287 dout(10) << __func__
<< " " << *m
<< dendl
;
3289 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
3290 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
3292 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
3293 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
3294 p
.get_pg_num_pending() > m
->pgid
.ps()) {
3295 dout(10) << __func__
3296 << " race with concurrent pg_num[_pending] update, will retry"
3298 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3303 p
.dec_pg_num(m
->pgid
,
3307 m
->last_epoch_started
,
3308 m
->last_epoch_clean
);
3309 p
.last_change
= pending_inc
.epoch
;
3311 // back off the merge attempt!
3312 p
.set_pg_num_pending(p
.get_pg_num());
3315 // force pre-nautilus clients to resend their ops, since they
3316 // don't understand pg_num_pending changes form a new interval
3317 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
3319 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
3321 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
3324 prob
> (double)(rand() % 1000)/1000.0) {
3325 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
3326 auto n
= new MMonCommand(mon
->monmap
->get_fsid());
3327 n
->set_connection(m
->get_connection());
3328 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3329 osdmap
.get_pool_name(m
->pgid
.pool()) +
3330 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3331 stringify(m
->pgid
.ps() + 1) + "\"}" };
3332 MonOpRequestRef nop
= mon
->op_tracker
.create_request
<MonOpRequest
>(n
);
3333 nop
->set_type_service();
3334 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
3336 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3345 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
3347 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
3348 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
3349 mempool::osdmap::vector
<int> empty
;
3350 int from
= m
->get_orig_source().num();
3351 size_t ignore_cnt
= 0;
3354 MonSession
*session
= op
->get_session();
3357 if (!session
->is_capable("osd", MON_CAP_X
)) {
3358 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3359 << session
->caps
<< dendl
;
3363 if (!osdmap
.is_up(from
) ||
3364 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3365 dout(7) << "ignoring pgtemp message from down "
3366 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3375 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3376 dout(20) << " " << p
->first
3377 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
3378 << " -> " << p
->second
<< dendl
;
3380 // does the pool exist?
3381 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
3383 * 1. If the osdmap does not have the pool, it means the pool has been
3384 * removed in-between the osd sending this message and us handling it.
3385 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
3386 * not exist in the pending either, as the osds would not send a
3387 * message about a pool they know nothing about (yet).
3388 * 3. However, if the pool does exist in the pending, then it must be a
3389 * new pool, and not relevant to this message (see 1).
3391 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3392 << ": pool has been removed" << dendl
;
3397 int acting_primary
= -1;
3398 osdmap
.pg_to_up_acting_osds(
3399 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
3400 if (acting_primary
!= from
) {
3401 /* If the source isn't the primary based on the current osdmap, we know
3402 * that the interval changed and that we can discard this message.
3403 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
3404 * which of two pg temp mappings on the same pg is more recent.
3406 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3407 << ": primary has changed" << dendl
;
3413 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
3414 osdmap
.primary_temp
->count(p
->first
)))
3417 // NOTE: we assume that this will clear pg_primary, so consider
3418 // an existing pg_primary field to imply a change
3419 if (p
->second
.size() &&
3420 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
3421 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
3422 osdmap
.primary_temp
->count(p
->first
)))
3426 // should we ignore all the pgs?
3427 if (ignore_cnt
== m
->pg_temp
.size())
3430 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
3431 _reply_map(op
, m
->map_epoch
);
3438 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
3440 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
3441 auto ut
= pending_inc
.new_up_thru
.find(from
);
3442 if (ut
!= pending_inc
.new_up_thru
.end()) {
3443 old_up_thru
= ut
->second
;
3445 if (up_thru
> old_up_thru
) {
3446 // set up_thru too, so the osd doesn't have to ask again
3447 pending_inc
.new_up_thru
[from
] = up_thru
;
3451 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
3453 op
->mark_osdmon_event(__func__
);
3454 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
3455 int from
= m
->get_orig_source().num();
3456 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
3457 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3458 uint64_t pool
= p
->first
.pool();
3459 if (pending_inc
.old_pools
.count(pool
)) {
3460 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3461 << ": pool pending removal" << dendl
;
3464 if (!osdmap
.have_pg_pool(pool
)) {
3465 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3466 << ": pool has been removed" << dendl
;
3469 pending_inc
.new_pg_temp
[p
->first
] =
3470 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
3472 // unconditionally clear pg_primary (until this message can encode
3473 // a change for that, too.. at which point we need to also fix
3474 // preprocess_pg_temp)
3475 if (osdmap
.primary_temp
->count(p
->first
) ||
3476 pending_inc
.new_primary_temp
.count(p
->first
))
3477 pending_inc
.new_primary_temp
[p
->first
] = -1;
3480 // set up_thru too, so the osd doesn't have to ask again
3481 update_up_thru(from
, m
->map_epoch
);
3483 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
3490 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
3492 op
->mark_osdmon_event(__func__
);
3493 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
3494 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
3496 // check privilege, ignore if failed
3497 MonSession
*session
= op
->get_session();
3501 if (!session
->caps
.is_capable(
3503 CEPH_ENTITY_TYPE_MON
,
3504 session
->entity_name
,
3505 "osd", "osd pool rmsnap", {}, true, true, false,
3506 session
->get_peer_socket_addr())) {
3507 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
3508 << session
->caps
<< dendl
;
3512 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
3513 q
!= m
->snaps
.end();
3515 if (!osdmap
.have_pg_pool(q
->first
)) {
3516 dout(10) << " ignoring removed_snaps " << q
->second
<< " on non-existent pool " << q
->first
<< dendl
;
3519 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
3520 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
3521 p
!= q
->second
.end();
3523 if (*p
> pi
->get_snap_seq() ||
3524 !pi
->removed_snaps
.contains(*p
))
3533 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
3535 op
->mark_osdmon_event(__func__
);
3536 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
3537 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
3539 for (map
<int, vector
<snapid_t
> >::iterator p
= m
->snaps
.begin();
3540 p
!= m
->snaps
.end();
3543 if (!osdmap
.have_pg_pool(p
->first
)) {
3544 dout(10) << " ignoring removed_snaps " << p
->second
<< " on non-existent pool " << p
->first
<< dendl
;
3548 pg_pool_t
& pi
= osdmap
.pools
[p
->first
];
3549 for (vector
<snapid_t
>::iterator q
= p
->second
.begin();
3550 q
!= p
->second
.end();
3552 if (!pi
.removed_snaps
.contains(*q
) &&
3553 (!pending_inc
.new_pools
.count(p
->first
) ||
3554 !pending_inc
.new_pools
[p
->first
].removed_snaps
.contains(*q
))) {
3555 pg_pool_t
*newpi
= pending_inc
.get_new_pool(p
->first
, &pi
);
3556 newpi
->removed_snaps
.insert(*q
);
3557 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
3558 dout(10) << " pool " << p
->first
<< " removed_snaps added " << *q
3559 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
3560 if (*q
> newpi
->get_snap_seq()) {
3561 dout(10) << " pool " << p
->first
<< " snap_seq "
3562 << newpi
->get_snap_seq() << " -> " << *q
<< dendl
;
3563 newpi
->set_snap_seq(*q
);
3565 newpi
->set_snap_epoch(pending_inc
.epoch
);
3566 pending_inc
.new_removed_snaps
[p
->first
].insert(*q
);
3574 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
3576 op
->mark_osdmon_event(__func__
);
3578 auto session
= op
->get_session();
3581 dout(10) << __func__
<< " no monitor session!" << dendl
;
3584 if (!session
->is_capable("osd", MON_CAP_X
)) {
3585 derr
<< __func__
<< " received from entity "
3586 << "with insufficient privileges " << session
->caps
<< dendl
;
3589 // Always forward the beacon to the leader, even if they are the same as
3590 // the old one. The leader will mark as down osds that haven't sent
3591 // beacon for a few minutes.
3595 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
3597 op
->mark_osdmon_event(__func__
);
3598 const auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
3599 const auto src
= beacon
->get_orig_source();
3600 dout(10) << __func__
<< " " << *beacon
3601 << " from " << src
<< dendl
;
3602 int from
= src
.num();
3604 if (!src
.is_osd() ||
3605 !osdmap
.is_up(from
) ||
3606 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
3607 if (src
.is_osd() && !osdmap
.is_up(from
)) {
3608 // share some new maps with this guy in case it may not be
3609 // aware of its own deadness...
3610 send_latest(op
, beacon
->version
+1);
3612 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
3616 last_osd_report
[from
] = ceph_clock_now();
3617 osd_epochs
[from
] = beacon
->version
;
3619 for (const auto& pg
: beacon
->pgs
) {
3620 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
3628 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
3630 op
->mark_osdmon_event(__func__
);
3631 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
3632 << " start " << start
<< dendl
;
3636 send_incremental(op
, start
);
3640 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
3642 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
, features
);
3643 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
3644 r
->oldest_map
= get_first_committed();
3645 r
->newest_map
= osdmap
.get_epoch();
3649 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
3651 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
3652 << std::hex
<< features
<< std::dec
<< dendl
;
3653 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
, features
);
3654 m
->oldest_map
= get_first_committed();
3655 m
->newest_map
= osdmap
.get_epoch();
3657 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
3659 int err
= get_version(e
, features
, bl
);
3661 ceph_assert(bl
.length());
3662 // if (get_version(e, bl) > 0) {
3663 dout(20) << "build_incremental inc " << e
<< " "
3664 << bl
.length() << " bytes" << dendl
;
3665 m
->incremental_maps
[e
] = bl
;
3667 ceph_assert(err
== -ENOENT
);
3668 ceph_assert(!bl
.length());
3669 get_version_full(e
, features
, bl
);
3670 if (bl
.length() > 0) {
3671 //else if (get_version("full", e, bl) > 0) {
3672 dout(20) << "build_incremental full " << e
<< " "
3673 << bl
.length() << " bytes" << dendl
;
3676 ceph_abort(); // we should have all maps.
3683 void OSDMonitor::send_full(MonOpRequestRef op
)
3685 op
->mark_osdmon_event(__func__
);
3686 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
3687 mon
->send_reply(op
, build_latest_full(op
->get_session()->con_features
));
3690 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
3692 op
->mark_osdmon_event(__func__
);
3694 MonSession
*s
= op
->get_session();
3698 // oh, we can tell the other mon to do it
3699 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
3701 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
3702 r
->send_osdmap_first
= first
;
3703 s
->proxy_con
->send_message(r
);
3704 op
->mark_event("reply: send routed send_osdmap_first reply");
3707 send_incremental(first
, s
, false, op
);
3711 void OSDMonitor::send_incremental(epoch_t first
,
3712 MonSession
*session
,
3714 MonOpRequestRef req
)
3716 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
3717 << " to " << session
->name
<< dendl
;
3719 // get feature of the peer
3720 // use quorum_con_features, if it's an anonymous connection.
3721 uint64_t features
= session
->con_features
? session
->con_features
:
3722 mon
->get_quorum_con_features();
3724 if (first
<= session
->osd_epoch
) {
3725 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
3726 << session
->osd_epoch
<< dendl
;
3727 first
= session
->osd_epoch
+ 1;
3730 if (first
< get_first_committed()) {
3731 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
3732 m
->oldest_map
= get_first_committed();
3733 m
->newest_map
= osdmap
.get_epoch();
3735 // share removed snaps during the gap
3736 get_removed_snaps_range(first
, m
->oldest_map
, &m
->gap_removed_snaps
);
3738 first
= get_first_committed();
3740 int err
= get_version_full(first
, features
, bl
);
3741 ceph_assert(err
== 0);
3742 ceph_assert(bl
.length());
3743 dout(20) << "send_incremental starting with base full "
3744 << first
<< " " << bl
.length() << " bytes" << dendl
;
3745 m
->maps
[first
] = bl
;
3748 mon
->send_reply(req
, m
);
3749 session
->osd_epoch
= first
;
3752 session
->con
->send_message(m
);
3753 session
->osd_epoch
= first
;
3758 while (first
<= osdmap
.get_epoch()) {
3759 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
3760 osdmap
.get_epoch());
3761 MOSDMap
*m
= build_incremental(first
, last
, features
);
3764 // send some maps. it may not be all of them, but it will get them
3766 mon
->send_reply(req
, m
);
3768 session
->con
->send_message(m
);
3771 session
->osd_epoch
= last
;
3777 void OSDMonitor::get_removed_snaps_range(
3778 epoch_t start
, epoch_t end
,
3779 mempool::osdmap::map
<int64_t,OSDMap::snap_interval_set_t
> *gap_removed_snaps
)
3781 // we only care about pools that exist now.
3782 for (auto& p
: osdmap
.get_pools()) {
3783 auto& t
= (*gap_removed_snaps
)[p
.first
];
3784 for (epoch_t epoch
= start
; epoch
< end
; ++epoch
) {
3785 string k
= make_snap_epoch_key(p
.first
, epoch
);
3787 mon
->store
->get(OSD_SNAP_PREFIX
, k
, v
);
3789 auto q
= v
.cbegin();
3790 OSDMap::snap_interval_set_t snaps
;
3795 dout(10) << __func__
<< " " << p
.first
<< " " << t
<< dendl
;
3799 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
3801 return get_version(ver
, mon
->get_quorum_con_features(), bl
);
3804 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
3806 OSDMap::Incremental inc
;
3807 auto q
= bl
.cbegin();
3809 // always encode with subset of osdmap's canonical features
3810 uint64_t f
= features
& inc
.encode_features
;
3811 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
3814 if (inc
.fullmap
.length()) {
3815 // embedded full map?
3817 m
.decode(inc
.fullmap
);
3818 inc
.fullmap
.clear();
3819 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
3821 if (inc
.crush
.length()) {
3822 // embedded crush map
3824 auto p
= inc
.crush
.cbegin();
3827 c
.encode(inc
.crush
, f
);
3829 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
3832 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
3835 auto q
= bl
.cbegin();
3837 // always encode with subset of osdmap's canonical features
3838 uint64_t f
= features
& m
.get_encoding_features();
3839 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
3842 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
3845 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
3847 uint64_t significant_features
= OSDMap::get_significant_features(features
);
3848 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
3851 int ret
= PaxosService::get_version(ver
, bl
);
3855 // NOTE: this check is imprecise; the OSDMap encoding features may
3856 // be a subset of the latest mon quorum features, but worst case we
3857 // reencode once and then cache the (identical) result under both
3859 if (significant_features
!=
3860 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
3861 reencode_incremental_map(bl
, features
);
3863 inc_osd_cache
.add({ver
, significant_features
}, bl
);
3867 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
3870 int err
= get_version(ver
, inc_bl
);
3871 ceph_assert(err
== 0);
3872 ceph_assert(inc_bl
.length());
3874 auto p
= inc_bl
.cbegin();
3876 dout(10) << __func__
<< " "
3877 << " epoch " << inc
.epoch
3878 << " inc_crc " << inc
.inc_crc
3879 << " full_crc " << inc
.full_crc
3880 << " encode_features " << inc
.encode_features
<< dendl
;
3884 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
3886 dout(10) << __func__
<< " ver " << ver
<< dendl
;
3888 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
3889 if (closest_pinned
== 0) {
3892 if (closest_pinned
> ver
) {
3893 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
3895 ceph_assert(closest_pinned
<= ver
);
3897 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
3899 // get osdmap incremental maps and apply on top of this one.
3901 bool has_cached_osdmap
= false;
3902 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
3903 if (full_osd_cache
.lookup({v
, mon
->get_quorum_con_features()},
3905 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
3907 has_cached_osdmap
= true;
3912 if (!has_cached_osdmap
) {
3913 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
3915 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
3916 << " not available! error: " << cpp_strerror(err
) << dendl
;
3918 ceph_assert(err
== 0);
3921 ceph_assert(osdm_bl
.length());
3924 osdm
.decode(osdm_bl
);
3926 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
3927 << " e" << osdm
.epoch
3928 << " crc " << osdm
.get_crc()
3929 << " -- applying incremental maps." << dendl
;
3931 uint64_t encode_features
= 0;
3932 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
3933 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
3935 OSDMap::Incremental inc
;
3936 int err
= get_inc(v
, inc
);
3937 ceph_assert(err
== 0);
3939 encode_features
= inc
.encode_features
;
3941 err
= osdm
.apply_incremental(inc
);
3942 ceph_assert(err
== 0);
3944 // this block performs paranoid checks on map retrieval
3945 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
3946 inc
.full_crc
!= 0) {
3948 uint64_t f
= encode_features
;
3950 f
= (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
3953 // encode osdmap to force calculating crcs
3955 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
3956 // decode osdmap to compare crcs with what's expected by incremental
3960 if (tosdm
.get_crc() != inc
.full_crc
) {
3962 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
3963 << ", expected " << inc
.full_crc
<< ")" << dendl
;
3964 ceph_abort_msg("osdmap crc mismatch");
3968 // note: we cannot add the recently computed map to the cache, as is,
3969 // because we have not encoded the map into a bl.
3972 if (!encode_features
) {
3973 dout(10) << __func__
3974 << " last incremental map didn't have features;"
3975 << " defaulting to quorum's or all" << dendl
;
3977 (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
3979 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
3984 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
3986 return get_version_full(ver
, mon
->get_quorum_con_features(), bl
);
3989 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
3992 uint64_t significant_features
= OSDMap::get_significant_features(features
);
3993 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
3996 int ret
= PaxosService::get_version_full(ver
, bl
);
3997 if (ret
== -ENOENT
) {
3999 ret
= get_full_from_pinned_map(ver
, bl
);
4004 // NOTE: this check is imprecise; the OSDMap encoding features may
4005 // be a subset of the latest mon quorum features, but worst case we
4006 // reencode once and then cache the (identical) result under both
4008 if (significant_features
!=
4009 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4010 reencode_full_map(bl
, features
);
4012 full_osd_cache
.add({ver
, significant_features
}, bl
);
4016 epoch_t
OSDMonitor::blacklist(const entity_addrvec_t
& av
, utime_t until
)
4018 dout(10) << "blacklist " << av
<< " until " << until
<< dendl
;
4019 for (auto a
: av
.v
) {
4020 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
4021 a
.set_type(entity_addr_t::TYPE_ANY
);
4023 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4025 pending_inc
.new_blacklist
[a
] = until
;
4027 return pending_inc
.epoch
;
4030 epoch_t
OSDMonitor::blacklist(entity_addr_t a
, utime_t until
)
4032 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
4033 a
.set_type(entity_addr_t::TYPE_ANY
);
4035 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4037 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
4038 pending_inc
.new_blacklist
[a
] = until
;
4039 return pending_inc
.epoch
;
4043 void OSDMonitor::check_osdmap_subs()
4045 dout(10) << __func__
<< dendl
;
4046 if (!osdmap
.get_epoch()) {
4049 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
4050 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
4053 auto p
= osdmap_subs
->second
->begin();
4057 check_osdmap_sub(sub
);
4061 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4063 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4064 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4065 if (sub
->next
<= osdmap
.get_epoch()) {
4067 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4069 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4071 mon
->session_map
.remove_sub(sub
);
4073 sub
->next
= osdmap
.get_epoch() + 1;
4077 void OSDMonitor::check_pg_creates_subs()
4079 if (!osdmap
.get_num_up_osds()) {
4082 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4083 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
4084 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4085 if (pg_creates_subs
== session_map
.subs
.end()) {
4088 for (auto sub
: *pg_creates_subs
->second
) {
4089 check_pg_creates_sub(sub
);
4094 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4096 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4097 ceph_assert(sub
->type
== "osd_pg_creates");
4098 // only send these if the OSD is up. we will check_subs() when they do
4099 // come up so they will get the creates then.
4100 if (sub
->session
->name
.is_osd() &&
4101 mon
->osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4102 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4103 sub
->session
->con
.get(),
4108 void OSDMonitor::do_application_enable(int64_t pool_id
,
4109 const std::string
&app_name
,
4110 const std::string
&app_key
,
4111 const std::string
&app_value
)
4113 ceph_assert(paxos
->is_plugged() && is_writeable());
4115 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4118 ceph_assert(osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
4120 auto pp
= osdmap
.get_pg_pool(pool_id
);
4121 ceph_assert(pp
!= nullptr);
4124 if (pending_inc
.new_pools
.count(pool_id
)) {
4125 p
= pending_inc
.new_pools
[pool_id
];
4128 if (app_key
.empty()) {
4129 p
.application_metadata
.insert({app_name
, {}});
4131 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4133 p
.last_change
= pending_inc
.epoch
;
4134 pending_inc
.new_pools
[pool_id
] = p
;
4137 unsigned OSDMonitor::scan_for_creating_pgs(
4138 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4139 const mempool::osdmap::set
<int64_t>& removed_pools
,
4141 creating_pgs_t
* creating_pgs
) const
4143 unsigned queued
= 0;
4144 for (auto& p
: pools
) {
4145 int64_t poolid
= p
.first
;
4146 if (creating_pgs
->created_pools
.count(poolid
)) {
4147 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4150 const pg_pool_t
& pool
= p
.second
;
4151 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4152 pool
.get_type(), pool
.get_size());
4153 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4156 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4157 const auto created
= pool
.get_last_change();
4158 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4159 dout(10) << __func__
<< " no change in pool " << poolid
4160 << " " << pool
<< dendl
;
4163 if (removed_pools
.count(poolid
)) {
4164 dout(10) << __func__
<< " pool is being removed: " << poolid
4165 << " " << pool
<< dendl
;
4168 dout(10) << __func__
<< " queueing pool create for " << poolid
4169 << " " << pool
<< dendl
;
4170 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4177 void OSDMonitor::update_creating_pgs()
4179 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4180 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4181 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4182 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4183 for (const auto& pg
: creating_pgs
.pgs
) {
4184 int acting_primary
= -1;
4185 auto pgid
= pg
.first
;
4186 if (!osdmap
.pg_exists(pgid
)) {
4187 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4191 auto mapped
= pg
.second
.first
;
4192 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4194 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4195 // check the previous creating_pgs, look for the target to whom the pg was
4196 // previously mapped
4197 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4198 const auto last_acting_primary
= pgs_by_epoch
.first
;
4199 for (auto& pgs
: pgs_by_epoch
.second
) {
4200 if (pgs
.second
.count(spgid
)) {
4201 if (last_acting_primary
== acting_primary
) {
4204 dout(20) << __func__
<< " " << pgid
<< " "
4205 << " acting_primary:" << last_acting_primary
4206 << " -> " << acting_primary
<< dendl
;
4207 // note epoch if the target of the create message changed.
4208 mapped
= mapping
.get_epoch();
4213 mapped
= mapping
.get_epoch();
4217 dout(10) << __func__
<< " will instruct osd." << acting_primary
4218 << " to create " << pgid
<< "@" << mapped
<< dendl
;
4219 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
4221 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
4222 creating_pgs_epoch
= mapping
.get_epoch();
4225 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
4227 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
4228 << " " << creating_pgs_by_osd_epoch
<< dendl
;
4229 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4230 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
4231 dout(20) << __func__
4232 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
4233 // the subscribers will be updated when the mapping is completed anyway
4236 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
4237 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
4239 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
4241 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
4242 MOSDPGCreate2
*m
= nullptr;
4244 bool old
= osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
;
4247 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
4248 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
4249 auto epoch
= epoch_pgs
->first
;
4250 auto& pgs
= epoch_pgs
->second
;
4251 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4252 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
4254 for (auto& pg
: pgs
) {
4255 // Need the create time from the monitor using its clock to set
4256 // last_scrub_stamp upon pg creation.
4257 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
4258 ceph_assert(create
!= creating_pgs
.pgs
.end());
4261 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
4263 oldm
->mkpg
.emplace(pg
.pgid
,
4264 pg_create_t
{create
->second
.first
, pg
.pgid
, 0});
4265 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.second
);
4268 m
= new MOSDPGCreate2(creating_pgs_epoch
);
4270 m
->pgs
.emplace(pg
, create
->second
);
4272 dout(20) << __func__
<< " will create " << pg
4273 << " at " << create
->second
.first
<< dendl
;
4277 con
->send_message(m
);
4279 con
->send_message(oldm
);
4281 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4282 << " has nothing to send" << dendl
;
4286 // sub is current through last + 1
// Periodic tick for the OSD monitor: runs housekeeping that may lead to a
// map proposal (marking down OSDs "out", expiring blacklist entries, pruning).
// NOTE(review): this block is extraction-mangled -- statements are split
// across physical lines, original source line numbers are fused into the
// text, and gaps in that numbering show several original lines are elided
// (including this function's closing braces). Comments below annotate only
// what is visible; do not treat this text as compilable as-is.
4293 void OSDMonitor::tick()
4295 if (!is_active()) return;
4297 dout(10) << osdmap
<< dendl
;
4299 // always update osdmap manifest, regardless of being the leader.
4300 load_osdmap_manifest();
// only the leader drives proposals; followers stop here
4302 if (!mon
->is_leader()) return;
4304 bool do_propose
= false;
4305 utime_t now
= ceph_clock_now();
// check for OSDs that stopped reporting, and for pending failure reports
4307 if (handle_osd_timeouts(now
, last_osd_report
)) {
4312 if (check_failures(now
)) {
4316 // Force a proposal if we need to prune; pruning is performed on
4317 // ``encode_pending()``, hence why we need to regularly trigger a proposal
4318 // even if there's nothing going on.
4319 if (is_prune_enabled() && should_prune()) {
4323 // mark down osds out?
4325 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
4326 * influence at all. The decision is made based on the ratio of "in" osds,
4327 * and the function returns false if this ratio is lower that the minimum
4328 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
4330 if (can_mark_out(-1)) {
4331 string down_out_subtree_limit
= g_conf().get_val
<string
>(
4332 "mon_osd_down_out_subtree_limit");
4333 set
<int> down_cache
; // quick cache of down subtrees
// walk every OSD with a pending down->out timer
4335 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
4336 while (i
!= down_pending_out
.end()) {
4342 if (osdmap
.is_down(o
) &&
// grace period before auto-out; may be stretched below for laggy OSDs
4345 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
4346 utime_t grace
= orig_grace
;
4347 double my_grace
= 0.0;
4349 if (g_conf()->mon_osd_adjust_down_out_interval
) {
4350 // scale grace period the same way we do the heartbeat grace.
4351 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
4352 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
4353 double decay_k
= ::log(.5) / halflife
;
4354 double decay
= exp((double)down
* decay_k
);
4355 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
4356 << " down for " << down
<< " decay " << decay
<< dendl
;
// extra grace proportional to the OSD's historical laggy behavior
4357 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
4361 // is this an entire large subtree down?
4362 if (down_out_subtree_limit
.length()) {
4363 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
4365 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
4366 dout(10) << "tick entire containing " << down_out_subtree_limit
4367 << " subtree for osd." << o
4368 << " is down; resetting timer" << dendl
;
4369 // reset timer, too.
4370 down_pending_out
[o
] = now
;
// decide whether this OSD should now be marked out:
// - normal down OSDs after mon_osd_down_out_interval
// - destroyed OSDs after mon_osd_destroyed_out_interval
4376 bool down_out
= !osdmap
.is_destroyed(o
) &&
4377 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
4378 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
4379 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
4380 // this is not precise enough as we did not make a note when this osd
4381 // was marked as destroyed, but let's not bother with that
4382 // complexity for now.
4383 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
4384 if (down_out
|| destroyed_out
) {
4385 dout(10) << "tick marking osd." << o
<< " OUT after " << down
4386 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
4387 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
4389 // set the AUTOOUT bit.
4390 if (pending_inc
.new_state
.count(o
) == 0)
4391 pending_inc
.new_state
[o
] = 0;
4392 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
4394 // remember previous weight
4395 if (pending_inc
.new_xinfo
.count(o
) == 0)
4396 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
4397 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
4401 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
4402 << int(down
.sec()) << " seconds)";
4407 down_pending_out
.erase(o
);
// reached when the NOOUT flag suppresses the auto-out pass above
4410 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
4413 // expire blacklisted items?
4414 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
4415 p
!= osdmap
.blacklist
.end();
4417 if (p
->second
< now
) {
4418 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
4419 pending_inc
.old_blacklist
.push_back(p
->first
);
// additional proposal triggers (snap pruning, pool status, pg_temp)
4424 if (try_prune_purged_snaps()) {
4428 if (update_pools_status())
4432 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
// Scan all OSDs and mark down any that have not reported (no beacon) within
// mon_osd_report_timeout. Updates last_osd_report bookkeeping in place.
// NOTE(review): extraction-mangled text with elided original lines (tail of
// the function, including its return, is not visible here).
4436 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
4437 std::map
<int,utime_t
> &last_osd_report
)
4439 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
// grace period after a leader change: reports may simply not have
// arrived at this monitor yet, so don't time anyone out
4440 if (now
- mon
->get_leader_since() < timeo
) {
4441 // We haven't been the leader for long enough to consider OSD timeouts
4445 int max_osd
= osdmap
.get_max_osd();
4446 bool new_down
= false;
4448 for (int i
=0; i
< max_osd
; ++i
) {
4449 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
4450 if (!osdmap
.exists(i
)) {
4451 last_osd_report
.erase(i
); // if any
4454 if (!osdmap
.is_up(i
))
4456 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
4457 if (t
== last_osd_report
.end()) {
4458 // it wasn't in the map; start the timer.
4459 last_osd_report
[i
] = now
;
4460 } else if (can_mark_down(i
)) {
4461 utime_t diff
= now
- t
->second
;
// presumably only reached when diff exceeds the timeout -- the
// comparison itself sits in an elided line; verify against upstream
4463 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
4464 << diff
<< " seconds";
4465 derr
<< "no beacon from osd." << i
<< " since " << t
->second
4466 << ", " << diff
<< " seconds ago. marking down" << dendl
;
// XOR-style state flip: setting CEPH_OSD_UP in new_state marks it down
4467 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
// Parse a comma/range CPU list string (e.g. "0-3,8") and dump it as a JSON
// array of ints under the given section name. Bails out early (visible
// branch) if parse_cpu_set_list() rejects the input.
// NOTE(review): extraction-mangled; the early-return body and closing
// braces are in elided lines.
4475 static void dump_cpu_list(Formatter
*f
, const char *name
,
4476 const string
& strlist
)
4479 size_t cpu_set_size
;
4480 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
4483 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
4484 f
->open_array_section(name
);
4485 for (auto cpu
: cpus
) {
4486 f
->dump_int("cpu", cpu
);
// Dump monitor-side OSD state for debugging/inspection: the osdmap, each
// existing OSD's metadata, the committed-version range, the crush map, and
// (when present) the osdmap manifest.
// NOTE(review): extraction-mangled; close_section() calls and closing
// braces live in elided lines.
4491 void OSDMonitor::dump_info(Formatter
*f
)
4493 f
->open_object_section("osdmap");
4497 f
->open_array_section("osd_metadata");
4498 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4499 if (osdmap
.exists(i
)) {
4500 f
->open_object_section("osd");
4501 f
->dump_unsigned("id", i
);
// NULL error stream: metadata failures for one OSD are ignored here
4502 dump_osd_metadata(i
, f
, NULL
);
4508 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
4509 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
4511 f
->open_object_section("crushmap");
4512 osdmap
.crush
->dump(f
);
4515 if (has_osdmap_manifest
) {
4516 f
->open_object_section("osdmap_manifest");
4517 osdmap_manifest
.dump(f
);
// Enumerates every pool property readable via "osd pool get"; used as the
// value type of the ALL_CHOICES name->choice map in preprocess_command().
// NOTE(review): extraction-mangled; the first enumerator line (original
// 4524, presumably SIZE/MIN_SIZE given the switch cases and map entries
// later in the file) is elided here -- confirm against upstream.
4523 enum osd_pool_get_choices
{
4525 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
, EC_OVERWRITES
,
4526 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
4527 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
4528 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
4529 USE_GMT_HITSET
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
4530 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
4531 CACHE_TARGET_FULL_RATIO
,
4532 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
4533 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
4534 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
4535 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
4536 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
4537 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
4538 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
4539 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
4540 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
, FINGERPRINT_ALGORITHM
,
4541 PG_AUTOSCALE_MODE
, PG_NUM_MIN
, TARGET_SIZE_BYTES
, TARGET_SIZE_RATIO
,
4542 PG_AUTOSCALE_BIAS
};
// Set difference helper for pool-get choices: returns first \ second
// (choices present in `first` but not in `second`), leaving both inputs
// untouched. Relies on std::set's sorted iteration order, as required by
// std::set_difference.
// NOTE(review): extraction-mangled; the opening brace and the trailing
// "return result;" are in elided lines.
4544 std::set
<osd_pool_get_choices
>
4545 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
4546 const std::set
<osd_pool_get_choices
>& second
)
4548 std::set
<osd_pool_get_choices
> result
;
4549 std::set_difference(first
.begin(), first
.end(),
4550 second
.begin(), second
.end(),
4551 std::inserter(result
, result
.end()));
4557 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
4559 op
->mark_osdmon_event(__func__
);
4560 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
4563 stringstream ss
, ds
;
4566 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
4567 string rs
= ss
.str();
4568 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
4572 MonSession
*session
= op
->get_session();
4574 derr
<< __func__
<< " no session" << dendl
;
4575 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
4580 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
4583 cmd_getval(cct
, cmdmap
, "format", format
, string("plain"));
4584 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
4586 if (prefix
== "osd stat") {
4587 osdmap
.print_summary(f
.get(), ds
, "", true);
4593 else if (prefix
== "osd dump" ||
4594 prefix
== "osd tree" ||
4595 prefix
== "osd tree-from" ||
4596 prefix
== "osd ls" ||
4597 prefix
== "osd getmap" ||
4598 prefix
== "osd getcrushmap" ||
4599 prefix
== "osd ls-tree") {
4604 cmd_getval(cct
, cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
4607 bufferlist osdmap_bl
;
4608 int err
= get_version_full(epoch
, osdmap_bl
);
4609 if (err
== -ENOENT
) {
4611 ss
<< "there is no map for epoch " << epoch
;
4614 ceph_assert(err
== 0);
4615 ceph_assert(osdmap_bl
.length());
4618 if (epoch
== osdmap
.get_epoch()) {
4622 p
->decode(osdmap_bl
);
4625 auto sg
= make_scope_guard([&] {
4631 if (prefix
== "osd dump") {
4634 f
->open_object_section("osdmap");
4644 } else if (prefix
== "osd ls") {
4646 f
->open_array_section("osds");
4647 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4648 if (osdmap
.exists(i
)) {
4649 f
->dump_int("osd", i
);
4656 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4657 if (osdmap
.exists(i
)) {
4666 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
4668 if (prefix
== "osd tree-from") {
4669 cmd_getval(cct
, cmdmap
, "bucket", bucket
);
4670 if (!osdmap
.crush
->name_exists(bucket
)) {
4671 ss
<< "bucket '" << bucket
<< "' does not exist";
4675 int id
= osdmap
.crush
->get_item_id(bucket
);
4677 ss
<< "\"" << bucket
<< "\" is not a bucket";
4683 vector
<string
> states
;
4684 cmd_getval(cct
, cmdmap
, "states", states
);
4685 unsigned filter
= 0;
4686 for (auto& s
: states
) {
4688 filter
|= OSDMap::DUMP_UP
;
4689 } else if (s
== "down") {
4690 filter
|= OSDMap::DUMP_DOWN
;
4691 } else if (s
== "in") {
4692 filter
|= OSDMap::DUMP_IN
;
4693 } else if (s
== "out") {
4694 filter
|= OSDMap::DUMP_OUT
;
4695 } else if (s
== "destroyed") {
4696 filter
|= OSDMap::DUMP_DESTROYED
;
4698 ss
<< "unrecognized state '" << s
<< "'";
4703 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
4704 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
4705 ss
<< "cannot specify both 'in' and 'out'";
4709 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
4710 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
4711 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
4712 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
4713 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
4714 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
4715 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
4720 f
->open_object_section("tree");
4721 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
4725 p
->print_tree(NULL
, &ds
, filter
, bucket
);
4728 } else if (prefix
== "osd getmap") {
4729 rdata
.append(osdmap_bl
);
4730 ss
<< "got osdmap epoch " << p
->get_epoch();
4731 } else if (prefix
== "osd getcrushmap") {
4732 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
4733 ss
<< p
->get_crush_version();
4734 } else if (prefix
== "osd ls-tree") {
4736 cmd_getval(cct
, cmdmap
, "name", bucket_name
);
4738 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
4740 ss
<< "\"" << bucket_name
<< "\" does not exist";
4743 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
4748 f
->open_array_section("osds");
4749 for (auto &i
: osds
) {
4750 if (osdmap
.exists(i
)) {
4751 f
->dump_int("osd", i
);
4758 for (auto &i
: osds
) {
4759 if (osdmap
.exists(i
)) {
4770 } else if (prefix
== "osd getmaxosd") {
4772 f
->open_object_section("getmaxosd");
4773 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4774 f
->dump_int("max_osd", osdmap
.get_max_osd());
4778 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
4781 } else if (prefix
== "osd utilization") {
4783 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
4790 } else if (prefix
== "osd find") {
4792 if (!cmd_getval(cct
, cmdmap
, "id", osd
)) {
4793 ss
<< "unable to parse osd id value '"
4794 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4798 if (!osdmap
.exists(osd
)) {
4799 ss
<< "osd." << osd
<< " does not exist";
4804 cmd_getval(cct
, cmdmap
, "format", format
);
4805 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4806 f
->open_object_section("osd_location");
4807 f
->dump_int("osd", osd
);
4808 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
4809 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
4811 // try to identify host, pod/container name, etc.
4812 map
<string
,string
> m
;
4813 load_metadata(osd
, m
, nullptr);
4814 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
4815 f
->dump_string("host", p
->second
);
4818 "pod_name", "pod_namespace", // set by rook
4819 "container_name" // set by ceph-ansible
4821 if (auto p
= m
.find(k
); p
!= m
.end()) {
4822 f
->dump_string(k
, p
->second
);
4826 // crush is helpful too
4827 f
->open_object_section("crush_location");
4828 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
4829 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
4830 f
->dump_string(p
->first
.c_str(), p
->second
);
4834 } else if (prefix
== "osd metadata") {
4836 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
4837 !cmd_getval(cct
, cmdmap
, "id", osd
)) {
4838 ss
<< "unable to parse osd id value '"
4839 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4843 if (osd
>= 0 && !osdmap
.exists(osd
)) {
4844 ss
<< "osd." << osd
<< " does not exist";
4849 cmd_getval(cct
, cmdmap
, "format", format
);
4850 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4852 f
->open_object_section("osd_metadata");
4853 f
->dump_unsigned("id", osd
);
4854 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
4860 f
->open_array_section("osd_metadata");
4861 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4862 if (osdmap
.exists(i
)) {
4863 f
->open_object_section("osd");
4864 f
->dump_unsigned("id", i
);
4865 r
= dump_osd_metadata(i
, f
.get(), NULL
);
4866 if (r
== -EINVAL
|| r
== -ENOENT
) {
4867 // Drop error, continue to get other daemons' metadata
4868 dout(4) << "No metadata for osd." << i
<< dendl
;
4880 } else if (prefix
== "osd versions") {
4882 f
.reset(Formatter::create("json-pretty"));
4883 count_metadata("ceph_version", f
.get());
4886 } else if (prefix
== "osd count-metadata") {
4888 f
.reset(Formatter::create("json-pretty"));
4890 cmd_getval(cct
, cmdmap
, "property", field
);
4891 count_metadata(field
, f
.get());
4894 } else if (prefix
== "osd numa-status") {
4897 f
->open_array_section("osds");
4899 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
4900 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
4901 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
4902 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
4903 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
4904 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
4906 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4907 if (osdmap
.exists(i
)) {
4908 map
<string
,string
> m
;
4910 if (load_metadata(i
, m
, &err
) < 0) {
4914 auto p
= m
.find("hostname");
4919 f
->open_object_section("osd");
4920 f
->dump_int("osd", i
);
4921 f
->dump_string("host", host
);
4922 for (auto n
: { "network_numa_node", "objectstore_numa_node",
4926 f
->dump_int(n
, atoi(p
->second
.c_str()));
4929 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
4932 list
<string
> ls
= get_str_list(p
->second
, ",");
4933 f
->open_array_section(n
);
4934 for (auto node
: ls
) {
4935 f
->dump_int("node", atoi(node
.c_str()));
4940 for (auto n
: { "numa_node_cpus" }) {
4943 dump_cpu_list(f
.get(), n
, p
->second
);
4950 p
= m
.find("network_numa_nodes");
4956 p
= m
.find("objectstore_numa_nodes");
4962 p
= m
.find("numa_node");
4963 auto q
= m
.find("numa_node_cpus");
4964 if (p
!= m
.end() && q
!= m
.end()) {
4971 tbl
<< TextTable::endrow
;
4979 rdata
.append(stringify(tbl
));
4981 } else if (prefix
== "osd map") {
4982 string poolstr
, objstr
, namespacestr
;
4983 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
4984 cmd_getval(cct
, cmdmap
, "object", objstr
);
4985 cmd_getval(cct
, cmdmap
, "nspace", namespacestr
);
4987 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4989 ss
<< "pool " << poolstr
<< " does not exist";
4993 object_locator_t
oloc(pool
, namespacestr
);
4994 object_t
oid(objstr
);
4995 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
4996 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4997 vector
<int> up
, acting
;
4999 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5002 if (!namespacestr
.empty())
5003 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5005 fullobjname
= oid
.name
;
5007 f
->open_object_section("osd_map");
5008 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5009 f
->dump_string("pool", poolstr
);
5010 f
->dump_int("pool_id", pool
);
5011 f
->dump_stream("objname") << fullobjname
;
5012 f
->dump_stream("raw_pgid") << pgid
;
5013 f
->dump_stream("pgid") << mpgid
;
5014 f
->open_array_section("up");
5015 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5016 f
->dump_int("osd", *p
);
5018 f
->dump_int("up_primary", up_p
);
5019 f
->open_array_section("acting");
5020 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5021 f
->dump_int("osd", *p
);
5023 f
->dump_int("acting_primary", acting_p
);
5024 f
->close_section(); // osd_map
5027 ds
<< "osdmap e" << osdmap
.get_epoch()
5028 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5029 << " object '" << fullobjname
<< "' ->"
5030 << " pg " << pgid
<< " (" << mpgid
<< ")"
5031 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5032 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5036 } else if (prefix
== "pg map") {
5039 cmd_getval(cct
, cmdmap
, "pgid", pgidstr
);
5040 if (!pgid
.parse(pgidstr
.c_str())) {
5041 ss
<< "invalid pgid '" << pgidstr
<< "'";
5045 vector
<int> up
, acting
;
5046 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5047 ss
<< "pg '" << pgidstr
<< "' does not exist";
5051 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5052 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5054 f
->open_object_section("pg_map");
5055 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5056 f
->dump_stream("raw_pgid") << pgid
;
5057 f
->dump_stream("pgid") << mpgid
;
5058 f
->open_array_section("up");
5059 for (auto osd
: up
) {
5060 f
->dump_int("up_osd", osd
);
5063 f
->open_array_section("acting");
5064 for (auto osd
: acting
) {
5065 f
->dump_int("acting_osd", osd
);
5071 ds
<< "osdmap e" << osdmap
.get_epoch()
5072 << " pg " << pgid
<< " (" << mpgid
<< ")"
5073 << " -> up " << up
<< " acting " << acting
;
5078 } else if (prefix
== "osd lspools") {
5080 f
->open_array_section("pools");
5081 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5082 p
!= osdmap
.pools
.end();
5085 f
->open_object_section("pool");
5086 f
->dump_int("poolnum", p
->first
);
5087 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5090 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5091 if (next(p
) != osdmap
.pools
.end()) {
5101 } else if (prefix
== "osd blacklist ls") {
5103 f
->open_array_section("blacklist");
5105 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5106 p
!= osdmap
.blacklist
.end();
5109 f
->open_object_section("entry");
5110 f
->dump_string("addr", p
->first
.get_legacy_str());
5111 f
->dump_stream("until") << p
->second
;
5116 ss
<< p
->first
<< " " << p
->second
;
5126 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
5128 } else if (prefix
== "osd pool ls") {
5130 cmd_getval(cct
, cmdmap
, "detail", detail
);
5131 if (!f
&& detail
== "detail") {
5133 osdmap
.print_pools(ss
);
5134 rdata
.append(ss
.str());
5137 f
->open_array_section("pools");
5138 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
5139 it
!= osdmap
.get_pools().end();
5142 if (detail
== "detail") {
5143 f
->open_object_section("pool");
5144 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5145 it
->second
.dump(f
.get());
5148 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5151 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
5160 } else if (prefix
== "osd crush get-tunable") {
5162 cmd_getval(cct
, cmdmap
, "tunable", tunable
);
5165 f
->open_object_section("tunable");
5166 if (tunable
== "straw_calc_version") {
5168 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
5170 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
5179 rdata
.append(rss
.str());
5183 } else if (prefix
== "osd pool get") {
5185 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
5186 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5188 ss
<< "unrecognized pool '" << poolstr
<< "'";
5193 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
5195 cmd_getval(cct
, cmdmap
, "var", var
);
5197 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
5198 const choices_map_t ALL_CHOICES
= {
5200 {"min_size", MIN_SIZE
},
5201 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
5202 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
5203 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
5204 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
5205 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
5206 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
5207 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
5208 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
5209 {"use_gmt_hitset", USE_GMT_HITSET
},
5210 {"target_max_objects", TARGET_MAX_OBJECTS
},
5211 {"target_max_bytes", TARGET_MAX_BYTES
},
5212 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
5213 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
5214 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
5215 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
5216 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
5217 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
5218 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
5219 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
5220 {"fast_read", FAST_READ
},
5221 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
5222 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
5223 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
5224 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
5225 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
5226 {"recovery_priority", RECOVERY_PRIORITY
},
5227 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
5228 {"scrub_priority", SCRUB_PRIORITY
},
5229 {"compression_mode", COMPRESSION_MODE
},
5230 {"compression_algorithm", COMPRESSION_ALGORITHM
},
5231 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
5232 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
5233 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
5234 {"csum_type", CSUM_TYPE
},
5235 {"csum_max_block", CSUM_MAX_BLOCK
},
5236 {"csum_min_block", CSUM_MIN_BLOCK
},
5237 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
5238 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
5239 {"pg_num_min", PG_NUM_MIN
},
5240 {"target_size_bytes", TARGET_SIZE_BYTES
},
5241 {"target_size_ratio", TARGET_SIZE_RATIO
},
5242 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
5245 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
5247 const choices_set_t ONLY_TIER_CHOICES
= {
5248 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5249 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
5250 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5251 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5252 MIN_READ_RECENCY_FOR_PROMOTE
,
5253 MIN_WRITE_RECENCY_FOR_PROMOTE
,
5254 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
5256 const choices_set_t ONLY_ERASURE_CHOICES
= {
5257 EC_OVERWRITES
, ERASURE_CODE_PROFILE
5260 choices_set_t selected_choices
;
5262 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
5263 it
!= ALL_CHOICES
.end(); ++it
) {
5264 selected_choices
.insert(it
->second
);
5268 selected_choices
= subtract_second_from_first(selected_choices
,
5272 if(!p
->is_erasure()) {
5273 selected_choices
= subtract_second_from_first(selected_choices
,
5274 ONLY_ERASURE_CHOICES
);
5276 } else /* var != "all" */ {
5277 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
5278 osd_pool_get_choices selected
= found
->second
;
5280 if (!p
->is_tier() &&
5281 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
5282 ss
<< "pool '" << poolstr
5283 << "' is not a tier pool: variable not applicable";
5288 if (!p
->is_erasure() &&
5289 ONLY_ERASURE_CHOICES
.find(selected
)
5290 != ONLY_ERASURE_CHOICES
.end()) {
5291 ss
<< "pool '" << poolstr
5292 << "' is not a erasure pool: variable not applicable";
5297 if (pool_opts_t::is_opt_name(var
) &&
5298 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
5299 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
5304 selected_choices
.insert(selected
);
5308 f
->open_object_section("pool");
5309 f
->dump_string("pool", poolstr
);
5310 f
->dump_int("pool_id", pool
);
5311 for(choices_set_t::const_iterator it
= selected_choices
.begin();
5312 it
!= selected_choices
.end(); ++it
) {
5313 choices_map_t::const_iterator i
;
5314 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5315 if (i
->second
== *it
) {
5319 ceph_assert(i
!= ALL_CHOICES
.end());
5322 f
->dump_int("pg_num", p
->get_pg_num());
5325 f
->dump_int("pgp_num", p
->get_pgp_num());
5328 f
->dump_int("size", p
->get_size());
5331 f
->dump_int("min_size", p
->get_min_size());
5334 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
5335 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
5336 p
->get_crush_rule()));
5338 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
5342 f
->dump_bool("allow_ec_overwrites",
5343 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
5345 case PG_AUTOSCALE_MODE
:
5346 f
->dump_string("pg_autoscale_mode",
5347 pg_pool_t::get_pg_autoscale_mode_name(
5348 p
->pg_autoscale_mode
));
5354 case WRITE_FADVISE_DONTNEED
:
5357 f
->dump_bool(i
->first
.c_str(),
5358 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
5360 case HIT_SET_PERIOD
:
5361 f
->dump_int("hit_set_period", p
->hit_set_period
);
5364 f
->dump_int("hit_set_count", p
->hit_set_count
);
5367 f
->dump_string("hit_set_type",
5368 HitSet::get_type_name(p
->hit_set_params
.get_type()));
5372 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
5373 BloomHitSet::Params
*bloomp
=
5374 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
5375 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
5376 } else if(var
!= "all") {
5378 ss
<< "hit set is not of type Bloom; " <<
5379 "invalid to get a false positive rate!";
5385 case USE_GMT_HITSET
:
5386 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
5388 case TARGET_MAX_OBJECTS
:
5389 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
5391 case TARGET_MAX_BYTES
:
5392 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
5394 case CACHE_TARGET_DIRTY_RATIO
:
5395 f
->dump_unsigned("cache_target_dirty_ratio_micro",
5396 p
->cache_target_dirty_ratio_micro
);
5397 f
->dump_float("cache_target_dirty_ratio",
5398 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
5400 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
5401 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
5402 p
->cache_target_dirty_high_ratio_micro
);
5403 f
->dump_float("cache_target_dirty_high_ratio",
5404 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
5406 case CACHE_TARGET_FULL_RATIO
:
5407 f
->dump_unsigned("cache_target_full_ratio_micro",
5408 p
->cache_target_full_ratio_micro
);
5409 f
->dump_float("cache_target_full_ratio",
5410 ((float)p
->cache_target_full_ratio_micro
/1000000));
5412 case CACHE_MIN_FLUSH_AGE
:
5413 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
5415 case CACHE_MIN_EVICT_AGE
:
5416 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
5418 case ERASURE_CODE_PROFILE
:
5419 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
5421 case MIN_READ_RECENCY_FOR_PROMOTE
:
5422 f
->dump_int("min_read_recency_for_promote",
5423 p
->min_read_recency_for_promote
);
5425 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
5426 f
->dump_int("min_write_recency_for_promote",
5427 p
->min_write_recency_for_promote
);
5430 f
->dump_int("fast_read", p
->fast_read
);
5432 case HIT_SET_GRADE_DECAY_RATE
:
5433 f
->dump_int("hit_set_grade_decay_rate",
5434 p
->hit_set_grade_decay_rate
);
5436 case HIT_SET_SEARCH_LAST_N
:
5437 f
->dump_int("hit_set_search_last_n",
5438 p
->hit_set_search_last_n
);
5440 case SCRUB_MIN_INTERVAL
:
5441 case SCRUB_MAX_INTERVAL
:
5442 case DEEP_SCRUB_INTERVAL
:
5443 case RECOVERY_PRIORITY
:
5444 case RECOVERY_OP_PRIORITY
:
5445 case SCRUB_PRIORITY
:
5446 case COMPRESSION_MODE
:
5447 case COMPRESSION_ALGORITHM
:
5448 case COMPRESSION_REQUIRED_RATIO
:
5449 case COMPRESSION_MAX_BLOB_SIZE
:
5450 case COMPRESSION_MIN_BLOB_SIZE
:
5452 case CSUM_MAX_BLOCK
:
5453 case CSUM_MIN_BLOCK
:
5454 case FINGERPRINT_ALGORITHM
:
5456 case TARGET_SIZE_BYTES
:
5457 case TARGET_SIZE_RATIO
:
5458 case PG_AUTOSCALE_BIAS
:
5459 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
5460 if (p
->opts
.is_set(key
)) {
5461 if(*it
== CSUM_TYPE
) {
5463 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
5464 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
5466 p
->opts
.dump(i
->first
, f
.get());
5475 for(choices_set_t::const_iterator it
= selected_choices
.begin();
5476 it
!= selected_choices
.end(); ++it
) {
5477 choices_map_t::const_iterator i
;
5480 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
5483 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
5486 ss
<< "size: " << p
->get_size() << "\n";
5489 ss
<< "min_size: " << p
->get_min_size() << "\n";
5492 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
5493 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
5494 p
->get_crush_rule()) << "\n";
5496 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
5499 case PG_AUTOSCALE_MODE
:
5500 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
5501 p
->pg_autoscale_mode
) <<"\n";
5503 case HIT_SET_PERIOD
:
5504 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
5507 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
5510 ss
<< "hit_set_type: " <<
5511 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
5515 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
5516 BloomHitSet::Params
*bloomp
=
5517 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
5518 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
5519 } else if(var
!= "all") {
5520 ss
<< "hit set is not of type Bloom; " <<
5521 "invalid to get a false positive rate!";
5527 case USE_GMT_HITSET
:
5528 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
5530 case TARGET_MAX_OBJECTS
:
5531 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
5533 case TARGET_MAX_BYTES
:
5534 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
5536 case CACHE_TARGET_DIRTY_RATIO
:
5537 ss
<< "cache_target_dirty_ratio: "
5538 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
5540 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
5541 ss
<< "cache_target_dirty_high_ratio: "
5542 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
5544 case CACHE_TARGET_FULL_RATIO
:
5545 ss
<< "cache_target_full_ratio: "
5546 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
5548 case CACHE_MIN_FLUSH_AGE
:
5549 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
5551 case CACHE_MIN_EVICT_AGE
:
5552 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
5554 case ERASURE_CODE_PROFILE
:
5555 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
5557 case MIN_READ_RECENCY_FOR_PROMOTE
:
5558 ss
<< "min_read_recency_for_promote: " <<
5559 p
->min_read_recency_for_promote
<< "\n";
5561 case HIT_SET_GRADE_DECAY_RATE
:
5562 ss
<< "hit_set_grade_decay_rate: " <<
5563 p
->hit_set_grade_decay_rate
<< "\n";
5565 case HIT_SET_SEARCH_LAST_N
:
5566 ss
<< "hit_set_search_last_n: " <<
5567 p
->hit_set_search_last_n
<< "\n";
5570 ss
<< "allow_ec_overwrites: " <<
5571 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
5578 case WRITE_FADVISE_DONTNEED
:
5581 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5582 if (i
->second
== *it
)
5585 ceph_assert(i
!= ALL_CHOICES
.end());
5586 ss
<< i
->first
<< ": " <<
5587 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
5588 "true" : "false") << "\n";
5590 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
5591 ss
<< "min_write_recency_for_promote: " <<
5592 p
->min_write_recency_for_promote
<< "\n";
5595 ss
<< "fast_read: " << p
->fast_read
<< "\n";
5597 case SCRUB_MIN_INTERVAL
:
5598 case SCRUB_MAX_INTERVAL
:
5599 case DEEP_SCRUB_INTERVAL
:
5600 case RECOVERY_PRIORITY
:
5601 case RECOVERY_OP_PRIORITY
:
5602 case SCRUB_PRIORITY
:
5603 case COMPRESSION_MODE
:
5604 case COMPRESSION_ALGORITHM
:
5605 case COMPRESSION_REQUIRED_RATIO
:
5606 case COMPRESSION_MAX_BLOB_SIZE
:
5607 case COMPRESSION_MIN_BLOB_SIZE
:
5609 case CSUM_MAX_BLOCK
:
5610 case CSUM_MIN_BLOCK
:
5611 case FINGERPRINT_ALGORITHM
:
5613 case TARGET_SIZE_BYTES
:
5614 case TARGET_SIZE_RATIO
:
5615 case PG_AUTOSCALE_BIAS
:
5616 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5617 if (i
->second
== *it
)
5620 ceph_assert(i
!= ALL_CHOICES
.end());
5622 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
5623 if (p
->opts
.is_set(key
)) {
5624 if(key
== pool_opts_t::CSUM_TYPE
) {
5626 p
->opts
.get(key
, &val
);
5627 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
5629 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
5635 rdata
.append(ss
.str());
5640 } else if (prefix
== "osd pool get-quota") {
5642 cmd_getval(cct
, cmdmap
, "pool", pool_name
);
5644 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
5646 ceph_assert(poolid
== -ENOENT
);
5647 ss
<< "unrecognized pool '" << pool_name
<< "'";
5651 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
5654 f
->open_object_section("pool_quotas");
5655 f
->dump_string("pool_name", pool_name
);
5656 f
->dump_unsigned("pool_id", poolid
);
5657 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
5658 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
5663 rs
<< "quotas for pool '" << pool_name
<< "':\n"
5664 << " max objects: ";
5665 if (p
->quota_max_objects
== 0)
5668 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
5671 if (p
->quota_max_bytes
== 0)
5674 rs
<< byte_u_t(p
->quota_max_bytes
);
5675 rdata
.append(rs
.str());
5679 } else if (prefix
== "osd crush rule list" ||
5680 prefix
== "osd crush rule ls") {
5682 f
->open_array_section("rules");
5683 osdmap
.crush
->list_rules(f
.get());
5688 osdmap
.crush
->list_rules(&ss
);
5689 rdata
.append(ss
.str());
5691 } else if (prefix
== "osd crush rule ls-by-class") {
5693 cmd_getval(cct
, cmdmap
, "class", class_name
);
5694 if (class_name
.empty()) {
5695 ss
<< "no class specified";
5700 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
5702 ss
<< "failed to get rules by class '" << class_name
<< "'";
5706 f
->open_array_section("rules");
5707 for (auto &rule
: rules
) {
5708 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
5714 for (auto &rule
: rules
) {
5715 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
5717 rdata
.append(rs
.str());
5719 } else if (prefix
== "osd crush rule dump") {
5721 cmd_getval(cct
, cmdmap
, "name", name
);
5723 cmd_getval(cct
, cmdmap
, "format", format
);
5724 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5726 f
->open_array_section("rules");
5727 osdmap
.crush
->dump_rules(f
.get());
5730 int ruleno
= osdmap
.crush
->get_rule_id(name
);
5732 ss
<< "unknown crush rule '" << name
<< "'";
5736 osdmap
.crush
->dump_rule(ruleno
, f
.get());
5741 rdata
.append(rs
.str());
5742 } else if (prefix
== "osd crush dump") {
5744 cmd_getval(cct
, cmdmap
, "format", format
);
5745 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5746 f
->open_object_section("crush_map");
5747 osdmap
.crush
->dump(f
.get());
5752 rdata
.append(rs
.str());
5753 } else if (prefix
== "osd crush show-tunables") {
5755 cmd_getval(cct
, cmdmap
, "format", format
);
5756 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5757 f
->open_object_section("crush_map_tunables");
5758 osdmap
.crush
->dump_tunables(f
.get());
5763 rdata
.append(rs
.str());
5764 } else if (prefix
== "osd crush tree") {
5766 cmd_getval(cct
, cmdmap
, "shadow", shadow
);
5767 bool show_shadow
= shadow
== "--show-shadow";
5768 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5770 f
->open_object_section("crush_tree");
5771 osdmap
.crush
->dump_tree(nullptr,
5773 osdmap
.get_pool_names(),
5779 osdmap
.crush
->dump_tree(&ss
,
5781 osdmap
.get_pool_names(),
5783 rdata
.append(ss
.str());
5785 } else if (prefix
== "osd crush ls") {
5787 if (!cmd_getval(cct
, cmdmap
, "node", name
)) {
5788 ss
<< "no node specified";
5792 if (!osdmap
.crush
->name_exists(name
)) {
5793 ss
<< "node '" << name
<< "' does not exist";
5797 int id
= osdmap
.crush
->get_item_id(name
);
5800 result
.push_back(id
);
5802 int num
= osdmap
.crush
->get_bucket_size(id
);
5803 for (int i
= 0; i
< num
; ++i
) {
5804 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
5808 f
->open_array_section("items");
5809 for (auto i
: result
) {
5810 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
5816 for (auto i
: result
) {
5817 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
5819 rdata
.append(ss
.str());
5822 } else if (prefix
== "osd crush class ls") {
5823 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5824 f
->open_array_section("crush_classes");
5825 for (auto i
: osdmap
.crush
->class_name
)
5826 f
->dump_string("class", i
.second
);
5829 } else if (prefix
== "osd crush class ls-osd") {
5831 cmd_getval(cct
, cmdmap
, "class", name
);
5833 osdmap
.crush
->get_devices_by_class(name
, &osds
);
5835 f
->open_array_section("osds");
5836 for (auto &osd
: osds
)
5837 f
->dump_int("osd", osd
);
5842 for (auto &osd
: osds
) {
5850 } else if (prefix
== "osd crush get-device-class") {
5851 vector
<string
> idvec
;
5852 cmd_getval(cct
, cmdmap
, "ids", idvec
);
5853 map
<int, string
> class_by_osd
;
5854 for (auto& id
: idvec
) {
5856 long osd
= parse_osd_id(id
.c_str(), &ts
);
5858 ss
<< "unable to parse osd id:'" << id
<< "'";
5862 auto device_class
= osdmap
.crush
->get_item_class(osd
);
5864 class_by_osd
[osd
] = device_class
;
5866 class_by_osd
[osd
] = ""; // no class
5869 f
->open_array_section("osd_device_classes");
5870 for (auto& i
: class_by_osd
) {
5871 f
->open_object_section("osd_device_class");
5872 f
->dump_int("osd", i
.first
);
5873 f
->dump_string("device_class", i
.second
);
5879 if (class_by_osd
.size() == 1) {
5880 // for single input, make a clean output
5881 ds
<< class_by_osd
.begin()->second
;
5883 // note that we do not group osds by class here
5884 for (auto it
= class_by_osd
.begin();
5885 it
!= class_by_osd
.end();
5887 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
5888 if (next(it
) != class_by_osd
.end())
5894 } else if (prefix
== "osd erasure-code-profile ls") {
5895 const auto &profiles
= osdmap
.get_erasure_code_profiles();
5897 f
->open_array_section("erasure-code-profiles");
5898 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
5900 f
->dump_string("profile", i
->first
.c_str());
5902 rdata
.append(i
->first
+ "\n");
5909 rdata
.append(rs
.str());
5911 } else if (prefix
== "osd crush weight-set ls") {
5912 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5914 f
->open_array_section("weight_sets");
5915 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5916 f
->dump_string("pool", "(compat)");
5918 for (auto& i
: osdmap
.crush
->choose_args
) {
5920 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
5927 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5930 for (auto& i
: osdmap
.crush
->choose_args
) {
5932 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
5935 rdata
.append(rs
.str());
5937 } else if (prefix
== "osd crush weight-set dump") {
5938 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5940 osdmap
.crush
->dump_choose_args(f
.get());
5942 } else if (prefix
== "osd erasure-code-profile get") {
5944 cmd_getval(cct
, cmdmap
, "name", name
);
5945 if (!osdmap
.has_erasure_code_profile(name
)) {
5946 ss
<< "unknown erasure code profile '" << name
<< "'";
5950 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
5952 f
->open_object_section("profile");
5953 for (map
<string
,string
>::const_iterator i
= profile
.begin();
5957 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
5959 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
5966 rdata
.append(rs
.str());
5968 } else if (prefix
== "osd pool application get") {
5969 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5972 cmd_getval(cct
, cmdmap
, "pool", pool_name
);
5974 cmd_getval(cct
, cmdmap
, "app", app
);
5976 cmd_getval(cct
, cmdmap
, "key", key
);
5978 if (pool_name
.empty()) {
5980 f
->open_object_section("pools");
5981 for (const auto &pool
: osdmap
.pools
) {
5982 std::string
name("<unknown>");
5983 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
5984 if (pni
!= osdmap
.pool_name
.end())
5986 f
->open_object_section(name
.c_str());
5987 for (auto &app_pair
: pool
.second
.application_metadata
) {
5988 f
->open_object_section(app_pair
.first
.c_str());
5989 for (auto &kv_pair
: app_pair
.second
) {
5990 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5994 f
->close_section(); // name
5996 f
->close_section(); // pools
5999 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6001 ss
<< "unrecognized pool '" << pool_name
<< "'";
6005 auto p
= osdmap
.get_pg_pool(pool
);
6008 f
->open_object_section(pool_name
.c_str());
6009 for (auto &app_pair
: p
->application_metadata
) {
6010 f
->open_object_section(app_pair
.first
.c_str());
6011 for (auto &kv_pair
: app_pair
.second
) {
6012 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6014 f
->close_section(); // application
6016 f
->close_section(); // pool_name
6021 auto app_it
= p
->application_metadata
.find(app
);
6022 if (app_it
== p
->application_metadata
.end()) {
6023 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6027 // filter by pool + app
6029 f
->open_object_section(app_it
->first
.c_str());
6030 for (auto &kv_pair
: app_it
->second
) {
6031 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6033 f
->close_section(); // application
6037 // filter by pool + app + key
6038 auto key_it
= app_it
->second
.find(key
);
6039 if (key_it
== app_it
->second
.end()) {
6040 ss
<< "application '" << app
<< "' on pool '" << pool_name
6041 << "' does not have key '" << key
<< "'";
6045 ss
<< key_it
->second
<< "\n";
6046 rdata
.append(ss
.str());
6049 } else if (prefix
== "osd get-require-min-compat-client") {
6050 ss
<< ceph_release_name(osdmap
.require_min_compat_client
) << std::endl
;
6051 rdata
.append(ss
.str());
6054 } else if (prefix
== "osd pool application enable" ||
6055 prefix
== "osd pool application disable" ||
6056 prefix
== "osd pool application set" ||
6057 prefix
== "osd pool application rm") {
6058 bool changed
= false;
6059 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6063 } else if (changed
) {
6064 // Valid mutation, proceed to prepare phase
6067 // Idempotent case, reply
6071 // try prepare update
6078 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
6082 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
6084 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6085 osdmap
.get_pg_pool(pool_id
));
6087 pool
->set_flag(flags
);
6090 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
6092 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6093 osdmap
.get_pg_pool(pool_id
));
6095 pool
->unset_flag(flags
);
6098 string
OSDMonitor::make_snap_epoch_key(int64_t pool
, epoch_t epoch
)
6101 snprintf(k
, sizeof(k
), "removed_epoch_%llu_%08lx",
6102 (unsigned long long)pool
, (unsigned long)epoch
);
6106 string
OSDMonitor::make_snap_key(int64_t pool
, snapid_t snap
)
6109 snprintf(k
, sizeof(k
), "removed_snap_%llu_%016llx",
6110 (unsigned long long)pool
, (unsigned long long)snap
);
6115 string
OSDMonitor::make_snap_key_value(
6116 int64_t pool
, snapid_t snap
, snapid_t num
,
6117 epoch_t epoch
, bufferlist
*v
)
6119 // encode the *last* epoch in the key so that we can use forward
6120 // iteration only to search for an epoch in an interval.
6122 encode(snap
+ num
, *v
);
6124 return make_snap_key(pool
, snap
+ num
- 1);
6127 string
OSDMonitor::make_snap_purged_key(int64_t pool
, snapid_t snap
)
6130 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
6131 (unsigned long long)pool
, (unsigned long long)snap
);
6134 string
OSDMonitor::make_snap_purged_key_value(
6135 int64_t pool
, snapid_t snap
, snapid_t num
,
6136 epoch_t epoch
, bufferlist
*v
)
6138 // encode the *last* epoch in the key so that we can use forward
6139 // iteration only to search for an epoch in an interval.
6141 encode(snap
+ num
, *v
);
6143 return make_snap_purged_key(pool
, snap
+ num
- 1);
6146 int OSDMonitor::lookup_pruned_snap(int64_t pool
, snapid_t snap
,
6147 snapid_t
*begin
, snapid_t
*end
)
6149 string k
= make_snap_key(pool
, snap
);
6150 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
6155 if (it
->key().find(OSD_SNAP_PREFIX
) != 0) {
6158 bufferlist v
= it
->value();
6159 auto p
= v
.cbegin();
6162 if (snap
< *begin
|| snap
>= *end
) {
6168 bool OSDMonitor::try_prune_purged_snaps()
6170 if (!mon
->mgrstatmon()->is_readable()) {
6173 if (osdmap
.require_osd_release
< CEPH_RELEASE_MIMIC
) {
6176 if (!pending_inc
.new_purged_snaps
.empty()) {
6177 return false; // we already pruned for this epoch
6180 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
6181 "mon_max_snap_prune_per_epoch");
6185 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
6187 unsigned actually_pruned
= 0;
6188 auto& purged_snaps
= mon
->mgrstatmon()->get_digest().purged_snaps
;
6189 for (auto& p
: osdmap
.get_pools()) {
6190 auto q
= purged_snaps
.find(p
.first
);
6191 if (q
== purged_snaps
.end()) {
6194 auto& purged
= q
->second
;
6195 if (purged
.empty()) {
6196 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
6199 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
6200 OSDMap::snap_interval_set_t to_prune
;
6201 unsigned maybe_pruned
= actually_pruned
;
6202 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
6203 snapid_t begin
= i
.get_start();
6204 auto end
= i
.get_start() + i
.get_len();
6205 snapid_t pbegin
= 0, pend
= 0;
6206 int r
= lookup_pruned_snap(p
.first
, begin
, &pbegin
, &pend
);
6209 // be a bit aggressive about backing off here, because the mon may
6210 // do a lot of work going through this set, and if we know the
6211 // purged set from the OSDs is at least *partly* stale we may as
6212 // well wait for it to be fresh.
6213 dout(20) << __func__
<< " we've already pruned " << pbegin
6214 << "~" << (pend
- pbegin
) << dendl
;
6217 if (pbegin
&& pbegin
< end
) {
6218 // the tail of [begin,end) is purged; shorten the range
6219 ceph_assert(pbegin
> begin
);
6222 to_prune
.insert(begin
, end
- begin
);
6223 maybe_pruned
+= end
- begin
;
6224 if (maybe_pruned
>= max_prune
) {
6228 if (!to_prune
.empty()) {
6229 // PGs may still be reporting things as purged that we have already
6230 // pruned from removed_snaps_queue.
6231 OSDMap::snap_interval_set_t actual
;
6232 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
6233 if (r
!= osdmap
.removed_snaps_queue
.end()) {
6234 actual
.intersection_of(to_prune
, r
->second
);
6236 actually_pruned
+= actual
.size();
6237 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
6238 << ", actual pruned " << actual
<< dendl
;
6239 if (!actual
.empty()) {
6240 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
6243 if (actually_pruned
>= max_prune
) {
6247 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
6248 return !!actually_pruned
;
6251 bool OSDMonitor::update_pools_status()
6253 if (!mon
->mgrstatmon()->is_readable())
6258 auto& pools
= osdmap
.get_pools();
6259 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
6260 const pool_stat_t
*pstat
= mon
->mgrstatmon()->get_pool_stat(it
->first
);
6263 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6264 const pg_pool_t
&pool
= it
->second
;
6265 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
6268 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
6269 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
6271 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
6275 mon
->clog
->info() << "pool '" << pool_name
6276 << "' no longer out of quota; removing NO_QUOTA flag";
6277 // below we cancel FLAG_FULL too, we'll set it again in
6278 // OSDMonitor::encode_pending if it still fails the osd-full checking.
6279 clear_pool_flags(it
->first
,
6280 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
6286 if (pool
.quota_max_bytes
> 0 &&
6287 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
6288 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
6289 << " (reached quota's max_bytes: "
6290 << byte_u_t(pool
.quota_max_bytes
) << ")";
6292 if (pool
.quota_max_objects
> 0 &&
6293 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
6294 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
6295 << " (reached quota's max_objects: "
6296 << pool
.quota_max_objects
<< ")";
6298 // set both FLAG_FULL_QUOTA and FLAG_FULL
6299 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
6300 // since FLAG_FULL should always take precedence
6301 set_pool_flags(it
->first
,
6302 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
6303 clear_pool_flags(it
->first
,
6304 pg_pool_t::FLAG_NEARFULL
|
6305 pg_pool_t::FLAG_BACKFILLFULL
);
6312 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
6314 op
->mark_osdmon_event(__func__
);
6315 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
6316 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
6317 MonSession
*session
= op
->get_session();
6320 string erasure_code_profile
;
6324 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
6326 erasure_code_profile
,
6327 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
6330 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
6335 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
6336 const string
& dstname
,
6341 // Avoid creating a pending crush if it does not already exists and
6342 // the rename would fail.
6344 if (!_have_pending_crush()) {
6345 ret
= _get_stable_crush().can_rename_bucket(srcname
,
6352 CrushWrapper newcrush
;
6353 _get_pending_crush(newcrush
);
6355 ret
= newcrush
.rename_bucket(srcname
,
6361 pending_inc
.crush
.clear();
6362 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6363 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
6367 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
6369 string replacement
= "";
6371 if (plugin
== "jerasure_generic" ||
6372 plugin
== "jerasure_sse3" ||
6373 plugin
== "jerasure_sse4" ||
6374 plugin
== "jerasure_neon") {
6375 replacement
= "jerasure";
6376 } else if (plugin
== "shec_generic" ||
6377 plugin
== "shec_sse3" ||
6378 plugin
== "shec_sse4" ||
6379 plugin
== "shec_neon") {
6380 replacement
= "shec";
6383 if (replacement
!= "") {
6384 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
6385 << plugin
<< " that has been deprecated. Please use "
6386 << replacement
<< " instead." << dendl
;
6390 int OSDMonitor::normalize_profile(const string
& profilename
,
6391 ErasureCodeProfile
&profile
,
6395 ErasureCodeInterfaceRef erasure_code
;
6396 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
6397 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
6398 check_legacy_ec_plugin(plugin
->second
, profilename
);
6399 int err
= instance
.factory(plugin
->second
,
6400 g_conf().get_val
<std::string
>("erasure_code_dir"),
6401 profile
, &erasure_code
, ss
);
6406 err
= erasure_code
->init(profile
, ss
);
6411 auto it
= profile
.find("stripe_unit");
6412 if (it
!= profile
.end()) {
6414 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
6415 if (!err_str
.empty()) {
6416 *ss
<< "could not parse stripe_unit '" << it
->second
6417 << "': " << err_str
<< std::endl
;
6420 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
6421 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
6422 if (chunk_size
!= stripe_unit
) {
6423 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
6424 << "alignment. Would be padded to " << chunk_size
6428 if ((stripe_unit
% 4096) != 0 && !force
) {
6429 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
6430 << "use --force to override this check" << std::endl
;
6437 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
6438 const string
&profile
,
6442 int ruleid
= osdmap
.crush
->get_rule_id(name
);
6443 if (ruleid
!= -ENOENT
) {
6444 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
6448 CrushWrapper newcrush
;
6449 _get_pending_crush(newcrush
);
6451 ruleid
= newcrush
.get_rule_id(name
);
6452 if (ruleid
!= -ENOENT
) {
6453 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
6456 ErasureCodeInterfaceRef erasure_code
;
6457 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
6459 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
6463 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
6464 erasure_code
.reset();
6468 pending_inc
.crush
.clear();
6469 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6474 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
6475 ErasureCodeInterfaceRef
*erasure_code
,
6478 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
6480 ErasureCodeProfile profile
=
6481 osdmap
.get_erasure_code_profile(erasure_code_profile
);
6482 ErasureCodeProfile::const_iterator plugin
=
6483 profile
.find("plugin");
6484 if (plugin
== profile
.end()) {
6485 *ss
<< "cannot determine the erasure code plugin"
6486 << " because there is no 'plugin' entry in the erasure_code_profile "
6487 << profile
<< std::endl
;
6490 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
6491 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
6492 return instance
.factory(plugin
->second
,
6493 g_conf().get_val
<std::string
>("erasure_code_dir"),
6494 profile
, erasure_code
, ss
);
6497 int OSDMonitor::check_cluster_features(uint64_t features
,
6500 stringstream unsupported_ss
;
6501 int unsupported_count
= 0;
6502 if ((mon
->get_quorum_con_features() & features
) != features
) {
6503 unsupported_ss
<< "the monitor cluster";
6504 ++unsupported_count
;
6507 set
<int32_t> up_osds
;
6508 osdmap
.get_up_osds(up_osds
);
6509 for (set
<int32_t>::iterator it
= up_osds
.begin();
6510 it
!= up_osds
.end(); ++it
) {
6511 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
6512 if ((xi
.features
& features
) != features
) {
6513 if (unsupported_count
> 0)
6514 unsupported_ss
<< ", ";
6515 unsupported_ss
<< "osd." << *it
;
6516 unsupported_count
++;
6520 if (unsupported_count
> 0) {
6521 ss
<< "features " << features
<< " unsupported by: "
6522 << unsupported_ss
.str();
6526 // check pending osd state, too!
6527 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
6528 pending_inc
.new_xinfo
.begin();
6529 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
6530 const osd_xinfo_t
&xi
= p
->second
;
6531 if ((xi
.features
& features
) != features
) {
6532 dout(10) << __func__
<< " pending osd." << p
->first
6533 << " features are insufficient; retry" << dendl
;
6541 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
6544 OSDMap::Incremental new_pending
= pending_inc
;
6545 encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
6547 newmap
.deepish_copy_from(osdmap
);
6548 newmap
.apply_incremental(new_pending
);
6551 if (newmap
.require_min_compat_client
> 0) {
6552 auto mv
= newmap
.get_min_compat_client();
6553 if (mv
> newmap
.require_min_compat_client
) {
6554 ss
<< "new crush map requires client version " << ceph_release_name(mv
)
6555 << " but require_min_compat_client is "
6556 << ceph_release_name(newmap
.require_min_compat_client
);
6563 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
6564 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
6565 stringstream features_ss
;
6566 int r
= check_cluster_features(features
, features_ss
);
6568 ss
<< "Could not change CRUSH: " << features_ss
.str();
6575 bool OSDMonitor::erasure_code_profile_in_use(
6576 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
6577 const string
&profile
,
6581 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
6584 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
6585 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
6590 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
6595 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
6596 map
<string
,string
> *erasure_code_profile_map
,
6599 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
6602 erasure_code_profile_map
,
6606 ceph_assert((*erasure_code_profile_map
).count("plugin"));
6607 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
6608 map
<string
,string
> user_map
;
6609 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
6610 i
!= erasure_code_profile
.end();
6612 size_t equal
= i
->find('=');
6613 if (equal
== string::npos
) {
6614 user_map
[*i
] = string();
6615 (*erasure_code_profile_map
)[*i
] = string();
6617 const string key
= i
->substr(0, equal
);
6619 const string value
= i
->substr(equal
);
6620 if (key
.find("ruleset-") == 0) {
6621 *ss
<< "property '" << key
<< "' is no longer supported; try "
6622 << "'crush-" << key
.substr(8) << "' instead";
6625 user_map
[key
] = value
;
6626 (*erasure_code_profile_map
)[key
] = value
;
6630 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
6631 (*erasure_code_profile_map
) = user_map
;
6636 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
6637 const string
&erasure_code_profile
,
6639 unsigned *size
, unsigned *min_size
,
6643 switch (pool_type
) {
6644 case pg_pool_t::TYPE_REPLICATED
:
6645 if (repl_size
== 0) {
6646 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
6649 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
6651 case pg_pool_t::TYPE_ERASURE
:
6653 ErasureCodeInterfaceRef erasure_code
;
6654 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
6656 *size
= erasure_code
->get_chunk_count();
6658 erasure_code
->get_data_chunk_count() +
6659 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
6660 assert(*min_size
<= *size
);
6661 assert(*min_size
>= erasure_code
->get_data_chunk_count());
6666 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
6673 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
6674 const string
&erasure_code_profile
,
6675 uint32_t *stripe_width
,
6679 switch (pool_type
) {
6680 case pg_pool_t::TYPE_REPLICATED
:
6683 case pg_pool_t::TYPE_ERASURE
:
6685 ErasureCodeProfile profile
=
6686 osdmap
.get_erasure_code_profile(erasure_code_profile
);
6687 ErasureCodeInterfaceRef erasure_code
;
6688 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
6691 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
6692 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
6693 auto it
= profile
.find("stripe_unit");
6694 if (it
!= profile
.end()) {
6696 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
6697 ceph_assert(err_str
.empty());
6699 *stripe_width
= data_chunks
*
6700 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
6704 *ss
<< "prepare_pool_stripe_width: "
6705 << pool_type
<< " is not a known pool type";
6712 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
6713 const string
&erasure_code_profile
,
6714 const string
&rule_name
,
6719 if (*crush_rule
< 0) {
6720 switch (pool_type
) {
6721 case pg_pool_t::TYPE_REPLICATED
:
6723 if (rule_name
== "") {
6725 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
6726 if (*crush_rule
< 0) {
6727 // Errors may happen e.g. if no valid rule is available
6728 *ss
<< "No suitable CRUSH rule exists, check "
6729 << "'osd pool default crush *' config options";
6733 return get_crush_rule(rule_name
, crush_rule
, ss
);
6737 case pg_pool_t::TYPE_ERASURE
:
6739 int err
= crush_rule_create_erasure(rule_name
,
6740 erasure_code_profile
,
6744 dout(20) << "prepare_pool_crush_rule: rule "
6745 << rule_name
<< " try again" << dendl
;
6748 // need to wait for the crush rule to be proposed before proceeding
6759 *ss
<< "prepare_pool_crush_rule: " << pool_type
6760 << " is not a known pool type";
6765 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
6766 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
6774 int OSDMonitor::get_crush_rule(const string
&rule_name
,
6779 ret
= osdmap
.crush
->get_rule_id(rule_name
);
6780 if (ret
!= -ENOENT
) {
6784 CrushWrapper newcrush
;
6785 _get_pending_crush(newcrush
);
6787 ret
= newcrush
.get_rule_id(rule_name
);
6788 if (ret
!= -ENOENT
) {
6789 // found it, wait for it to be proposed
6790 dout(20) << __func__
<< ": rule " << rule_name
6791 << " try again" << dendl
;
6794 // Cannot find it , return error
6795 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
6802 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
6804 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
6805 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
6806 auto max_pgs
= max_pgs_per_osd
* num_osds
;
6807 uint64_t projected
= 0;
6809 projected
+= pg_num
* size
;
6811 for (const auto& i
: osdmap
.get_pools()) {
6812 if (i
.first
== pool
) {
6813 projected
+= pg_num
* size
;
6815 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
6818 if (projected
> max_pgs
) {
6820 *ss
<< "pool id " << pool
;
6822 *ss
<< " pg_num " << pg_num
<< " size " << size
6823 << " would mean " << projected
6824 << " total pgs, which exceeds max " << max_pgs
6825 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6826 << " * num_in_osds " << num_osds
<< ")";
6833 * @param name The name of the new pool
6834 * @param crush_rule The crush rule to use. If <0, will use the system default
6835 * @param crush_rule_name The crush rule to use, if crush_rulset <0
6836 * @param pg_num The pg_num to use. If set to 0, will use the system default
6837 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6838 * @param repl_size Replication factor, or 0 for default
6839 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6840 * @param pool_type TYPE_ERASURE, or TYPE_REP
6841 * @param expected_num_objects expected number of objects on the pool
6842 * @param fast_read fast read type.
6843 * @param ss human readable error message, if any.
6845 * @return 0 on success, negative errno on failure.
6847 int OSDMonitor::prepare_new_pool(string
& name
,
6849 const string
&crush_rule_name
,
6850 unsigned pg_num
, unsigned pgp_num
,
6851 unsigned pg_num_min
,
6852 const uint64_t repl_size
,
6853 const uint64_t target_size_bytes
,
6854 const float target_size_ratio
,
6855 const string
&erasure_code_profile
,
6856 const unsigned pool_type
,
6857 const uint64_t expected_num_objects
,
6858 FastReadType fast_read
,
6861 if (name
.length() == 0)
6864 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
6866 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
6869 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
6870 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
6871 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
6872 << " (you may adjust 'mon max pool pg num' for higher values)";
6875 if (pgp_num
> pg_num
) {
6876 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6877 << ", which in this case is " << pg_num
;
6880 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
6881 *ss
<< "'fast_read' can only apply to erasure coding pool";
6885 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
6886 crush_rule_name
, &crush_rule
, ss
);
6888 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
6891 if (g_conf()->mon_osd_crush_smoke_test
) {
6892 CrushWrapper newcrush
;
6893 _get_pending_crush(newcrush
);
6895 CrushTester
tester(newcrush
, err
);
6896 tester
.set_min_x(0);
6897 tester
.set_max_x(50);
6898 tester
.set_rule(crush_rule
);
6899 auto start
= ceph::coarse_mono_clock::now();
6900 r
= tester
.test_with_fork(g_conf()->mon_lease
);
6901 auto duration
= ceph::coarse_mono_clock::now() - start
;
6903 dout(10) << "tester.test_with_fork returns " << r
6904 << ": " << err
.str() << dendl
;
6905 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
6908 dout(10) << __func__
<< " crush smoke test duration: "
6909 << duration
<< dendl
;
6911 unsigned size
, min_size
;
6912 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
6913 &size
, &min_size
, ss
);
6915 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
6918 r
= check_pg_num(-1, pg_num
, size
, ss
);
6920 dout(10) << "check_pg_num returns " << r
<< dendl
;
6924 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
6928 uint32_t stripe_width
= 0;
6929 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
6931 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
6936 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
6937 switch (fast_read
) {
6944 case FAST_READ_DEFAULT
:
6945 fread
= g_conf()->osd_pool_default_ec_fast_read
;
6948 *ss
<< "invalid fast_read setting: " << fast_read
;
6953 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
6954 p
!= pending_inc
.new_pool_names
.end();
6956 if (p
->second
== name
)
6960 if (-1 == pending_inc
.new_pool_max
)
6961 pending_inc
.new_pool_max
= osdmap
.pool_max
;
6962 int64_t pool
= ++pending_inc
.new_pool_max
;
6964 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
6965 pi
->create_time
= ceph_clock_now();
6966 pi
->type
= pool_type
;
6967 pi
->fast_read
= fread
;
6968 pi
->flags
= g_conf()->osd_pool_default_flags
;
6969 if (g_conf()->osd_pool_default_flag_hashpspool
)
6970 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
6971 if (g_conf()->osd_pool_default_flag_nodelete
)
6972 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
6973 if (g_conf()->osd_pool_default_flag_nopgchange
)
6974 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
6975 if (g_conf()->osd_pool_default_flag_nosizechange
)
6976 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
6977 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
6978 if (g_conf()->osd_pool_use_gmt_hitset
)
6979 pi
->use_gmt_hitset
= true;
6981 pi
->use_gmt_hitset
= false;
6984 pi
->min_size
= min_size
;
6985 pi
->crush_rule
= crush_rule
;
6986 pi
->expected_num_objects
= expected_num_objects
;
6987 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
6990 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
6991 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
6992 pi
->pg_autoscale_mode
= m
>= 0 ? m
: 0;
6994 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
6996 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
6998 pi
->set_pg_num_pending(pi
->get_pg_num());
6999 pi
->set_pg_num_target(pg_num
);
7000 pi
->set_pgp_num(pi
->get_pg_num());
7001 pi
->set_pgp_num_target(pgp_num
);
7002 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
&&
7004 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
7007 pi
->last_change
= pending_inc
.epoch
;
7010 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7011 pi
->erasure_code_profile
= erasure_code_profile
;
7013 pi
->erasure_code_profile
= "";
7015 pi
->stripe_width
= stripe_width
;
7017 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
&&
7018 target_size_bytes
) {
7019 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7020 // larger than int32_t max.
7021 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
7023 if (target_size_ratio
> 0.0 &&
7024 osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
7025 // only store for nautilus+, just to be consistent and tidy.
7026 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
7029 pi
->cache_target_dirty_ratio_micro
=
7030 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
7031 pi
->cache_target_dirty_high_ratio_micro
=
7032 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
7033 pi
->cache_target_full_ratio_micro
=
7034 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
7035 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
7036 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
7038 pending_inc
.new_pool_names
[pool
] = name
;
7042 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
7044 op
->mark_osdmon_event(__func__
);
7046 if (pending_inc
.new_flags
< 0)
7047 pending_inc
.new_flags
= osdmap
.get_flags();
7048 pending_inc
.new_flags
|= flag
;
7049 ss
<< OSDMap::get_flag_string(flag
) << " is set";
7050 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7051 get_last_committed() + 1));
7055 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
7057 op
->mark_osdmon_event(__func__
);
7059 if (pending_inc
.new_flags
< 0)
7060 pending_inc
.new_flags
= osdmap
.get_flags();
7061 pending_inc
.new_flags
&= ~flag
;
7062 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
7063 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7064 get_last_committed() + 1));
7068 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
7072 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
7073 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
7075 ss
<< "unrecognized pool '" << poolstr
<< "'";
7079 cmd_getval(cct
, cmdmap
, "var", var
);
7081 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7082 if (pending_inc
.new_pools
.count(pool
))
7083 p
= pending_inc
.new_pools
[pool
];
7085 // accept val as a json string in the normal case (current
7086 // generation monitor). parse out int or float values from the
7087 // string as needed. however, if it is not a string, try to pull
7088 // out an int, in case an older monitor with an older json schema is
7089 // forwarding a request.
7091 string interr
, floaterr
;
7094 int64_t uf
= 0; // micro-f
7095 cmd_getval(cct
, cmdmap
, "val", val
);
7097 // parse string as both int and float; different fields use different types.
7098 n
= strict_strtoll(val
.c_str(), 10, &interr
);
7099 f
= strict_strtod(val
.c_str(), &floaterr
);
7100 uf
= llrintl(f
* (double)1000000.0);
7103 (var
== "hit_set_type" || var
== "hit_set_period" ||
7104 var
== "hit_set_count" || var
== "hit_set_fpp" ||
7105 var
== "target_max_objects" || var
== "target_max_bytes" ||
7106 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
7107 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
7108 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
7109 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
7110 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
7114 if (var
== "size") {
7115 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7116 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
7119 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
7120 ss
<< "can not change the size of an erasure-coded pool";
7123 if (interr
.length()) {
7124 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7127 if (n
<= 0 || n
> 10) {
7128 ss
<< "pool size must be between 1 and 10";
7131 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
7138 } else if (var
== "min_size") {
7139 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7140 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
7143 if (interr
.length()) {
7144 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7148 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
7149 if (n
< 1 || n
> p
.size
) {
7150 ss
<< "pool min_size must be between 1 and " << (int)p
.size
;
7154 ErasureCodeInterfaceRef erasure_code
;
7157 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
7159 k
= erasure_code
->get_data_chunk_count();
7161 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
7165 if (n
< k
|| n
> p
.size
) {
7166 ss
<< "pool min_size must be between " << k
<< " and " << (int)p
.size
;
7171 } else if (var
== "pg_num_actual") {
7172 if (interr
.length()) {
7173 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7176 if (n
== (int)p
.get_pg_num()) {
7179 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7180 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7181 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7182 << " (you may adjust 'mon max pool pg num' for higher values)";
7185 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
7186 ss
<< "cannot adjust pg_num while initial PGs are being created";
7189 if (n
> (int)p
.get_pg_num()) {
7190 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
7191 // force pre-nautilus clients to resend their ops, since they
7192 // don't understand pg_num_pending changes form a new interval
7193 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
7197 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
7198 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
7201 if (n
< (int)p
.get_pgp_num()) {
7202 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
7205 if (n
< (int)p
.get_pg_num() - 1) {
7206 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
7207 << ") - 1; only single pg decrease is currently supported";
7210 p
.set_pg_num_pending(n
);
7211 // force pre-nautilus clients to resend their ops, since they
7212 // don't understand pg_num_pending changes form a new interval
7213 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
7215 // force pre-luminous clients to resend their ops, since they
7216 // don't understand that split PGs now form a new interval.
7217 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
7218 } else if (var
== "pg_num") {
7219 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
7220 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
7223 if (interr
.length()) {
7224 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7227 if (n
== (int)p
.get_pg_num_target()) {
7230 if (n
<= 0 || static_cast<uint64_t>(n
) >
7231 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7232 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7233 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7234 << " (you may adjust 'mon max pool pg num' for higher values)";
7237 if (n
> (int)p
.get_pg_num_target()) {
7238 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
7243 cmd_getval(cct
,cmdmap
, "yes_i_really_mean_it", force
);
7244 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
7245 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
7249 if (osdmap
.require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
7250 ss
<< "nautilus OSDs are required to decrease pg_num";
7254 // set targets; mgr will adjust pg_num_actual and pgp_num later.
7255 // make pgp_num track pg_num if it already matches. if it is set
7256 // differently, leave it different and let the user control it
7258 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
7259 p
.set_pgp_num_target(n
);
7261 p
.set_pg_num_target(n
);
7262 } else if (var
== "pgp_num_actual") {
7263 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
7264 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
7267 if (interr
.length()) {
7268 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7272 ss
<< "specified pgp_num must > 0, but you set to " << n
;
7275 if (n
> (int)p
.get_pg_num()) {
7276 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
7279 if (n
> (int)p
.get_pg_num_pending()) {
7280 ss
<< "specified pgp_num " << n
7281 << " > pg_num_pending " << p
.get_pg_num_pending();
7285 } else if (var
== "pgp_num") {
7286 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
7287 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
7290 if (interr
.length()) {
7291 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7295 ss
<< "specified pgp_num must > 0, but you set to " << n
;
7298 if (n
> (int)p
.get_pg_num_target()) {
7299 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
7302 p
.set_pgp_num_target(n
);
7303 } else if (var
== "pg_autoscale_mode") {
7304 n
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
7306 ss
<< "specified invalid mode " << val
;
7309 p
.pg_autoscale_mode
= n
;
7310 } else if (var
== "crush_rule") {
7311 int id
= osdmap
.crush
->get_rule_id(val
);
7312 if (id
== -ENOENT
) {
7313 ss
<< "crush rule " << val
<< " does not exist";
7317 ss
<< cpp_strerror(id
);
7320 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
7324 } else if (var
== "nodelete" || var
== "nopgchange" ||
7325 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
7326 var
== "noscrub" || var
== "nodeep-scrub") {
7327 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
7328 // make sure we only compare against 'n' if we didn't receive a string
7329 if (val
== "true" || (interr
.empty() && n
== 1)) {
7331 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7334 ss
<< "expecting value 'true', 'false', '0', or '1'";
7337 } else if (var
== "hashpspool") {
7338 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
7340 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
7343 ss
<< "are you SURE? this will remap all placement groups in this pool,"
7344 " this triggers large data movement,"
7345 " pass --yes-i-really-mean-it if you really do.";
7348 // make sure we only compare against 'n' if we didn't receive a string
7349 if (val
== "true" || (interr
.empty() && n
== 1)) {
7351 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7354 ss
<< "expecting value 'true', 'false', '0', or '1'";
7357 } else if (var
== "hit_set_type") {
7359 p
.hit_set_params
= HitSet::Params();
7361 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
7364 if (val
== "bloom") {
7365 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
7366 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
7367 p
.hit_set_params
= HitSet::Params(bsp
);
7368 } else if (val
== "explicit_hash")
7369 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
7370 else if (val
== "explicit_object")
7371 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
7373 ss
<< "unrecognized hit_set type '" << val
<< "'";
7377 } else if (var
== "hit_set_period") {
7378 if (interr
.length()) {
7379 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7382 ss
<< "hit_set_period should be non-negative";
7385 p
.hit_set_period
= n
;
7386 } else if (var
== "hit_set_count") {
7387 if (interr
.length()) {
7388 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7391 ss
<< "hit_set_count should be non-negative";
7394 p
.hit_set_count
= n
;
7395 } else if (var
== "hit_set_fpp") {
7396 if (floaterr
.length()) {
7397 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
7399 } else if (f
< 0 || f
> 1.0) {
7400 ss
<< "hit_set_fpp should be in the range 0..1";
7403 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
7404 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
7407 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
7409 } else if (var
== "use_gmt_hitset") {
7410 if (val
== "true" || (interr
.empty() && n
== 1)) {
7411 p
.use_gmt_hitset
= true;
7413 ss
<< "expecting value 'true' or '1'";
7416 } else if (var
== "allow_ec_overwrites") {
7417 if (!p
.is_erasure()) {
7418 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
7422 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
7423 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
7424 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
7427 if (val
== "true" || (interr
.empty() && n
== 1)) {
7428 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
7429 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7430 ss
<< "ec overwrites cannot be disabled once enabled";
7433 ss
<< "expecting value 'true', 'false', '0', or '1'";
7436 } else if (var
== "target_max_objects") {
7437 if (interr
.length()) {
7438 ss
<< "error parsing int '" << val
<< "': " << interr
;
7441 p
.target_max_objects
= n
;
7442 } else if (var
== "target_max_bytes") {
7443 if (interr
.length()) {
7444 ss
<< "error parsing int '" << val
<< "': " << interr
;
7447 p
.target_max_bytes
= n
;
7448 } else if (var
== "cache_target_dirty_ratio") {
7449 if (floaterr
.length()) {
7450 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
7453 if (f
< 0 || f
> 1.0) {
7454 ss
<< "value must be in the range 0..1";
7457 p
.cache_target_dirty_ratio_micro
= uf
;
7458 } else if (var
== "cache_target_dirty_high_ratio") {
7459 if (floaterr
.length()) {
7460 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
7463 if (f
< 0 || f
> 1.0) {
7464 ss
<< "value must be in the range 0..1";
7467 p
.cache_target_dirty_high_ratio_micro
= uf
;
7468 } else if (var
== "cache_target_full_ratio") {
7469 if (floaterr
.length()) {
7470 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
7473 if (f
< 0 || f
> 1.0) {
7474 ss
<< "value must be in the range 0..1";
7477 p
.cache_target_full_ratio_micro
= uf
;
7478 } else if (var
== "cache_min_flush_age") {
7479 if (interr
.length()) {
7480 ss
<< "error parsing int '" << val
<< "': " << interr
;
7483 p
.cache_min_flush_age
= n
;
7484 } else if (var
== "cache_min_evict_age") {
7485 if (interr
.length()) {
7486 ss
<< "error parsing int '" << val
<< "': " << interr
;
7489 p
.cache_min_evict_age
= n
;
7490 } else if (var
== "min_read_recency_for_promote") {
7491 if (interr
.length()) {
7492 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7495 p
.min_read_recency_for_promote
= n
;
7496 } else if (var
== "hit_set_grade_decay_rate") {
7497 if (interr
.length()) {
7498 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7501 if (n
> 100 || n
< 0) {
7502 ss
<< "value out of range,valid range is 0 - 100";
7505 p
.hit_set_grade_decay_rate
= n
;
7506 } else if (var
== "hit_set_search_last_n") {
7507 if (interr
.length()) {
7508 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7511 if (n
> p
.hit_set_count
|| n
< 0) {
7512 ss
<< "value out of range,valid range is 0 - hit_set_count";
7515 p
.hit_set_search_last_n
= n
;
7516 } else if (var
== "min_write_recency_for_promote") {
7517 if (interr
.length()) {
7518 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7521 p
.min_write_recency_for_promote
= n
;
7522 } else if (var
== "fast_read") {
7523 if (p
.is_replicated()) {
7524 ss
<< "fast read is not supported in replication pool";
7527 if (val
== "true" || (interr
.empty() && n
== 1)) {
7529 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
7530 p
.fast_read
= false;
7532 ss
<< "expecting value 'true', 'false', '0', or '1'";
7535 } else if (pool_opts_t::is_opt_name(var
)) {
7536 bool unset
= val
== "unset";
7537 if (var
== "compression_mode") {
7539 auto cmode
= Compressor::get_comp_mode_type(val
);
7541 ss
<< "unrecognized compression mode '" << val
<< "'";
7545 } else if (var
== "compression_algorithm") {
7547 auto alg
= Compressor::get_comp_alg_type(val
);
7549 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
7553 } else if (var
== "compression_required_ratio") {
7554 if (floaterr
.length()) {
7555 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
7558 if (f
< 0 || f
> 1) {
7559 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
7562 } else if (var
== "csum_type") {
7563 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
7565 ss
<< "unrecognized csum_type '" << val
<< "'";
7568 //preserve csum_type numeric value
7571 } else if (var
== "compression_max_blob_size" ||
7572 var
== "compression_min_blob_size" ||
7573 var
== "csum_max_block" ||
7574 var
== "csum_min_block") {
7575 if (interr
.length()) {
7576 ss
<< "error parsing int value '" << val
<< "': " << interr
;
7579 } else if (var
== "fingerprint_algorithm") {
7581 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
7583 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
7587 } else if (var
== "pg_num_min") {
7588 if (interr
.length()) {
7589 ss
<< "error parsing int value '" << val
<< "': " << interr
;
7592 if (n
> (int)p
.get_pg_num_target()) {
7593 ss
<< "specified pg_num_min " << n
7594 << " > pg_num " << p
.get_pg_num_target();
7597 } else if (var
== "recovery_priority") {
7598 if (interr
.length()) {
7599 ss
<< "error parsing int value '" << val
<< "': " << interr
;
7602 if (!g_conf()->debug_allow_any_pool_priority
) {
7603 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
7604 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
7605 << " and " << OSD_POOL_PRIORITY_MAX
;
7609 } else if (var
== "pg_autoscale_bias") {
7610 if (f
< 0.0 || f
> 1000.0) {
7611 ss
<< "pg_autoscale_bias must be between 0 and 1000";
7616 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
7617 switch (desc
.type
) {
7618 case pool_opts_t::STR
:
7620 p
.opts
.unset(desc
.key
);
7622 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
7625 case pool_opts_t::INT
:
7626 if (interr
.length()) {
7627 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7631 p
.opts
.unset(desc
.key
);
7633 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
7636 case pool_opts_t::DOUBLE
:
7637 if (floaterr
.length()) {
7638 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
7642 p
.opts
.unset(desc
.key
);
7644 p
.opts
.set(desc
.key
, static_cast<double>(f
));
7648 ceph_assert(!"unknown type");
7651 ss
<< "unrecognized variable '" << var
<< "'";
7654 if (val
!= "unset") {
7655 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
7657 ss
<< "unset pool " << pool
<< " " << var
;
7659 p
.last_change
= pending_inc
.epoch
;
7660 pending_inc
.new_pools
[pool
] = p
;
7664 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
7665 const cmdmap_t
& cmdmap
,
7668 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
7671 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
7672 const cmdmap_t
& cmdmap
,
7676 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
7681 * Common logic for preprocess and prepare phases of pool application
7682 * tag commands. In preprocess mode we're only detecting invalid
7683 * commands, and determining whether it was a modification or a no-op.
7684 * In prepare mode we're actually updating the pending state.
7686 int OSDMonitor::_command_pool_application(const string
&prefix
,
7687 const cmdmap_t
& cmdmap
,
7693 cmd_getval(cct
, cmdmap
, "pool", pool_name
);
7694 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
7696 ss
<< "unrecognized pool '" << pool_name
<< "'";
7700 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7702 if (pending_inc
.new_pools
.count(pool
)) {
7703 p
= pending_inc
.new_pools
[pool
];
7708 cmd_getval(cct
, cmdmap
, "app", app
);
7709 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
7712 cmd_getval(cct
, cmdmap
, "key", key
);
7714 ss
<< "key cannot be 'all'";
7719 cmd_getval(cct
, cmdmap
, "value", value
);
7720 if (value
== "all") {
7721 ss
<< "value cannot be 'all'";
7725 if (boost::algorithm::ends_with(prefix
, "enable")) {
7727 ss
<< "application name must be provided";
7732 ss
<< "application must be enabled on base tier";
7737 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
7739 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
7740 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
7741 << "application; pass --yes-i-really-mean-it to proceed anyway";
7745 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
7746 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
7747 << "max " << MAX_POOL_APPLICATIONS
;
7751 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7752 ss
<< "application name '" << app
<< "' too long; max length "
7753 << MAX_POOL_APPLICATION_LENGTH
;
7758 p
.application_metadata
[app
] = {};
7760 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
7762 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
7764 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
7767 ss
<< "Are you SURE? Disabling an application within a pool might result "
7768 << "in loss of application functionality; pass "
7769 << "--yes-i-really-mean-it to proceed anyway";
7774 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7776 return 0; // idempotent
7779 p
.application_metadata
.erase(app
);
7780 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
7782 } else if (boost::algorithm::ends_with(prefix
, "set")) {
7784 ss
<< "application metadata must be set on base tier";
7789 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7795 cmd_getval(cct
, cmdmap
, "key", key
);
7798 ss
<< "key must be provided";
7802 auto &app_keys
= p
.application_metadata
[app
];
7803 if (app_keys
.count(key
) == 0 &&
7804 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
7805 ss
<< "too many keys set for application '" << app
<< "' on pool '"
7806 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
7810 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7811 ss
<< "key '" << app
<< "' too long; max length "
7812 << MAX_POOL_APPLICATION_LENGTH
;
7817 cmd_getval(cct
, cmdmap
, "value", value
);
7818 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7819 ss
<< "value '" << value
<< "' too long; max length "
7820 << MAX_POOL_APPLICATION_LENGTH
;
7824 p
.application_metadata
[app
][key
] = value
;
7825 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
7826 << value
<< "' on pool '" << pool_name
<< "'";
7827 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
7829 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7835 cmd_getval(cct
, cmdmap
, "key", key
);
7836 auto it
= p
.application_metadata
[app
].find(key
);
7837 if (it
== p
.application_metadata
[app
].end()) {
7838 ss
<< "application '" << app
<< "' on pool '" << pool_name
7839 << "' does not have key '" << key
<< "'";
7840 return 0; // idempotent
7843 p
.application_metadata
[app
].erase(it
);
7844 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
7845 << pool_name
<< "'";
7851 p
.last_change
= pending_inc
.epoch
;
7852 pending_inc
.new_pools
[pool
] = p
;
7855 // Because we fell through this far, we didn't hit no-op cases,
7856 // so pool was definitely modified
7857 if (modified
!= nullptr) {
7864 int OSDMonitor::_prepare_command_osd_crush_remove(
7865 CrushWrapper
&newcrush
,
7874 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
7877 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
7882 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
7884 pending_inc
.crush
.clear();
7885 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7888 int OSDMonitor::prepare_command_osd_crush_remove(
7889 CrushWrapper
&newcrush
,
7895 int err
= _prepare_command_osd_crush_remove(
7896 newcrush
, id
, ancestor
,
7897 has_ancestor
, unlink_only
);
7902 ceph_assert(err
== 0);
7903 do_osd_crush_remove(newcrush
);
7908 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
7910 if (osdmap
.is_up(id
)) {
7914 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
7915 pending_inc
.new_uuid
[id
] = uuid_d();
7916 pending_metadata_rm
.insert(id
);
7917 pending_metadata
.erase(id
);
7922 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
7924 ceph_assert(existing_id
);
7927 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
7928 if (!osdmap
.exists(i
) &&
7929 pending_inc
.new_up_client
.count(i
) == 0 &&
7930 (pending_inc
.new_state
.count(i
) == 0 ||
7931 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
7937 if (pending_inc
.new_max_osd
< 0) {
7938 return osdmap
.get_max_osd();
7940 return pending_inc
.new_max_osd
;
7943 void OSDMonitor::do_osd_create(
7946 const string
& device_class
,
7949 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
7950 ceph_assert(new_id
);
7952 // We presume validation has been performed prior to calling this
7953 // function. We assert with prejudice.
7955 int32_t allocated_id
= -1; // declare here so we can jump
7956 int32_t existing_id
= -1;
7957 if (!uuid
.is_zero()) {
7958 existing_id
= osdmap
.identify_osd(uuid
);
7959 if (existing_id
>= 0) {
7960 ceph_assert(id
< 0 || id
== existing_id
);
7961 *new_id
= existing_id
;
7963 } else if (id
>= 0) {
7964 // uuid does not exist, and id has been provided, so just create
7971 // allocate a new id
7972 allocated_id
= _allocate_osd_id(&existing_id
);
7973 dout(10) << __func__
<< " allocated id " << allocated_id
7974 << " existing id " << existing_id
<< dendl
;
7975 if (existing_id
>= 0) {
7976 ceph_assert(existing_id
< osdmap
.get_max_osd());
7977 ceph_assert(allocated_id
< 0);
7978 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
7979 *new_id
= existing_id
;
7980 } else if (allocated_id
>= 0) {
7981 ceph_assert(existing_id
< 0);
7983 if (pending_inc
.new_max_osd
< 0) {
7984 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
7986 ++pending_inc
.new_max_osd
;
7988 *new_id
= pending_inc
.new_max_osd
- 1;
7989 ceph_assert(*new_id
== allocated_id
);
7991 ceph_abort_msg("unexpected condition");
7995 if (device_class
.size()) {
7996 CrushWrapper newcrush
;
7997 _get_pending_crush(newcrush
);
7998 if (newcrush
.get_max_devices() < *new_id
+ 1) {
7999 newcrush
.set_max_devices(*new_id
+ 1);
8001 string name
= string("osd.") + stringify(*new_id
);
8002 if (!newcrush
.item_exists(*new_id
)) {
8003 newcrush
.set_item_name(*new_id
, name
);
8006 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
8008 derr
<< __func__
<< " failed to set " << name
<< " device_class "
8009 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
8011 // non-fatal... this might be a replay and we want to be idempotent.
8013 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
8015 pending_inc
.crush
.clear();
8016 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8019 dout(20) << __func__
<< " no device_class" << dendl
;
8022 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
8023 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
8024 pending_inc
.new_max_osd
= *new_id
+ 1;
8027 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
8028 if (!uuid
.is_zero())
8029 pending_inc
.new_uuid
[*new_id
] = uuid
;
8032 int OSDMonitor::validate_osd_create(
8035 const bool check_osd_exists
,
8036 int32_t* existing_id
,
8040 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
8041 << " check_osd_exists " << check_osd_exists
<< dendl
;
8043 ceph_assert(existing_id
);
8045 if (id
< 0 && uuid
.is_zero()) {
8046 // we have nothing to validate
8049 } else if (uuid
.is_zero()) {
8050 // we have an id but we will ignore it - because that's what
8051 // `osd create` does.
8056 * This function will be used to validate whether we are able to
8057 * create a new osd when the `uuid` is specified.
8059 * It will be used by both `osd create` and `osd new`, as the checks
8060 * are basically the same when it pertains to osd id and uuid validation.
8061 * However, `osd create` presumes an `uuid` is optional, for legacy
8062 * reasons, while `osd new` requires the `uuid` to be provided. This
8063 * means that `osd create` will not be idempotent if an `uuid` is not
8064 * provided, but we will always guarantee the idempotency of `osd new`.
8067 ceph_assert(!uuid
.is_zero());
8068 if (pending_inc
.identify_osd(uuid
) >= 0) {
8069 // osd is about to exist
8073 int32_t i
= osdmap
.identify_osd(uuid
);
8075 // osd already exists
8076 if (id
>= 0 && i
!= id
) {
8077 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
8080 // return a positive errno to distinguish between a blocking error
8081 // and an error we consider to not be a problem (i.e., this would be
8082 // an idempotent operation).
8088 if (pending_inc
.new_state
.count(id
)) {
8089 // osd is about to exist
8092 // we may not care if an osd exists if we are recreating a previously
8094 if (check_osd_exists
&& osdmap
.exists(id
)) {
8095 ss
<< "id " << id
<< " already in use and does not match uuid "
8103 int OSDMonitor::prepare_command_osd_create(
8106 int32_t* existing_id
,
8109 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
8110 ceph_assert(existing_id
);
8111 if (osdmap
.is_destroyed(id
)) {
8112 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
8117 if (uuid
.is_zero()) {
8118 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
8121 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
8124 int OSDMonitor::prepare_command_osd_new(
8126 const cmdmap_t
& cmdmap
,
8127 const map
<string
,string
>& params
,
8135 ceph_assert(paxos
->is_plugged());
8137 dout(10) << __func__
<< " " << op
<< dendl
;
8139 /* validate command. abort now if something's wrong. */
8141 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
8143 * If `id` is not specified, we will identify any existing osd based
8144 * on `uuid`. Operation will be idempotent iff secrets match.
8146 * If `id` is specified, we will identify any existing osd based on
8147 * `uuid` and match against `id`. If they match, operation will be
8148 * idempotent iff secrets match.
8150 * `-i secrets.json` will be optional. If supplied, will be used
8151 * to check for idempotency when `id` and `uuid` match.
8153 * If `id` is not specified, and `uuid` does not exist, an id will
8154 * be found or allocated for the osd.
8156 * If `id` is specified, and the osd has been previously marked
8157 * as destroyed, then the `id` will be reused.
8159 if (!cmd_getval(cct
, cmdmap
, "uuid", uuidstr
)) {
8160 ss
<< "requires the OSD's UUID to be specified.";
8162 } else if (!uuid
.parse(uuidstr
.c_str())) {
8163 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
8167 if (cmd_getval(cct
, cmdmap
, "id", id
) &&
8169 ss
<< "invalid OSD id; must be greater or equal than zero.";
8173 // are we running an `osd create`-like command, or recreating
8174 // a previously destroyed osd?
8176 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
8178 // we will care about `id` to assess whether osd is `destroyed`, or
8179 // to create a new osd.
8180 // we will need an `id` by the time we reach auth.
8182 int32_t existing_id
= -1;
8183 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
8186 bool may_be_idempotent
= false;
8187 if (err
== EEXIST
) {
8188 // this is idempotent from the osdmon's point-of-view
8189 may_be_idempotent
= true;
8190 ceph_assert(existing_id
>= 0);
8192 } else if (err
< 0) {
8196 if (!may_be_idempotent
) {
8197 // idempotency is out of the window. We are either creating a new
8198 // osd or recreating a destroyed osd.
8200 // We now need to figure out if we have an `id` (and if it's valid),
8201 // of find an `id` if we don't have one.
8203 // NOTE: we need to consider the case where the `id` is specified for
8204 // `osd create`, and we must honor it. So this means checking if
8205 // the `id` is destroyed, and if so assume the destroy; otherwise,
8206 // check if it `exists` - in which case we complain about not being
8207 // `destroyed`. In the end, if nothing fails, we must allow the
8208 // creation, so that we are compatible with `create`.
8209 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
8210 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
8211 ss
<< "OSD " << id
<< " has not yet been destroyed";
8213 } else if (id
< 0) {
8215 id
= _allocate_osd_id(&existing_id
);
8217 ceph_assert(existing_id
>= 0);
8220 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
8221 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
8222 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
8224 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
8227 ceph_assert(id
>= 0);
8228 ceph_assert(osdmap
.exists(id
));
8231 // we are now able to either create a brand new osd or reuse an existing
8232 // osd that has been previously destroyed.
8234 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
8236 if (may_be_idempotent
&& params
.empty()) {
8237 // nothing to do, really.
8238 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
8239 ceph_assert(id
>= 0);
8241 f
->open_object_section("created_osd");
8242 f
->dump_int("osdid", id
);
8250 string device_class
;
8251 auto p
= params
.find("crush_device_class");
8252 if (p
!= params
.end()) {
8253 device_class
= p
->second
;
8254 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
8256 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
8257 bool has_lockbox
= false;
8258 bool has_secrets
= params
.count("cephx_secret")
8259 || params
.count("cephx_lockbox_secret")
8260 || params
.count("dmcrypt_key");
8262 ConfigKeyService
*svc
= nullptr;
8263 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
8266 if (params
.count("cephx_secret") == 0) {
8267 ss
<< "requires a cephx secret.";
8270 cephx_secret
= params
.at("cephx_secret");
8272 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
8273 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
8275 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
8276 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
8278 if (has_lockbox_secret
&& has_dmcrypt_key
) {
8280 lockbox_secret
= params
.at("cephx_lockbox_secret");
8281 dmcrypt_key
= params
.at("dmcrypt_key");
8282 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
8283 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
8287 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
8289 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
8297 } else if (may_be_idempotent
&& err
!= EEXIST
) {
8298 // for this to be idempotent, `id` should already be >= 0; no need
8299 // to use validate_id.
8300 ceph_assert(id
>= 0);
8301 ss
<< "osd." << id
<< " exists but secrets do not match";
8306 svc
= (ConfigKeyService
*)mon
->config_key_service
;
8307 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
8310 } else if (may_be_idempotent
&& err
!= EEXIST
) {
8311 ceph_assert(id
>= 0);
8312 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
8317 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
8318 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
8320 if (may_be_idempotent
) {
8321 // we have nothing to do for either the osdmon or the authmon,
8322 // and we have no lockbox - so the config key service will not be
8323 // touched. This is therefore an idempotent operation, and we can
8324 // just return right away.
8325 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
8326 ceph_assert(id
>= 0);
8328 f
->open_object_section("created_osd");
8329 f
->dump_int("osdid", id
);
8336 ceph_assert(!may_be_idempotent
);
8340 ceph_assert(!cephx_secret
.empty());
8341 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
8342 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
8344 err
= mon
->authmon()->do_osd_new(cephx_entity
,
8347 ceph_assert(0 == err
);
8350 ceph_assert(nullptr != svc
);
8351 svc
->do_osd_new(uuid
, dmcrypt_key
);
8355 if (is_recreate_destroyed
) {
8356 ceph_assert(id
>= 0);
8357 ceph_assert(osdmap
.is_destroyed(id
));
8358 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
8359 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
8360 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
8361 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
8363 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
8364 // due to http://tracker.ceph.com/issues/20751 some clusters may
8365 // have UP set for non-existent OSDs; make sure it is cleared
8366 // for a newly created osd.
8367 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
8369 pending_inc
.new_uuid
[id
] = uuid
;
8371 ceph_assert(id
>= 0);
8372 int32_t new_id
= -1;
8373 do_osd_create(id
, uuid
, device_class
, &new_id
);
8374 ceph_assert(new_id
>= 0);
8375 ceph_assert(id
== new_id
);
8379 f
->open_object_section("created_osd");
8380 f
->dump_int("osdid", id
);
8389 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
8391 op
->mark_osdmon_event(__func__
);
8392 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
8395 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
8396 string rs
= ss
.str();
8397 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
8401 MonSession
*session
= op
->get_session();
8403 derr
<< __func__
<< " no session" << dendl
;
8404 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
8408 return prepare_command_impl(op
, cmdmap
);
8411 static int parse_reweights(CephContext
*cct
,
8412 const cmdmap_t
& cmdmap
,
8413 const OSDMap
& osdmap
,
8414 map
<int32_t, uint32_t>* weights
)
8417 if (!cmd_getval(cct
, cmdmap
, "weights", weights_str
)) {
8420 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
8421 json_spirit::mValue json_value
;
8422 if (!json_spirit::read(weights_str
, json_value
)) {
8425 if (json_value
.type() != json_spirit::obj_type
) {
8428 const auto obj
= json_value
.get_obj();
8430 for (auto& osd_weight
: obj
) {
8431 auto osd_id
= std::stoi(osd_weight
.first
);
8432 if (!osdmap
.exists(osd_id
)) {
8435 if (osd_weight
.second
.type() != json_spirit::str_type
) {
8438 auto weight
= std::stoul(osd_weight
.second
.get_str());
8439 weights
->insert({osd_id
, weight
});
8441 } catch (const std::logic_error
& e
) {
8447 int OSDMonitor::prepare_command_osd_destroy(
8451 ceph_assert(paxos
->is_plugged());
8453 // we check if the osd exists for the benefit of `osd purge`, which may
8454 // have previously removed the osd. If the osd does not exist, return
8455 // -ENOENT to convey this, and let the caller deal with it.
8457 // we presume that all auth secrets and config keys were removed prior
8458 // to this command being called. if they exist by now, we also assume
8459 // they must have been created by some other command and do not pertain
8460 // to this non-existent osd.
8461 if (!osdmap
.exists(id
)) {
8462 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
8466 uuid_d uuid
= osdmap
.get_uuid(id
);
8467 dout(10) << __func__
<< " destroying osd." << id
8468 << " uuid " << uuid
<< dendl
;
8470 // if it has been destroyed, we assume our work here is done.
8471 if (osdmap
.is_destroyed(id
)) {
8472 ss
<< "destroyed osd." << id
;
8476 EntityName cephx_entity
, lockbox_entity
;
8477 bool idempotent_auth
= false, idempotent_cks
= false;
8479 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
8484 if (err
== -ENOENT
) {
8485 idempotent_auth
= true;
8491 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
8492 err
= svc
->validate_osd_destroy(id
, uuid
);
8494 ceph_assert(err
== -ENOENT
);
8496 idempotent_cks
= true;
8499 if (!idempotent_auth
) {
8500 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
8501 ceph_assert(0 == err
);
8504 if (!idempotent_cks
) {
8505 svc
->do_osd_destroy(id
, uuid
);
8508 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
8509 pending_inc
.new_uuid
[id
] = uuid_d();
8511 // we can only propose_pending() once per service, otherwise we'll be
8512 // defying PaxosService and all laws of nature. Therefore, as we may
8513 // be used during 'osd purge', let's keep the caller responsible for
8515 ceph_assert(err
== 0);
8519 int OSDMonitor::prepare_command_osd_purge(
8523 ceph_assert(paxos
->is_plugged());
8524 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
8526 ceph_assert(!osdmap
.is_up(id
));
8529 * This may look a bit weird, but this is what's going to happen:
8531 * 1. we make sure that removing from crush works
8532 * 2. we call `prepare_command_osd_destroy()`. If it returns an
8533 * error, then we abort the whole operation, as no updates
8534 * have been made. However, we this function will have
8535 * side-effects, thus we need to make sure that all operations
8536 * performed henceforth will *always* succeed.
8537 * 3. we call `prepare_command_osd_remove()`. Although this
8538 * function can return an error, it currently only checks if the
8539 * osd is up - and we have made sure that it is not so, so there
8540 * is no conflict, and it is effectively an update.
8541 * 4. finally, we call `do_osd_crush_remove()`, which will perform
8542 * the crush update we delayed from before.
8545 CrushWrapper newcrush
;
8546 _get_pending_crush(newcrush
);
8548 bool may_be_idempotent
= false;
8550 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
8551 if (err
== -ENOENT
) {
8553 may_be_idempotent
= true;
8554 } else if (err
< 0) {
8555 ss
<< "error removing osd." << id
<< " from crush";
8559 // no point destroying the osd again if it has already been marked destroyed
8560 if (!osdmap
.is_destroyed(id
)) {
8561 err
= prepare_command_osd_destroy(id
, ss
);
8563 if (err
== -ENOENT
) {
8569 may_be_idempotent
= false;
8572 ceph_assert(0 == err
);
8574 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
8575 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
8576 << "we are idempotent." << dendl
;
8580 err
= prepare_command_osd_remove(id
);
8581 // we should not be busy, as we should have made sure this id is not up.
8582 ceph_assert(0 == err
);
8584 do_osd_crush_remove(newcrush
);
8588 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
8589 const cmdmap_t
& cmdmap
)
8591 op
->mark_osdmon_event(__func__
);
8592 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
8600 cmd_getval(cct
, cmdmap
, "format", format
, string("plain"));
8601 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
8604 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
8608 bool osdid_present
= false;
8609 if (prefix
!= "osd pg-temp" &&
8610 prefix
!= "osd pg-upmap" &&
8611 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
8612 osdid_present
= cmd_getval(cct
, cmdmap
, "id", osdid
);
8614 if (osdid_present
) {
8616 oss
<< "osd." << osdid
;
8617 osd_name
= oss
.str();
8620 // Even if there's a pending state with changes that could affect
8621 // a command, considering that said state isn't yet committed, we
8622 // just don't care about those changes if the command currently being
8623 // handled acts as a no-op against the current committed state.
8624 // In a nutshell, we assume this command happens *before*.
8626 // Let me make this clearer:
8628 // - If we have only one client, and that client issues some
8629 // operation that would conflict with this operation but is
8630 // still on the pending state, then we would be sure that said
8631 // operation wouldn't have returned yet, so the client wouldn't
8632 // issue this operation (unless the client didn't wait for the
8633 // operation to finish, and that would be the client's own fault).
8635 // - If we have more than one client, each client will observe
8636 // whatever is the state at the moment of the commit. So, if we
8637 // have two clients, one issuing an unlink and another issuing a
8638 // link, and if the link happens while the unlink is still on the
8639 // pending state, from the link's point-of-view this is a no-op.
8640 // If different clients are issuing conflicting operations and
8641 // they care about that, then the clients should make sure they
8642 // enforce some kind of concurrency mechanism -- from our
8643 // perspective that's what Douglas Adams would call an SEP.
8645 // This should be used as a general guideline for most commands handled
8646 // in this function. Adapt as you see fit, but please bear in mind that
8647 // this is the expected behavior.
8650 if (prefix
== "osd setcrushmap" ||
8651 (prefix
== "osd crush set" && !osdid_present
)) {
8652 if (pending_inc
.crush
.length()) {
8653 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
8654 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8657 dout(10) << "prepare_command setting new crush map" << dendl
;
8658 bufferlist
data(m
->get_data());
8661 auto bl
= data
.cbegin();
8664 catch (const std::exception
&e
) {
8666 ss
<< "Failed to parse crushmap: " << e
.what();
8670 int64_t prior_version
= 0;
8671 if (cmd_getval(cct
, cmdmap
, "prior_version", prior_version
)) {
8672 if (prior_version
== osdmap
.get_crush_version() - 1) {
8673 // see if we are a resend of the last update. this is imperfect
8674 // (multiple racing updaters may not both get reliable success)
8675 // but we expect crush updaters (via this interface) to be rare-ish.
8676 bufferlist current
, proposed
;
8677 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
8678 crush
.encode(proposed
, mon
->get_quorum_con_features());
8679 if (current
.contents_equal(proposed
)) {
8680 dout(10) << __func__
8681 << " proposed matches current and version equals previous"
8684 ss
<< osdmap
.get_crush_version();
8688 if (prior_version
!= osdmap
.get_crush_version()) {
8690 ss
<< "prior_version " << prior_version
<< " != crush version "
8691 << osdmap
.get_crush_version();
8696 if (crush
.has_legacy_rule_ids()) {
8698 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
8701 if (!validate_crush_against_features(&crush
, ss
)) {
8706 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
8711 if (g_conf()->mon_osd_crush_smoke_test
) {
8712 // sanity check: test some inputs to make sure this map isn't
8714 dout(10) << " testing map" << dendl
;
8716 CrushTester
tester(crush
, ess
);
8717 tester
.set_min_x(0);
8718 tester
.set_max_x(50);
8719 auto start
= ceph::coarse_mono_clock::now();
8720 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
8721 auto duration
= ceph::coarse_mono_clock::now() - start
;
8723 dout(10) << " tester.test_with_fork returns " << r
8724 << ": " << ess
.str() << dendl
;
8725 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
8729 dout(10) << __func__
<< " crush somke test duration: "
8730 << duration
<< ", result: " << ess
.str() << dendl
;
8733 pending_inc
.crush
= data
;
8734 ss
<< osdmap
.get_crush_version() + 1;
8737 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
8738 CrushWrapper newcrush
;
8739 _get_pending_crush(newcrush
);
8740 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
8742 if (newcrush
.bucket_exists(bid
) &&
8743 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
8744 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
8745 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
8748 if (!validate_crush_against_features(&newcrush
, ss
)) {
8752 pending_inc
.crush
.clear();
8753 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8754 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8755 get_last_committed() + 1));
8757 } else if (prefix
== "osd crush set-device-class") {
8758 string device_class
;
8759 if (!cmd_getval(cct
, cmdmap
, "class", device_class
)) {
8760 err
= -EINVAL
; // no value!
8765 vector
<string
> idvec
;
8766 cmd_getval(cct
, cmdmap
, "ids", idvec
);
8767 CrushWrapper newcrush
;
8768 _get_pending_crush(newcrush
);
8770 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8774 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8775 osdmap
.get_all_osds(osds
);
8778 // try traditional single osd way
8779 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8781 // ss has reason for failure
8782 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8789 for (auto &osd
: osds
) {
8790 if (!osdmap
.exists(osd
)) {
8791 ss
<< "osd." << osd
<< " does not exist. ";
8796 oss
<< "osd." << osd
;
8797 string name
= oss
.str();
8799 if (newcrush
.get_max_devices() < osd
+ 1) {
8800 newcrush
.set_max_devices(osd
+ 1);
8803 if (newcrush
.item_exists(osd
)) {
8804 action
= "updating";
8806 action
= "creating";
8807 newcrush
.set_item_name(osd
, name
);
8810 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
8811 << "' device_class '" << device_class
<< "'"
8813 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
8817 if (err
== 0 && !_have_pending_crush()) {
8819 // for single osd only, wildcard makes too much noise
8820 ss
<< "set-device-class item id " << osd
<< " name '" << name
8821 << "' device_class '" << device_class
<< "': no change. ";
8824 updated
.insert(osd
);
8829 if (!updated
.empty()) {
8830 pending_inc
.crush
.clear();
8831 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8832 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
8834 wait_for_finished_proposal(op
,
8835 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
8839 } else if (prefix
== "osd crush rm-device-class") {
8841 vector
<string
> idvec
;
8842 cmd_getval(cct
, cmdmap
, "ids", idvec
);
8843 CrushWrapper newcrush
;
8844 _get_pending_crush(newcrush
);
8847 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8852 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8853 osdmap
.get_all_osds(osds
);
8856 // try traditional single osd way
8857 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8859 // ss has reason for failure
8860 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8867 for (auto &osd
: osds
) {
8868 if (!osdmap
.exists(osd
)) {
8869 ss
<< "osd." << osd
<< " does not exist. ";
8873 auto class_name
= newcrush
.get_item_class(osd
);
8875 ss
<< "osd." << osd
<< " belongs to no class, ";
8878 // note that we do not verify if class_is_in_use here
8879 // in case the device is misclassified and user wants
8880 // to overridely reset...
8882 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
8884 // ss has reason for failure
8887 updated
.insert(osd
);
8891 if (!updated
.empty()) {
8892 pending_inc
.crush
.clear();
8893 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8894 ss
<< "done removing class of osd(s): " << updated
;
8896 wait_for_finished_proposal(op
,
8897 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
8900 } else if (prefix
== "osd crush class create") {
8901 string device_class
;
8902 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
8903 err
= -EINVAL
; // no value!
8906 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8907 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8908 << "luminous' before using crush device classes";
8912 if (!_have_pending_crush() &&
8913 _get_stable_crush().class_exists(device_class
)) {
8914 ss
<< "class '" << device_class
<< "' already exists";
8917 CrushWrapper newcrush
;
8918 _get_pending_crush(newcrush
);
8919 if (newcrush
.class_exists(device_class
)) {
8920 ss
<< "class '" << device_class
<< "' already exists";
8923 int class_id
= newcrush
.get_or_create_class_id(device_class
);
8924 pending_inc
.crush
.clear();
8925 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8926 ss
<< "created class " << device_class
<< " with id " << class_id
8929 } else if (prefix
== "osd crush class rm") {
8930 string device_class
;
8931 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
8932 err
= -EINVAL
; // no value!
8935 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8936 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8937 << "luminous' before using crush device classes";
8942 if (!osdmap
.crush
->class_exists(device_class
)) {
8947 CrushWrapper newcrush
;
8948 _get_pending_crush(newcrush
);
8949 if (!newcrush
.class_exists(device_class
)) {
8950 err
= 0; // make command idempotent
8953 int class_id
= newcrush
.get_class_id(device_class
);
8955 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
8957 ss
<< "class '" << device_class
<< "' " << ts
.str();
8961 // check if class is used by any erasure-code-profiles
8962 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
8963 osdmap
.get_erasure_code_profiles();
8964 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
8965 #ifdef HAVE_STDLIB_MAP_SPLICING
8966 ec_profiles
.merge(old_ec_profiles
);
8968 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
8969 make_move_iterator(end(old_ec_profiles
)));
8971 list
<string
> referenced_by
;
8972 for (auto &i
: ec_profiles
) {
8973 for (auto &j
: i
.second
) {
8974 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
8975 referenced_by
.push_back(i
.first
);
8979 if (!referenced_by
.empty()) {
8981 ss
<< "class '" << device_class
8982 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
8987 newcrush
.get_devices_by_class(device_class
, &osds
);
8988 for (auto& p
: osds
) {
8989 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
8991 // ss has reason for failure
8997 // empty class, remove directly
8998 err
= newcrush
.remove_class_name(device_class
);
9000 ss
<< "class '" << device_class
<< "' cannot be removed '"
9001 << cpp_strerror(err
) << "'";
9006 pending_inc
.crush
.clear();
9007 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9008 ss
<< "removed class " << device_class
<< " with id " << class_id
9009 << " from crush map";
9011 } else if (prefix
== "osd crush class rename") {
9012 string srcname
, dstname
;
9013 if (!cmd_getval(cct
, cmdmap
, "srcname", srcname
)) {
9017 if (!cmd_getval(cct
, cmdmap
, "dstname", dstname
)) {
9022 CrushWrapper newcrush
;
9023 _get_pending_crush(newcrush
);
9024 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
9025 // suppose this is a replay and return success
9026 // so command is idempotent
9027 ss
<< "already renamed to '" << dstname
<< "'";
9032 err
= newcrush
.rename_class(srcname
, dstname
);
9034 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
9035 << cpp_strerror(err
);
9039 pending_inc
.crush
.clear();
9040 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9041 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
9043 } else if (prefix
== "osd crush add-bucket") {
9044 // os crush add-bucket <name> <type>
9045 string name
, typestr
;
9046 vector
<string
> argvec
;
9047 cmd_getval(cct
, cmdmap
, "name", name
);
9048 cmd_getval(cct
, cmdmap
, "type", typestr
);
9049 cmd_getval(cct
, cmdmap
, "args", argvec
);
9050 map
<string
,string
> loc
;
9051 if (!argvec
.empty()) {
9052 CrushWrapper::parse_loc_map(argvec
, &loc
);
9053 dout(0) << "will create and move bucket '" << name
9054 << "' to location " << loc
<< dendl
;
9057 if (!_have_pending_crush() &&
9058 _get_stable_crush().name_exists(name
)) {
9059 ss
<< "bucket '" << name
<< "' already exists";
9063 CrushWrapper newcrush
;
9064 _get_pending_crush(newcrush
);
9066 if (newcrush
.name_exists(name
)) {
9067 ss
<< "bucket '" << name
<< "' already exists";
9070 int type
= newcrush
.get_type_id(typestr
);
9072 ss
<< "type '" << typestr
<< "' does not exist";
9077 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
9082 err
= newcrush
.add_bucket(0, 0,
9083 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
9086 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
9089 err
= newcrush
.set_item_name(bucketno
, name
);
9091 ss
<< "error setting bucket name to '" << name
<< "'";
9096 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
9098 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
9100 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
9104 ss
<< "no need to move item id " << bucketno
<< " name '" << name
9105 << "' to location " << loc
<< " in crush map";
9109 pending_inc
.crush
.clear();
9110 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9112 ss
<< "added bucket " << name
<< " type " << typestr
9115 ss
<< "added bucket " << name
<< " type " << typestr
9116 << " to location " << loc
;
9119 } else if (prefix
== "osd crush rename-bucket") {
9120 string srcname
, dstname
;
9121 cmd_getval(cct
, cmdmap
, "srcname", srcname
);
9122 cmd_getval(cct
, cmdmap
, "dstname", dstname
);
9124 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
9125 if (err
== -EALREADY
) // equivalent to success for idempotency
9131 } else if (prefix
== "osd crush weight-set create" ||
9132 prefix
== "osd crush weight-set create-compat") {
9133 CrushWrapper newcrush
;
9134 _get_pending_crush(newcrush
);
9137 if (newcrush
.has_non_straw2_buckets()) {
9138 ss
<< "crush map contains one or more bucket(s) that are not straw2";
9142 if (prefix
== "osd crush weight-set create") {
9143 if (osdmap
.require_min_compat_client
> 0 &&
9144 osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
9145 ss
<< "require_min_compat_client "
9146 << ceph_release_name(osdmap
.require_min_compat_client
)
9147 << " < luminous, which is required for per-pool weight-sets. "
9148 << "Try 'ceph osd set-require-min-compat-client luminous' "
9149 << "before using the new interface";
9153 string poolname
, mode
;
9154 cmd_getval(cct
, cmdmap
, "pool", poolname
);
9155 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
9157 ss
<< "pool '" << poolname
<< "' not found";
9161 cmd_getval(cct
, cmdmap
, "mode", mode
);
9162 if (mode
!= "flat" && mode
!= "positional") {
9163 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
9167 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
9169 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
9172 if (!newcrush
.create_choose_args(pool
, positions
)) {
9173 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
9174 ss
<< "compat weight-set already created";
9176 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
9177 << "' already created";
9181 pending_inc
.crush
.clear();
9182 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9185 } else if (prefix
== "osd crush weight-set rm" ||
9186 prefix
== "osd crush weight-set rm-compat") {
9187 CrushWrapper newcrush
;
9188 _get_pending_crush(newcrush
);
9190 if (prefix
== "osd crush weight-set rm") {
9192 cmd_getval(cct
, cmdmap
, "pool", poolname
);
9193 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
9195 ss
<< "pool '" << poolname
<< "' not found";
9200 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
9202 newcrush
.rm_choose_args(pool
);
9203 pending_inc
.crush
.clear();
9204 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9207 } else if (prefix
== "osd crush weight-set reweight" ||
9208 prefix
== "osd crush weight-set reweight-compat") {
9209 string poolname
, item
;
9210 vector
<double> weight
;
9211 cmd_getval(cct
, cmdmap
, "pool", poolname
);
9212 cmd_getval(cct
, cmdmap
, "item", item
);
9213 cmd_getval(cct
, cmdmap
, "weight", weight
);
9214 CrushWrapper newcrush
;
9215 _get_pending_crush(newcrush
);
9217 if (prefix
== "osd crush weight-set reweight") {
9218 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
9220 ss
<< "pool '" << poolname
<< "' not found";
9224 if (!newcrush
.have_choose_args(pool
)) {
9225 ss
<< "no weight-set for pool '" << poolname
<< "'";
9229 auto arg_map
= newcrush
.choose_args_get(pool
);
9230 int positions
= newcrush
.get_choose_args_positions(arg_map
);
9231 if (weight
.size() != (size_t)positions
) {
9232 ss
<< "must specify exact " << positions
<< " weight values";
9237 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
9238 if (!newcrush
.have_choose_args(pool
)) {
9239 ss
<< "no backward-compatible weight-set";
9244 if (!newcrush
.name_exists(item
)) {
9245 ss
<< "item '" << item
<< "' does not exist";
9249 err
= newcrush
.choose_args_adjust_item_weightf(
9251 newcrush
.choose_args_get(pool
),
9252 newcrush
.get_item_id(item
),
9259 pending_inc
.crush
.clear();
9260 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9262 } else if (osdid_present
&&
9263 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
9264 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
9265 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
9266 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
9268 if (!osdmap
.exists(osdid
)) {
9271 << " does not exist. Create it before updating the crush map";
9276 if (!cmd_getval(cct
, cmdmap
, "weight", weight
)) {
9277 ss
<< "unable to parse weight value '"
9278 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9284 vector
<string
> argvec
;
9285 cmd_getval(cct
, cmdmap
, "args", argvec
);
9286 map
<string
,string
> loc
;
9287 CrushWrapper::parse_loc_map(argvec
, &loc
);
9289 if (prefix
== "osd crush set"
9290 && !_get_stable_crush().item_exists(osdid
)) {
9292 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
9293 << "' weight " << weight
<< " at location " << loc
9294 << ": does not exist";
9298 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
9299 << osd_name
<< "' weight " << weight
<< " at location "
9301 CrushWrapper newcrush
;
9302 _get_pending_crush(newcrush
);
9305 if (prefix
== "osd crush set" ||
9306 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
9308 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
9311 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
9319 if (err
== 0 && !_have_pending_crush()) {
9320 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
9321 << "' weight " << weight
<< " at location " << loc
<< ": no change";
9325 pending_inc
.crush
.clear();
9326 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9327 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
9328 << weight
<< " at location " << loc
<< " to crush map";
9330 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9331 get_last_committed() + 1));
9334 } else if (prefix
== "osd crush create-or-move") {
9336 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
9337 if (!osdmap
.exists(osdid
)) {
9340 << " does not exist. create it before updating the crush map";
9345 if (!cmd_getval(cct
, cmdmap
, "weight", weight
)) {
9346 ss
<< "unable to parse weight value '"
9347 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9353 vector
<string
> argvec
;
9354 cmd_getval(cct
, cmdmap
, "args", argvec
);
9355 map
<string
,string
> loc
;
9356 CrushWrapper::parse_loc_map(argvec
, &loc
);
9358 dout(0) << "create-or-move crush item name '" << osd_name
9359 << "' initial_weight " << weight
<< " at location " << loc
9362 CrushWrapper newcrush
;
9363 _get_pending_crush(newcrush
);
9365 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
9366 g_conf()->osd_crush_update_weight_set
);
9368 ss
<< "create-or-move updated item name '" << osd_name
9369 << "' weight " << weight
9370 << " at location " << loc
<< " to crush map";
9374 pending_inc
.crush
.clear();
9375 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9376 ss
<< "create-or-move updating item name '" << osd_name
9377 << "' weight " << weight
9378 << " at location " << loc
<< " to crush map";
9380 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9381 get_last_committed() + 1));
9386 } else if (prefix
== "osd crush move") {
9388 // osd crush move <name> <loc1> [<loc2> ...]
9390 vector
<string
> argvec
;
9391 cmd_getval(cct
, cmdmap
, "name", name
);
9392 cmd_getval(cct
, cmdmap
, "args", argvec
);
9393 map
<string
,string
> loc
;
9394 CrushWrapper::parse_loc_map(argvec
, &loc
);
9396 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
9397 CrushWrapper newcrush
;
9398 _get_pending_crush(newcrush
);
9400 if (!newcrush
.name_exists(name
)) {
9402 ss
<< "item " << name
<< " does not exist";
9405 int id
= newcrush
.get_item_id(name
);
9407 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
9409 err
= newcrush
.create_or_move_item(
9410 cct
, id
, 0, name
, loc
,
9411 g_conf()->osd_crush_update_weight_set
);
9413 err
= newcrush
.move_bucket(cct
, id
, loc
);
9416 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
9417 pending_inc
.crush
.clear();
9418 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9420 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9421 get_last_committed() + 1));
9425 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
9429 } else if (prefix
== "osd crush swap-bucket") {
9430 string source
, dest
;
9431 cmd_getval(cct
, cmdmap
, "source", source
);
9432 cmd_getval(cct
, cmdmap
, "dest", dest
);
9435 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", force
);
9437 CrushWrapper newcrush
;
9438 _get_pending_crush(newcrush
);
9439 if (!newcrush
.name_exists(source
)) {
9440 ss
<< "source item " << source
<< " does not exist";
9444 if (!newcrush
.name_exists(dest
)) {
9445 ss
<< "dest item " << dest
<< " does not exist";
9449 int sid
= newcrush
.get_item_id(source
);
9450 int did
= newcrush
.get_item_id(dest
);
9452 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
9453 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
9457 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
9459 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
9460 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
9461 << "; pass --yes-i-really-mean-it to proceed anyway";
9465 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
9467 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
9471 ss
<< "swapped bucket of " << source
<< " to " << dest
;
9472 pending_inc
.crush
.clear();
9473 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9474 wait_for_finished_proposal(op
,
9475 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
9476 get_last_committed() + 1));
9478 } else if (prefix
== "osd crush link") {
9479 // osd crush link <name> <loc1> [<loc2> ...]
9481 cmd_getval(cct
, cmdmap
, "name", name
);
9482 vector
<string
> argvec
;
9483 cmd_getval(cct
, cmdmap
, "args", argvec
);
9484 map
<string
,string
> loc
;
9485 CrushWrapper::parse_loc_map(argvec
, &loc
);
9487 // Need an explicit check for name_exists because get_item_id returns
9489 int id
= osdmap
.crush
->get_item_id(name
);
9490 if (!osdmap
.crush
->name_exists(name
)) {
9492 ss
<< "item " << name
<< " does not exist";
9495 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
9497 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
9498 ss
<< "no need to move item id " << id
<< " name '" << name
9499 << "' to location " << loc
<< " in crush map";
9504 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
9505 CrushWrapper newcrush
;
9506 _get_pending_crush(newcrush
);
9508 if (!newcrush
.name_exists(name
)) {
9510 ss
<< "item " << name
<< " does not exist";
9513 int id
= newcrush
.get_item_id(name
);
9514 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
9515 err
= newcrush
.link_bucket(cct
, id
, loc
);
9517 ss
<< "linked item id " << id
<< " name '" << name
9518 << "' to location " << loc
<< " in crush map";
9519 pending_inc
.crush
.clear();
9520 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9522 ss
<< "cannot link item id " << id
<< " name '" << name
9523 << "' to location " << loc
;
9527 ss
<< "no need to move item id " << id
<< " name '" << name
9528 << "' to location " << loc
<< " in crush map";
9532 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
9533 get_last_committed() + 1));
9535 } else if (prefix
== "osd crush rm" ||
9536 prefix
== "osd crush remove" ||
9537 prefix
== "osd crush unlink") {
9539 // osd crush rm <id> [ancestor]
9540 CrushWrapper newcrush
;
9541 _get_pending_crush(newcrush
);
9544 cmd_getval(cct
, cmdmap
, "name", name
);
9546 if (!osdmap
.crush
->name_exists(name
)) {
9548 ss
<< "device '" << name
<< "' does not appear in the crush map";
9551 if (!newcrush
.name_exists(name
)) {
9553 ss
<< "device '" << name
<< "' does not appear in the crush map";
9555 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9556 get_last_committed() + 1));
9559 int id
= newcrush
.get_item_id(name
);
9562 bool unlink_only
= prefix
== "osd crush unlink";
9563 string ancestor_str
;
9564 if (cmd_getval(cct
, cmdmap
, "ancestor", ancestor_str
)) {
9565 if (!newcrush
.name_exists(ancestor_str
)) {
9567 ss
<< "ancestor item '" << ancestor_str
9568 << "' does not appear in the crush map";
9571 ancestor
= newcrush
.get_item_id(ancestor_str
);
9574 err
= prepare_command_osd_crush_remove(
9577 (ancestor
< 0), unlink_only
);
9579 if (err
== -ENOENT
) {
9580 ss
<< "item " << id
<< " does not appear in that position";
9586 pending_inc
.new_crush_node_flags
[id
] = 0;
9587 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
9589 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9590 get_last_committed() + 1));
9595 } else if (prefix
== "osd crush reweight-all") {
9596 CrushWrapper newcrush
;
9597 _get_pending_crush(newcrush
);
9599 newcrush
.reweight(cct
);
9600 pending_inc
.crush
.clear();
9601 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9602 ss
<< "reweighted crush hierarchy";
9604 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9605 get_last_committed() + 1));
9607 } else if (prefix
== "osd crush reweight") {
9608 // osd crush reweight <name> <weight>
9609 CrushWrapper newcrush
;
9610 _get_pending_crush(newcrush
);
9613 cmd_getval(cct
, cmdmap
, "name", name
);
9614 if (!newcrush
.name_exists(name
)) {
9616 ss
<< "device '" << name
<< "' does not appear in the crush map";
9620 int id
= newcrush
.get_item_id(name
);
9622 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
9627 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
9628 ss
<< "unable to parse weight value '"
9629 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9634 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
9635 g_conf()->osd_crush_update_weight_set
);
9638 pending_inc
.crush
.clear();
9639 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9640 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
9643 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9644 get_last_committed() + 1));
9646 } else if (prefix
== "osd crush reweight-subtree") {
9647 // osd crush reweight <name> <weight>
9648 CrushWrapper newcrush
;
9649 _get_pending_crush(newcrush
);
9652 cmd_getval(cct
, cmdmap
, "name", name
);
9653 if (!newcrush
.name_exists(name
)) {
9655 ss
<< "device '" << name
<< "' does not appear in the crush map";
9659 int id
= newcrush
.get_item_id(name
);
9661 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
9666 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
9667 ss
<< "unable to parse weight value '"
9668 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
9673 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
9674 g_conf()->osd_crush_update_weight_set
);
9677 pending_inc
.crush
.clear();
9678 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9679 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
9682 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9683 get_last_committed() + 1));
9685 } else if (prefix
== "osd crush tunables") {
9686 CrushWrapper newcrush
;
9687 _get_pending_crush(newcrush
);
9691 cmd_getval(cct
, cmdmap
, "profile", profile
);
9692 if (profile
== "legacy" || profile
== "argonaut") {
9693 newcrush
.set_tunables_legacy();
9694 } else if (profile
== "bobtail") {
9695 newcrush
.set_tunables_bobtail();
9696 } else if (profile
== "firefly") {
9697 newcrush
.set_tunables_firefly();
9698 } else if (profile
== "hammer") {
9699 newcrush
.set_tunables_hammer();
9700 } else if (profile
== "jewel") {
9701 newcrush
.set_tunables_jewel();
9702 } else if (profile
== "optimal") {
9703 newcrush
.set_tunables_optimal();
9704 } else if (profile
== "default") {
9705 newcrush
.set_tunables_default();
9707 ss
<< "unrecognized profile '" << profile
<< "'";
9712 if (!validate_crush_against_features(&newcrush
, ss
)) {
9717 pending_inc
.crush
.clear();
9718 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9719 ss
<< "adjusted tunables profile to " << profile
;
9721 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9722 get_last_committed() + 1));
9724 } else if (prefix
== "osd crush set-tunable") {
9725 CrushWrapper newcrush
;
9726 _get_pending_crush(newcrush
);
9730 cmd_getval(cct
, cmdmap
, "tunable", tunable
);
9733 if (!cmd_getval(cct
, cmdmap
, "value", value
)) {
9735 ss
<< "failed to parse integer value "
9736 << cmd_vartype_stringify(cmdmap
.at("value"));
9740 if (tunable
== "straw_calc_version") {
9741 if (value
!= 0 && value
!= 1) {
9742 ss
<< "value must be 0 or 1; got " << value
;
9746 newcrush
.set_straw_calc_version(value
);
9748 ss
<< "unrecognized tunable '" << tunable
<< "'";
9753 if (!validate_crush_against_features(&newcrush
, ss
)) {
9758 pending_inc
.crush
.clear();
9759 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9760 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
9762 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9763 get_last_committed() + 1));
9766 } else if (prefix
== "osd crush rule create-simple") {
9767 string name
, root
, type
, mode
;
9768 cmd_getval(cct
, cmdmap
, "name", name
);
9769 cmd_getval(cct
, cmdmap
, "root", root
);
9770 cmd_getval(cct
, cmdmap
, "type", type
);
9771 cmd_getval(cct
, cmdmap
, "mode", mode
);
9775 if (osdmap
.crush
->rule_exists(name
)) {
9776 // The name is uniquely associated to a ruleid and the rule it contains
9777 // From the user point of view, the rule is more meaningfull.
9778 ss
<< "rule " << name
<< " already exists";
9783 CrushWrapper newcrush
;
9784 _get_pending_crush(newcrush
);
9786 if (newcrush
.rule_exists(name
)) {
9787 // The name is uniquely associated to a ruleid and the rule it contains
9788 // From the user point of view, the rule is more meaningfull.
9789 ss
<< "rule " << name
<< " already exists";
9792 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
9793 pg_pool_t::TYPE_REPLICATED
, &ss
);
9799 pending_inc
.crush
.clear();
9800 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9803 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9804 get_last_committed() + 1));
9807 } else if (prefix
== "osd crush rule create-replicated") {
9808 string name
, root
, type
, device_class
;
9809 cmd_getval(cct
, cmdmap
, "name", name
);
9810 cmd_getval(cct
, cmdmap
, "root", root
);
9811 cmd_getval(cct
, cmdmap
, "type", type
);
9812 cmd_getval(cct
, cmdmap
, "class", device_class
);
9814 if (osdmap
.crush
->rule_exists(name
)) {
9815 // The name is uniquely associated to a ruleid and the rule it contains
9816 // From the user point of view, the rule is more meaningfull.
9817 ss
<< "rule " << name
<< " already exists";
9822 CrushWrapper newcrush
;
9823 _get_pending_crush(newcrush
);
9825 if (newcrush
.rule_exists(name
)) {
9826 // The name is uniquely associated to a ruleid and the rule it contains
9827 // From the user point of view, the rule is more meaningfull.
9828 ss
<< "rule " << name
<< " already exists";
9831 int ruleno
= newcrush
.add_simple_rule(
9832 name
, root
, type
, device_class
,
9833 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
9839 pending_inc
.crush
.clear();
9840 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9843 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9844 get_last_committed() + 1));
9847 } else if (prefix
== "osd erasure-code-profile rm") {
9849 cmd_getval(cct
, cmdmap
, "name", name
);
9851 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
9854 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
9859 if (osdmap
.has_erasure_code_profile(name
) ||
9860 pending_inc
.new_erasure_code_profiles
.count(name
)) {
9861 if (osdmap
.has_erasure_code_profile(name
)) {
9862 pending_inc
.old_erasure_code_profiles
.push_back(name
);
9864 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
9865 pending_inc
.new_erasure_code_profiles
.erase(name
);
9869 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9870 get_last_committed() + 1));
9873 ss
<< "erasure-code-profile " << name
<< " does not exist";
9878 } else if (prefix
== "osd erasure-code-profile set") {
9880 cmd_getval(cct
, cmdmap
, "name", name
);
9881 vector
<string
> profile
;
9882 cmd_getval(cct
, cmdmap
, "profile", profile
);
9885 cmd_getval(cct
, cmdmap
, "force", force
);
9887 map
<string
,string
> profile_map
;
9888 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
9891 if (profile_map
.find("plugin") == profile_map
.end()) {
9892 ss
<< "erasure-code-profile " << profile_map
9893 << " must contain a plugin entry" << std::endl
;
9897 string plugin
= profile_map
["plugin"];
9899 if (pending_inc
.has_erasure_code_profile(name
)) {
9900 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
9903 err
= normalize_profile(name
, profile_map
, force
, &ss
);
9907 if (osdmap
.has_erasure_code_profile(name
)) {
9908 ErasureCodeProfile existing_profile_map
=
9909 osdmap
.get_erasure_code_profile(name
);
9910 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
9914 if (existing_profile_map
== profile_map
) {
9920 ss
<< "will not override erasure code profile " << name
9921 << " because the existing profile "
9922 << existing_profile_map
9923 << " is different from the proposed profile "
9929 dout(20) << "erasure code profile set " << name
<< "="
9930 << profile_map
<< dendl
;
9931 pending_inc
.set_erasure_code_profile(name
, profile_map
);
9935 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9936 get_last_committed() + 1));
9939 } else if (prefix
== "osd crush rule create-erasure") {
9940 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
9945 string name
, poolstr
;
9946 cmd_getval(cct
, cmdmap
, "name", name
);
9948 cmd_getval(cct
, cmdmap
, "profile", profile
);
9950 profile
= "default";
9951 if (profile
== "default") {
9952 if (!osdmap
.has_erasure_code_profile(profile
)) {
9953 if (pending_inc
.has_erasure_code_profile(profile
)) {
9954 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
9958 map
<string
,string
> profile_map
;
9959 err
= osdmap
.get_erasure_code_profile_default(cct
,
9964 err
= normalize_profile(name
, profile_map
, true, &ss
);
9967 dout(20) << "erasure code profile set " << profile
<< "="
9968 << profile_map
<< dendl
;
9969 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
9975 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
9978 case -EEXIST
: // return immediately
9979 ss
<< "rule " << name
<< " already exists";
9983 case -EALREADY
: // wait for pending to be proposed
9984 ss
<< "rule " << name
<< " already exists";
9987 default: // non recoverable error
9992 ss
<< "created rule " << name
<< " at " << rule
;
9996 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9997 get_last_committed() + 1));
10000 } else if (prefix
== "osd crush rule rm") {
10002 cmd_getval(cct
, cmdmap
, "name", name
);
10004 if (!osdmap
.crush
->rule_exists(name
)) {
10005 ss
<< "rule " << name
<< " does not exist";
10010 CrushWrapper newcrush
;
10011 _get_pending_crush(newcrush
);
10013 if (!newcrush
.rule_exists(name
)) {
10014 ss
<< "rule " << name
<< " does not exist";
10017 int ruleno
= newcrush
.get_rule_id(name
);
10018 ceph_assert(ruleno
>= 0);
10020 // make sure it is not in use.
10021 // FIXME: this is ok in some situations, but let's not bother with that
10023 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
10024 if (osdmap
.crush_rule_in_use(ruleset
)) {
10025 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
10030 err
= newcrush
.remove_rule(ruleno
);
10035 pending_inc
.crush
.clear();
10036 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10039 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10040 get_last_committed() + 1));
10043 } else if (prefix
== "osd crush rule rename") {
10046 cmd_getval(cct
, cmdmap
, "srcname", srcname
);
10047 cmd_getval(cct
, cmdmap
, "dstname", dstname
);
10048 if (srcname
.empty() || dstname
.empty()) {
10049 ss
<< "must specify both source rule name and destination rule name";
10053 if (srcname
== dstname
) {
10054 ss
<< "destination rule name is equal to source rule name";
10059 CrushWrapper newcrush
;
10060 _get_pending_crush(newcrush
);
10061 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
10062 // srcname does not exist and dstname already exists
10063 // suppose this is a replay and return success
10064 // (so this command is idempotent)
10065 ss
<< "already renamed to '" << dstname
<< "'";
10070 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
10072 // ss has reason for failure
10075 pending_inc
.crush
.clear();
10076 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10078 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10079 get_last_committed() + 1));
10082 } else if (prefix
== "osd setmaxosd") {
10084 if (!cmd_getval(cct
, cmdmap
, "newmax", newmax
)) {
10085 ss
<< "unable to parse 'newmax' value '"
10086 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
10091 if (newmax
> g_conf()->mon_max_osd
) {
10093 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
10094 << g_conf()->mon_max_osd
<< ")";
10098 // Don't allow shrinking OSD number as this will cause data loss
10099 // and may cause kernel crashes.
10100 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10101 if (newmax
< osdmap
.get_max_osd()) {
10102 // Check if the OSDs exist between current max and new value.
10103 // If there are any OSDs exist, then don't allow shrinking number
10105 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
10106 if (osdmap
.exists(i
)) {
10108 ss
<< "cannot shrink max_osd to " << newmax
10109 << " because osd." << i
<< " (and possibly others) still in use";
10115 pending_inc
.new_max_osd
= newmax
;
10116 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
10118 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10119 get_last_committed() + 1));
10122 } else if (prefix
== "osd set-full-ratio" ||
10123 prefix
== "osd set-backfillfull-ratio" ||
10124 prefix
== "osd set-nearfull-ratio") {
10126 if (!cmd_getval(cct
, cmdmap
, "ratio", n
)) {
10127 ss
<< "unable to parse 'ratio' value '"
10128 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
10132 if (prefix
== "osd set-full-ratio")
10133 pending_inc
.new_full_ratio
= n
;
10134 else if (prefix
== "osd set-backfillfull-ratio")
10135 pending_inc
.new_backfillfull_ratio
= n
;
10136 else if (prefix
== "osd set-nearfull-ratio")
10137 pending_inc
.new_nearfull_ratio
= n
;
10138 ss
<< prefix
<< " " << n
;
10140 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10141 get_last_committed() + 1));
10143 } else if (prefix
== "osd set-require-min-compat-client") {
10145 cmd_getval(cct
, cmdmap
, "version", v
);
10146 int vno
= ceph_release_from_name(v
.c_str());
10148 ss
<< "version " << v
<< " is not recognized";
10153 newmap
.deepish_copy_from(osdmap
);
10154 newmap
.apply_incremental(pending_inc
);
10155 newmap
.require_min_compat_client
= vno
;
10156 auto mvno
= newmap
.get_min_compat_client();
10158 ss
<< "osdmap current utilizes features that require "
10159 << ceph_release_name(mvno
)
10160 << "; cannot set require_min_compat_client below that to "
10161 << ceph_release_name(vno
);
10166 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
10169 mon
->get_combined_feature_map(&m
);
10170 uint64_t features
= ceph_release_features(vno
);
10174 CEPH_ENTITY_TYPE_CLIENT
,
10175 CEPH_ENTITY_TYPE_MDS
,
10176 CEPH_ENTITY_TYPE_MGR
}) {
10177 auto p
= m
.m
.find(type
);
10178 if (p
== m
.m
.end()) {
10181 for (auto& q
: p
->second
) {
10182 uint64_t missing
= ~q
.first
& features
;
10185 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
10190 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
10191 << "(s) look like " << ceph_release_name(
10192 ceph_release_from_features(q
.first
))
10193 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
10199 ss
<< "; add --yes-i-really-mean-it to do it anyway";
10204 ss
<< "set require_min_compat_client to " << ceph_release_name(vno
);
10205 pending_inc
.new_require_min_compat_client
= vno
;
10207 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10208 get_last_committed() + 1));
10210 } else if (prefix
== "osd pause") {
10211 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10213 } else if (prefix
== "osd unpause") {
10214 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10216 } else if (prefix
== "osd set") {
10218 cmd_getval(g_ceph_context
, cmdmap
, "yes_i_really_mean_it", sure
);
10221 cmd_getval(cct
, cmdmap
, "key", key
);
10223 return prepare_set_flag(op
, CEPH_OSDMAP_FULL
);
10224 else if (key
== "pause")
10225 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10226 else if (key
== "noup")
10227 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
10228 else if (key
== "nodown")
10229 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
10230 else if (key
== "noout")
10231 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
10232 else if (key
== "noin")
10233 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
10234 else if (key
== "nobackfill")
10235 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
10236 else if (key
== "norebalance")
10237 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
10238 else if (key
== "norecover")
10239 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
10240 else if (key
== "noscrub")
10241 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
10242 else if (key
== "nodeep-scrub")
10243 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
10244 else if (key
== "notieragent")
10245 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
10246 else if (key
== "nosnaptrim")
10247 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
10248 else if (key
== "pglog_hardlimit") {
10249 if (!osdmap
.get_num_up_osds() && !sure
) {
10250 ss
<< "Not advisable to continue since no OSDs are up. Pass "
10251 << "--yes-i-really-mean-it if you really wish to continue.";
10255 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
10256 // we are reusing a jewel feature bit that was retired in luminous.
10257 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
10258 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
10260 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
10262 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
10267 ss
<< "unrecognized flag '" << key
<< "'";
10271 } else if (prefix
== "osd unset") {
10273 cmd_getval(cct
, cmdmap
, "key", key
);
10275 return prepare_unset_flag(op
, CEPH_OSDMAP_FULL
);
10276 else if (key
== "pause")
10277 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
10278 else if (key
== "noup")
10279 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
10280 else if (key
== "nodown")
10281 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
10282 else if (key
== "noout")
10283 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
10284 else if (key
== "noin")
10285 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
10286 else if (key
== "nobackfill")
10287 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
10288 else if (key
== "norebalance")
10289 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
10290 else if (key
== "norecover")
10291 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
10292 else if (key
== "noscrub")
10293 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
10294 else if (key
== "nodeep-scrub")
10295 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
10296 else if (key
== "notieragent")
10297 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
10298 else if (key
== "nosnaptrim")
10299 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
10301 ss
<< "unrecognized flag '" << key
<< "'";
10305 } else if (prefix
== "osd require-osd-release") {
10307 cmd_getval(cct
, cmdmap
, "release", release
);
10309 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
10310 int rel
= ceph_release_from_name(release
.c_str());
10312 ss
<< "unrecognized release " << release
;
10316 if (rel
== osdmap
.require_osd_release
) {
10321 ceph_assert(osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
10322 if (!osdmap
.get_num_up_osds() && !sure
) {
10323 ss
<< "Not advisable to continue since no OSDs are up. Pass "
10324 << "--yes-i-really-mean-it if you really wish to continue.";
10328 if (rel
== CEPH_RELEASE_MIMIC
) {
10329 if (!mon
->monmap
->get_required_features().contains_all(
10330 ceph::features::mon::FEATURE_MIMIC
)) {
10331 ss
<< "not all mons are mimic";
10335 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
10337 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
10341 } else if (rel
== CEPH_RELEASE_NAUTILUS
) {
10342 if (!mon
->monmap
->get_required_features().contains_all(
10343 ceph::features::mon::FEATURE_NAUTILUS
)) {
10344 ss
<< "not all mons are nautilus";
10348 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
10350 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
10355 ss
<< "not supported for this release yet";
10359 if (rel
< osdmap
.require_osd_release
) {
10360 ss
<< "require_osd_release cannot be lowered once it has been set";
10364 pending_inc
.new_require_osd_release
= rel
;
10366 } else if (prefix
== "osd down" ||
10367 prefix
== "osd out" ||
10368 prefix
== "osd in" ||
10369 prefix
== "osd rm") {
10373 bool verbose
= true;
10375 vector
<string
> idvec
;
10376 cmd_getval(cct
, cmdmap
, "ids", idvec
);
10377 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10382 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10383 if (prefix
== "osd in") {
10384 // touch out osds only
10385 osdmap
.get_out_existing_osds(osds
);
10387 osdmap
.get_all_osds(osds
);
10390 verbose
= false; // so the output is less noisy.
10392 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10394 ss
<< "invalid osd id" << osd
;
10397 } else if (!osdmap
.exists(osd
)) {
10398 ss
<< "osd." << osd
<< " does not exist. ";
10405 for (auto &osd
: osds
) {
10406 if (prefix
== "osd down") {
10407 if (osdmap
.is_down(osd
)) {
10409 ss
<< "osd." << osd
<< " is already down. ";
10411 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
10412 ss
<< "marked down osd." << osd
<< ". ";
10415 } else if (prefix
== "osd out") {
10416 if (osdmap
.is_out(osd
)) {
10418 ss
<< "osd." << osd
<< " is already out. ";
10420 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
10421 if (osdmap
.osd_weight
[osd
]) {
10422 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
10423 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
10425 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
10427 ss
<< "marked out osd." << osd
<< ". ";
10428 std::ostringstream msg
;
10429 msg
<< "Client " << op
->get_session()->entity_name
10430 << " marked osd." << osd
<< " out";
10431 if (osdmap
.is_up(osd
)) {
10432 msg
<< ", while it was still marked up";
10434 auto period
= ceph_clock_now() - down_pending_out
[osd
];
10435 msg
<< ", after it was down for " << int(period
.sec())
10439 mon
->clog
->info() << msg
.str();
10442 } else if (prefix
== "osd in") {
10443 if (osdmap
.is_in(osd
)) {
10445 ss
<< "osd." << osd
<< " is already in. ";
10447 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
10448 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
10449 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
10450 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
10452 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
10454 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
10456 ss
<< "marked in osd." << osd
<< ". ";
10459 } else if (prefix
== "osd rm") {
10460 err
= prepare_command_osd_remove(osd
);
10462 if (err
== -EBUSY
) {
10465 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
10467 ceph_assert(err
== 0);
10469 ss
<< ", osd." << osd
;
10471 ss
<< "removed osd." << osd
;
10480 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
10481 get_last_committed() + 1));
10484 } else if (prefix
== "osd set-group" ||
10485 prefix
== "osd unset-group" ||
10486 prefix
== "osd add-noup" ||
10487 prefix
== "osd add-nodown" ||
10488 prefix
== "osd add-noin" ||
10489 prefix
== "osd add-noout" ||
10490 prefix
== "osd rm-noup" ||
10491 prefix
== "osd rm-nodown" ||
10492 prefix
== "osd rm-noin" ||
10493 prefix
== "osd rm-noout") {
10494 bool do_set
= prefix
== "osd set-group" ||
10495 prefix
.find("add") != string::npos
;
10497 unsigned flags
= 0;
10498 vector
<string
> who
;
10499 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
10500 cmd_getval(cct
, cmdmap
, "flags", flag_str
);
10501 cmd_getval(cct
, cmdmap
, "who", who
);
10502 vector
<string
> raw_flags
;
10503 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
10504 for (auto& f
: raw_flags
) {
10506 flags
|= CEPH_OSD_NOUP
;
10507 else if (f
== "nodown")
10508 flags
|= CEPH_OSD_NODOWN
;
10509 else if (f
== "noin")
10510 flags
|= CEPH_OSD_NOIN
;
10511 else if (f
== "noout")
10512 flags
|= CEPH_OSD_NOOUT
;
10514 ss
<< "unrecognized flag '" << f
<< "', must be one of "
10515 << "{noup,nodown,noin,noout}";
10521 cmd_getval(cct
, cmdmap
, "ids", who
);
10522 if (prefix
.find("noup") != string::npos
)
10523 flags
= CEPH_OSD_NOUP
;
10524 else if (prefix
.find("nodown") != string::npos
)
10525 flags
= CEPH_OSD_NODOWN
;
10526 else if (prefix
.find("noin") != string::npos
)
10527 flags
= CEPH_OSD_NOIN
;
10528 else if (prefix
.find("noout") != string::npos
)
10529 flags
= CEPH_OSD_NOOUT
;
10531 ceph_assert(0 == "Unreachable!");
10534 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
10539 ss
<< "must specify at least one or more targets to set/unset";
10544 set
<int> crush_nodes
;
10545 set
<int> device_classes
;
10546 for (auto& w
: who
) {
10547 if (w
== "any" || w
== "all" || w
== "*") {
10548 osdmap
.get_all_osds(osds
);
10551 std::stringstream ts
;
10552 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
10554 } else if (osdmap
.crush
->name_exists(w
)) {
10555 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
10556 } else if (osdmap
.crush
->class_exists(w
)) {
10557 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
10559 ss
<< "unable to parse osd id or crush node or device class: "
10560 << "\"" << w
<< "\". ";
10563 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
10564 // ss has reason for failure
10569 for (auto osd
: osds
) {
10570 if (!osdmap
.exists(osd
)) {
10571 ss
<< "osd." << osd
<< " does not exist. ";
10575 if (flags
& CEPH_OSD_NOUP
) {
10576 any
|= osdmap
.is_noup_by_osd(osd
) ?
10577 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
10578 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
10580 if (flags
& CEPH_OSD_NODOWN
) {
10581 any
|= osdmap
.is_nodown_by_osd(osd
) ?
10582 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
10583 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
10585 if (flags
& CEPH_OSD_NOIN
) {
10586 any
|= osdmap
.is_noin_by_osd(osd
) ?
10587 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
10588 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
10590 if (flags
& CEPH_OSD_NOOUT
) {
10591 any
|= osdmap
.is_noout_by_osd(osd
) ?
10592 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
10593 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
10596 if (flags
& CEPH_OSD_NOUP
) {
10597 any
|= osdmap
.is_noup_by_osd(osd
) ?
10598 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
10599 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
10601 if (flags
& CEPH_OSD_NODOWN
) {
10602 any
|= osdmap
.is_nodown_by_osd(osd
) ?
10603 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
10604 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
10606 if (flags
& CEPH_OSD_NOIN
) {
10607 any
|= osdmap
.is_noin_by_osd(osd
) ?
10608 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
10609 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
10611 if (flags
& CEPH_OSD_NOOUT
) {
10612 any
|= osdmap
.is_noout_by_osd(osd
) ?
10613 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
10614 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
10618 for (auto& id
: crush_nodes
) {
10619 auto old_flags
= osdmap
.get_crush_node_flags(id
);
10620 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
10621 pending_flags
|= old_flags
; // adopt existing flags first!
10623 pending_flags
|= flags
;
10625 pending_flags
&= ~flags
;
10629 for (auto& id
: device_classes
) {
10630 auto old_flags
= osdmap
.get_device_class_flags(id
);
10631 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
10632 pending_flags
|= old_flags
;
10634 pending_flags
|= flags
;
10636 pending_flags
&= ~flags
;
10642 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
10643 get_last_committed() + 1));
10646 } else if (prefix
== "osd pg-temp") {
10648 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
10649 ss
<< "unable to parse 'pgid' value '"
10650 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
10655 if (!pgid
.parse(pgidstr
.c_str())) {
10656 ss
<< "invalid pgid '" << pgidstr
<< "'";
10660 if (!osdmap
.pg_exists(pgid
)) {
10661 ss
<< "pg " << pgid
<< " does not exist";
10665 if (pending_inc
.new_pg_temp
.count(pgid
)) {
10666 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
10667 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10671 vector
<int64_t> id_vec
;
10672 vector
<int32_t> new_pg_temp
;
10673 cmd_getval(cct
, cmdmap
, "id", id_vec
);
10674 if (id_vec
.empty()) {
10675 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
10676 ss
<< "done cleaning up pg_temp of " << pgid
;
10679 for (auto osd
: id_vec
) {
10680 if (!osdmap
.exists(osd
)) {
10681 ss
<< "osd." << osd
<< " does not exist";
10685 new_pg_temp
.push_back(osd
);
10688 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
10689 if ((int)new_pg_temp
.size() < pool_min_size
) {
10690 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
10691 << pool_min_size
<< ")";
10696 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10697 if ((int)new_pg_temp
.size() > pool_size
) {
10698 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
10699 << pool_size
<< ")";
10704 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
10705 new_pg_temp
.begin(), new_pg_temp
.end());
10706 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
10708 } else if (prefix
== "osd primary-temp") {
10710 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
10711 ss
<< "unable to parse 'pgid' value '"
10712 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
10717 if (!pgid
.parse(pgidstr
.c_str())) {
10718 ss
<< "invalid pgid '" << pgidstr
<< "'";
10722 if (!osdmap
.pg_exists(pgid
)) {
10723 ss
<< "pg " << pgid
<< " does not exist";
10729 if (!cmd_getval(cct
, cmdmap
, "id", osd
)) {
10730 ss
<< "unable to parse 'id' value '"
10731 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
10735 if (osd
!= -1 && !osdmap
.exists(osd
)) {
10736 ss
<< "osd." << osd
<< " does not exist";
10741 if (osdmap
.require_min_compat_client
> 0 &&
10742 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
10743 ss
<< "require_min_compat_client "
10744 << ceph_release_name(osdmap
.require_min_compat_client
)
10745 << " < firefly, which is required for primary-temp";
10750 pending_inc
.new_primary_temp
[pgid
] = osd
;
10751 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
10753 } else if (prefix
== "pg repeer") {
10756 cmd_getval(cct
, cmdmap
, "pgid", pgidstr
);
10757 if (!pgid
.parse(pgidstr
.c_str())) {
10758 ss
<< "invalid pgid '" << pgidstr
<< "'";
10762 if (!osdmap
.pg_exists(pgid
)) {
10763 ss
<< "pg '" << pgidstr
<< "' does not exist";
10767 vector
<int> acting
;
10769 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
10772 ss
<< "pg currently has no primary";
10775 if (acting
.size() > 1) {
10776 // map to just primary; it will map back to what it wants
10777 pending_inc
.new_pg_temp
[pgid
] = { primary
};
10779 // hmm, pick another arbitrary osd to induce a change. Note
10780 // that this won't work if there is only one suitable OSD in the cluster.
10783 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
10784 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
10787 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
10793 ss
<< "not enough up OSDs in the cluster to force repeer";
10798 } else if (prefix
== "osd pg-upmap" ||
10799 prefix
== "osd rm-pg-upmap" ||
10800 prefix
== "osd pg-upmap-items" ||
10801 prefix
== "osd rm-pg-upmap-items") {
10802 if (osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
10803 ss
<< "min_compat_client "
10804 << ceph_release_name(osdmap
.require_min_compat_client
)
10805 << " < luminous, which is required for pg-upmap. "
10806 << "Try 'ceph osd set-require-min-compat-client luminous' "
10807 << "before using the new interface";
10811 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
10812 if (err
== -EAGAIN
)
10817 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
10818 ss
<< "unable to parse 'pgid' value '"
10819 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
10824 if (!pgid
.parse(pgidstr
.c_str())) {
10825 ss
<< "invalid pgid '" << pgidstr
<< "'";
10829 if (!osdmap
.pg_exists(pgid
)) {
10830 ss
<< "pg " << pgid
<< " does not exist";
10834 if (pending_inc
.old_pools
.count(pgid
.pool())) {
10835 ss
<< "pool of " << pgid
<< " is pending removal";
10838 wait_for_finished_proposal(op
,
10839 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
10847 OP_RM_PG_UPMAP_ITEMS
,
10850 if (prefix
== "osd pg-upmap") {
10851 option
= OP_PG_UPMAP
;
10852 } else if (prefix
== "osd rm-pg-upmap") {
10853 option
= OP_RM_PG_UPMAP
;
10854 } else if (prefix
== "osd pg-upmap-items") {
10855 option
= OP_PG_UPMAP_ITEMS
;
10857 option
= OP_RM_PG_UPMAP_ITEMS
;
10860 // check pending upmap changes
10862 case OP_PG_UPMAP
: // fall through
10863 case OP_RM_PG_UPMAP
:
10864 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
10865 pending_inc
.old_pg_upmap
.count(pgid
)) {
10866 dout(10) << __func__
<< " waiting for pending update on "
10868 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10873 case OP_PG_UPMAP_ITEMS
: // fall through
10874 case OP_RM_PG_UPMAP_ITEMS
:
10875 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
10876 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
10877 dout(10) << __func__
<< " waiting for pending update on "
10879 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10885 ceph_abort_msg("invalid option");
10891 vector
<int64_t> id_vec
;
10892 if (!cmd_getval(cct
, cmdmap
, "id", id_vec
)) {
10893 ss
<< "unable to parse 'id' value(s) '"
10894 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
10899 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
10900 if ((int)id_vec
.size() < pool_min_size
) {
10901 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
10902 << pool_min_size
<< ")";
10907 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10908 if ((int)id_vec
.size() > pool_size
) {
10909 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
10910 << pool_size
<< ")";
10915 vector
<int32_t> new_pg_upmap
;
10916 for (auto osd
: id_vec
) {
10917 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
10918 ss
<< "osd." << osd
<< " does not exist";
10922 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
10923 if (it
!= new_pg_upmap
.end()) {
10924 ss
<< "osd." << osd
<< " already exists, ";
10927 new_pg_upmap
.push_back(osd
);
10930 if (new_pg_upmap
.empty()) {
10931 ss
<< "no valid upmap items(pairs) is specified";
10936 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
10937 new_pg_upmap
.begin(), new_pg_upmap
.end());
10938 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
10942 case OP_RM_PG_UPMAP
:
10944 pending_inc
.old_pg_upmap
.insert(pgid
);
10945 ss
<< "clear " << pgid
<< " pg_upmap mapping";
10949 case OP_PG_UPMAP_ITEMS
:
10951 vector
<int64_t> id_vec
;
10952 if (!cmd_getval(cct
, cmdmap
, "id", id_vec
)) {
10953 ss
<< "unable to parse 'id' value(s) '"
10954 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
10959 if (id_vec
.size() % 2) {
10960 ss
<< "you must specify pairs of osd ids to be remapped";
10965 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10966 if ((int)(id_vec
.size() / 2) > pool_size
) {
10967 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
10968 << pool_size
<< ")";
10973 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
10974 ostringstream items
;
10976 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
10980 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
10983 if (!osdmap
.exists(from
)) {
10984 ss
<< "osd." << from
<< " does not exist";
10988 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
10989 ss
<< "osd." << to
<< " does not exist";
10993 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
10994 auto it
= std::find(new_pg_upmap_items
.begin(),
10995 new_pg_upmap_items
.end(), entry
);
10996 if (it
!= new_pg_upmap_items
.end()) {
10997 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
11000 new_pg_upmap_items
.push_back(entry
);
11001 items
<< from
<< "->" << to
<< ",";
11003 string
out(items
.str());
11004 out
.resize(out
.size() - 1); // drop last ','
11007 if (new_pg_upmap_items
.empty()) {
11008 ss
<< "no valid upmap items(pairs) is specified";
11013 pending_inc
.new_pg_upmap_items
[pgid
] =
11014 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
11015 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
11016 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
11020 case OP_RM_PG_UPMAP_ITEMS
:
11022 pending_inc
.old_pg_upmap_items
.insert(pgid
);
11023 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
11028 ceph_abort_msg("invalid option");
11032 } else if (prefix
== "osd primary-affinity") {
11034 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11035 ss
<< "invalid osd id value '"
11036 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11041 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
11042 ss
<< "unable to parse 'weight' value '"
11043 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11047 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
11049 ss
<< "weight must be >= 0";
11053 if (osdmap
.require_min_compat_client
> 0 &&
11054 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
11055 ss
<< "require_min_compat_client "
11056 << ceph_release_name(osdmap
.require_min_compat_client
)
11057 << " < firefly, which is required for primary-affinity";
11061 if (osdmap
.exists(id
)) {
11062 pending_inc
.new_primary_affinity
[id
] = ww
;
11063 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
11065 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11066 get_last_committed() + 1));
11069 ss
<< "osd." << id
<< " does not exist";
11073 } else if (prefix
== "osd reweight") {
11075 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11076 ss
<< "unable to parse osd id value '"
11077 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11082 if (!cmd_getval(cct
, cmdmap
, "weight", w
)) {
11083 ss
<< "unable to parse weight value '"
11084 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11088 long ww
= (int)((double)CEPH_OSD_IN
*w
);
11090 ss
<< "weight must be >= 0";
11094 if (osdmap
.exists(id
)) {
11095 pending_inc
.new_weight
[id
] = ww
;
11096 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
11098 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11099 get_last_committed() + 1));
11102 ss
<< "osd." << id
<< " does not exist";
11106 } else if (prefix
== "osd reweightn") {
11107 map
<int32_t, uint32_t> weights
;
11108 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
11110 ss
<< "unable to parse 'weights' value '"
11111 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
11114 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
11115 wait_for_finished_proposal(
11117 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
11119 } else if (prefix
== "osd lost") {
11121 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11122 ss
<< "unable to parse osd id value '"
11123 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11128 cmd_getval(g_ceph_context
, cmdmap
, "yes_i_really_mean_it", sure
);
11130 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
11131 "--yes-i-really-mean-it if you really do.";
11134 } else if (!osdmap
.exists(id
)) {
11135 ss
<< "osd." << id
<< " does not exist";
11138 } else if (!osdmap
.is_down(id
)) {
11139 ss
<< "osd." << id
<< " is not down";
11143 epoch_t e
= osdmap
.get_info(id
).down_at
;
11144 pending_inc
.new_lost
[id
] = e
;
11145 ss
<< "marked osd lost in epoch " << e
;
11147 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11148 get_last_committed() + 1));
11152 } else if (prefix
== "osd destroy-actual" ||
11153 prefix
== "osd purge-actual" ||
11154 prefix
== "osd purge-new") {
11155 /* Destroying an OSD means that we don't expect to further make use of
11156 * the OSDs data (which may even become unreadable after this operation),
11157 * and that we are okay with scrubbing all its cephx keys and config-key
11158 * data (which may include lockbox keys, thus rendering the osd's data
11161 * The OSD will not be removed. Instead, we will mark it as destroyed,
11162 * such that a subsequent call to `create` will not reuse the osd id.
11163 * This will play into being able to recreate the OSD, at the same
11164 * crush location, with minimal data movement.
11167 // make sure authmon is writeable.
11168 if (!mon
->authmon()->is_writeable()) {
11169 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
11170 << "osd destroy" << dendl
;
11171 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
11176 if (!cmd_getval(cct
, cmdmap
, "id", id
)) {
11177 auto p
= cmdmap
.find("id");
11178 if (p
== cmdmap
.end()) {
11179 ss
<< "no osd id specified";
11181 ss
<< "unable to parse osd id value '"
11182 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
11188 bool is_destroy
= (prefix
== "osd destroy-actual");
11190 ceph_assert("osd purge-actual" == prefix
||
11191 "osd purge-new" == prefix
);
11195 cmd_getval(g_ceph_context
, cmdmap
, "yes_i_really_mean_it", sure
);
11197 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
11198 << "This will mean real, permanent data loss, as well "
11199 << "as deletion of cephx and lockbox keys. "
11200 << "Pass --yes-i-really-mean-it if you really do.";
11203 } else if (!osdmap
.exists(id
)) {
11204 ss
<< "osd." << id
<< " does not exist";
11205 err
= 0; // idempotent
11207 } else if (osdmap
.is_up(id
)) {
11208 ss
<< "osd." << id
<< " is not `down`.";
11211 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
11212 ss
<< "destroyed osd." << id
;
11217 if (prefix
== "osd purge-new" &&
11218 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
11219 ss
<< "osd." << id
<< " is not new";
11224 bool goto_reply
= false;
11228 err
= prepare_command_osd_destroy(id
, ss
);
11229 // we checked above that it should exist.
11230 ceph_assert(err
!= -ENOENT
);
11232 err
= prepare_command_osd_purge(id
, ss
);
11233 if (err
== -ENOENT
) {
11235 ss
<< "osd." << id
<< " does not exist.";
11241 if (err
< 0 || goto_reply
) {
11246 ss
<< "destroyed osd." << id
;
11248 ss
<< "purged osd." << id
;
11252 wait_for_finished_proposal(op
,
11253 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
11254 force_immediate_propose();
11257 } else if (prefix
== "osd new") {
11259 // make sure authmon is writeable.
11260 if (!mon
->authmon()->is_writeable()) {
11261 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
11262 << "osd new" << dendl
;
11263 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
11267 map
<string
,string
> param_map
;
11269 bufferlist bl
= m
->get_data();
11270 string param_json
= bl
.to_str();
11271 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
11273 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
11277 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
11280 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
11293 if (err
== EEXIST
) {
11294 // idempotent operation
11299 wait_for_finished_proposal(op
,
11300 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
11301 get_last_committed() + 1));
11302 force_immediate_propose();
11305 } else if (prefix
== "osd create") {
11307 // optional id provided?
11308 int64_t id
= -1, cmd_id
= -1;
11309 if (cmd_getval(cct
, cmdmap
, "id", cmd_id
)) {
11311 ss
<< "invalid osd id value '" << cmd_id
<< "'";
11315 dout(10) << " osd create got id " << cmd_id
<< dendl
;
11320 if (cmd_getval(cct
, cmdmap
, "uuid", uuidstr
)) {
11321 if (!uuid
.parse(uuidstr
.c_str())) {
11322 ss
<< "invalid uuid value '" << uuidstr
<< "'";
11326 // we only care about the id if we also have the uuid, to
11327 // ensure the operation's idempotency.
11331 int32_t new_id
= -1;
11332 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
11334 if (err
== -EAGAIN
) {
11335 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11338 // a check has failed; reply to the user.
11341 } else if (err
== EEXIST
) {
11342 // this is an idempotent operation; we can go ahead and reply.
11344 f
->open_object_section("created_osd");
11345 f
->dump_int("osdid", new_id
);
11346 f
->close_section();
11356 string empty_device_class
;
11357 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
11360 f
->open_object_section("created_osd");
11361 f
->dump_int("osdid", new_id
);
11362 f
->close_section();
11368 wait_for_finished_proposal(op
,
11369 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
11370 get_last_committed() + 1));
11373 } else if (prefix
== "osd blacklist clear") {
11374 pending_inc
.new_blacklist
.clear();
11375 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
11376 osdmap
.get_blacklist(&blacklist
);
11377 for (const auto &entry
: blacklist
) {
11378 pending_inc
.old_blacklist
.push_back(entry
.first
);
11380 ss
<< " removed all blacklist entries";
11382 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11383 get_last_committed() + 1));
11385 } else if (prefix
== "osd blacklist") {
11387 cmd_getval(cct
, cmdmap
, "addr", addrstr
);
11388 entity_addr_t addr
;
11389 if (!addr
.parse(addrstr
.c_str(), 0)) {
11390 ss
<< "unable to parse address " << addrstr
;
11395 if (osdmap
.require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
11396 // always blacklist type ANY
11397 addr
.set_type(entity_addr_t::TYPE_ANY
);
11399 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
11402 string blacklistop
;
11403 cmd_getval(cct
, cmdmap
, "blacklistop", blacklistop
);
11404 if (blacklistop
== "add") {
11405 utime_t expires
= ceph_clock_now();
11407 // default one hour
11408 cmd_getval(cct
, cmdmap
, "expire", d
,
11409 g_conf()->mon_osd_blacklist_default_expire
);
11412 pending_inc
.new_blacklist
[addr
] = expires
;
11415 // cancel any pending un-blacklisting request too
11416 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
11417 pending_inc
.old_blacklist
.end(), addr
);
11418 if (it
!= pending_inc
.old_blacklist
.end()) {
11419 pending_inc
.old_blacklist
.erase(it
);
11423 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
11425 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11426 get_last_committed() + 1));
11428 } else if (blacklistop
== "rm") {
11429 if (osdmap
.is_blacklisted(addr
) ||
11430 pending_inc
.new_blacklist
.count(addr
)) {
11431 if (osdmap
.is_blacklisted(addr
))
11432 pending_inc
.old_blacklist
.push_back(addr
);
11434 pending_inc
.new_blacklist
.erase(addr
);
11435 ss
<< "un-blacklisting " << addr
;
11437 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11438 get_last_committed() + 1));
11441 ss
<< addr
<< " isn't blacklisted";
11446 } else if (prefix
== "osd pool mksnap") {
11448 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11449 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
11451 ss
<< "unrecognized pool '" << poolstr
<< "'";
11456 cmd_getval(cct
, cmdmap
, "snap", snapname
);
11457 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
11458 if (p
->is_unmanaged_snaps_mode()) {
11459 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
11462 } else if (p
->snap_exists(snapname
.c_str())) {
11463 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
11466 } else if (p
->is_tier()) {
11467 ss
<< "pool " << poolstr
<< " is a cache tier";
11472 if (pending_inc
.new_pools
.count(pool
))
11473 pp
= &pending_inc
.new_pools
[pool
];
11475 pp
= &pending_inc
.new_pools
[pool
];
11478 if (pp
->snap_exists(snapname
.c_str())) {
11479 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
11481 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
11482 pp
->set_snap_epoch(pending_inc
.epoch
);
11483 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
11486 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11487 get_last_committed() + 1));
11489 } else if (prefix
== "osd pool rmsnap") {
11491 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11492 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
11494 ss
<< "unrecognized pool '" << poolstr
<< "'";
11499 cmd_getval(cct
, cmdmap
, "snap", snapname
);
11500 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
11501 if (p
->is_unmanaged_snaps_mode()) {
11502 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
11505 } else if (!p
->snap_exists(snapname
.c_str())) {
11506 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
11511 if (pending_inc
.new_pools
.count(pool
))
11512 pp
= &pending_inc
.new_pools
[pool
];
11514 pp
= &pending_inc
.new_pools
[pool
];
11517 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
11519 pp
->remove_snap(sn
);
11520 pp
->set_snap_epoch(pending_inc
.epoch
);
11521 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
11523 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
11526 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11527 get_last_committed() + 1));
11529 } else if (prefix
== "osd pool create") {
11530 int64_t pg_num
, pg_num_min
;
11532 cmd_getval(cct
, cmdmap
, "pg_num", pg_num
, int64_t(0));
11533 cmd_getval(cct
, cmdmap
, "pgp_num", pgp_num
, pg_num
);
11534 cmd_getval(cct
, cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
11536 string pool_type_str
;
11537 cmd_getval(cct
, cmdmap
, "pool_type", pool_type_str
);
11538 if (pool_type_str
.empty())
11539 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
11542 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11543 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11544 if (pool_id
>= 0) {
11545 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11546 if (pool_type_str
!= p
->get_type_name()) {
11547 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
11550 ss
<< "pool '" << poolstr
<< "' already exists";
11557 if (pool_type_str
== "replicated") {
11558 pool_type
= pg_pool_t::TYPE_REPLICATED
;
11559 } else if (pool_type_str
== "erasure") {
11560 pool_type
= pg_pool_t::TYPE_ERASURE
;
11562 ss
<< "unknown pool type '" << pool_type_str
<< "'";
11567 bool implicit_rule_creation
= false;
11568 int64_t expected_num_objects
= 0;
11570 cmd_getval(cct
, cmdmap
, "rule", rule_name
);
11571 string erasure_code_profile
;
11572 cmd_getval(cct
, cmdmap
, "erasure_code_profile", erasure_code_profile
);
11574 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
11575 if (erasure_code_profile
== "")
11576 erasure_code_profile
= "default";
11577 //handle the erasure code profile
11578 if (erasure_code_profile
== "default") {
11579 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
11580 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
11581 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
11585 map
<string
,string
> profile_map
;
11586 err
= osdmap
.get_erasure_code_profile_default(cct
,
11591 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
11592 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
11596 if (rule_name
== "") {
11597 implicit_rule_creation
= true;
11598 if (erasure_code_profile
== "default") {
11599 rule_name
= "erasure-code";
11601 dout(1) << "implicitly use rule named after the pool: "
11602 << poolstr
<< dendl
;
11603 rule_name
= poolstr
;
11606 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects",
11607 expected_num_objects
, int64_t(0));
11609 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
11610 // and put expected_num_objects to rule field
11611 if (erasure_code_profile
!= "") { // cmd is from CLI
11612 if (rule_name
!= "") {
11614 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
11615 if (interr
.length()) {
11616 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
11621 rule_name
= erasure_code_profile
;
11622 } else { // cmd is well-formed
11623 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects",
11624 expected_num_objects
, int64_t(0));
11628 if (!implicit_rule_creation
&& rule_name
!= "") {
11630 err
= get_crush_rule(rule_name
, &rule
, &ss
);
11631 if (err
== -EAGAIN
) {
11632 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11639 if (expected_num_objects
< 0) {
11640 ss
<< "'expected_num_objects' must be non-negative";
11645 if (expected_num_objects
> 0 &&
11646 cct
->_conf
->osd_objectstore
== "filestore" &&
11647 cct
->_conf
->filestore_merge_threshold
> 0) {
11648 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
11653 if (expected_num_objects
== 0 &&
11654 cct
->_conf
->osd_objectstore
== "filestore" &&
11655 cct
->_conf
->filestore_merge_threshold
< 0) {
11656 int osds
= osdmap
.get_num_osds();
11657 if (osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
11658 ss
<< "For better initial performance on pools expected to store a "
11659 << "large number of objects, consider supplying the "
11660 << "expected_num_objects parameter when creating the pool.\n";
11664 int64_t fast_read_param
;
11665 cmd_getval(cct
, cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
11666 FastReadType fast_read
= FAST_READ_DEFAULT
;
11667 if (fast_read_param
== 0)
11668 fast_read
= FAST_READ_OFF
;
11669 else if (fast_read_param
> 0)
11670 fast_read
= FAST_READ_ON
;
11672 int64_t repl_size
= 0;
11673 cmd_getval(cct
, cmdmap
, "size", repl_size
);
11674 int64_t target_size_bytes
= 0;
11675 double target_size_ratio
= 0.0;
11676 cmd_getval(cct
, cmdmap
, "target_size_bytes", target_size_bytes
);
11677 cmd_getval(cct
, cmdmap
, "target_size_ratio", target_size_ratio
);
11679 err
= prepare_new_pool(poolstr
,
11680 -1, // default crush rule
11682 pg_num
, pgp_num
, pg_num_min
,
11683 repl_size
, target_size_bytes
, target_size_ratio
,
11684 erasure_code_profile
, pool_type
,
11685 (uint64_t)expected_num_objects
,
11691 ss
<< "pool '" << poolstr
<< "' already exists";
11694 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11703 ss
<< "pool '" << poolstr
<< "' created";
11706 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11707 get_last_committed() + 1));
11710 } else if (prefix
== "osd pool delete" ||
11711 prefix
== "osd pool rm") {
11712 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
11713 string poolstr
, poolstr2
, sure
;
11714 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11715 cmd_getval(cct
, cmdmap
, "pool2", poolstr2
);
11716 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
11718 ss
<< "pool '" << poolstr
<< "' does not exist";
11723 bool force_no_fake
= false;
11724 cmd_getval(cct
, cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
11725 bool force
= false;
11726 cmd_getval(cct
, cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
11727 if (poolstr2
!= poolstr
||
11728 (!force
&& !force_no_fake
)) {
11729 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
11730 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
11731 << "followed by --yes-i-really-really-mean-it.";
11735 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
11736 if (err
== -EAGAIN
) {
11737 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11743 } else if (prefix
== "osd pool rename") {
11744 string srcpoolstr
, destpoolstr
;
11745 cmd_getval(cct
, cmdmap
, "srcpool", srcpoolstr
);
11746 cmd_getval(cct
, cmdmap
, "destpool", destpoolstr
);
11747 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
11748 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
11750 if (pool_src
< 0) {
11751 if (pool_dst
>= 0) {
11752 // src pool doesn't exist, dst pool does exist: to ensure idempotency
11753 // of operations, assume this rename succeeded, as it is not changing
11754 // the current state. Make sure we output something understandable
11755 // for whoever is issuing the command, if they are paying attention,
11756 // in case it was not intentional; or to avoid a "wtf?" and a bug
11757 // report in case it was intentional, while expecting a failure.
11758 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
11759 << destpoolstr
<< "' does -- assuming successful rename";
11762 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
11766 } else if (pool_dst
>= 0) {
11767 // source pool exists and so does the destination pool
11768 ss
<< "pool '" << destpoolstr
<< "' already exists";
11773 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
11775 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
11777 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
11778 << cpp_strerror(ret
);
11781 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
11782 get_last_committed() + 1));
11785 } else if (prefix
== "osd pool set") {
11786 err
= prepare_command_pool_set(cmdmap
, ss
);
11787 if (err
== -EAGAIN
)
11793 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11794 get_last_committed() + 1));
11796 } else if (prefix
== "osd tier add") {
11797 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11798 if (err
== -EAGAIN
)
11803 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11804 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11806 ss
<< "unrecognized pool '" << poolstr
<< "'";
11810 string tierpoolstr
;
11811 cmd_getval(cct
, cmdmap
, "tierpool", tierpoolstr
);
11812 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11813 if (tierpool_id
< 0) {
11814 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11818 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11820 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11823 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
11827 // make sure new tier is empty
11828 string force_nonempty
;
11829 cmd_getval(cct
, cmdmap
, "force_nonempty", force_nonempty
);
11830 const pool_stat_t
*pstats
= mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
11831 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
11832 force_nonempty
!= "--force-nonempty") {
11833 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
11837 if (tp
->is_erasure()) {
11838 ss
<< "tier pool '" << tierpoolstr
11839 << "' is an ec pool, which cannot be a tier";
11843 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
11844 ((force_nonempty
!= "--force-nonempty") ||
11845 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
11846 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
11851 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11852 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11853 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
11854 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11857 np
->tiers
.insert(tierpool_id
);
11858 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
11859 ntp
->tier_of
= pool_id
;
11860 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
11861 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11862 get_last_committed() + 1));
11864 } else if (prefix
== "osd tier remove" ||
11865 prefix
== "osd tier rm") {
11867 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11868 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11870 ss
<< "unrecognized pool '" << poolstr
<< "'";
11874 string tierpoolstr
;
11875 cmd_getval(cct
, cmdmap
, "tierpool", tierpoolstr
);
11876 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11877 if (tierpool_id
< 0) {
11878 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11882 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11884 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11887 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
11891 if (p
->tiers
.count(tierpool_id
) == 0) {
11892 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
11896 if (tp
->tier_of
!= pool_id
) {
11897 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
11898 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
11899 // be scary about it; this is an inconsistency and bells must go off
11900 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11904 if (p
->read_tier
== tierpool_id
) {
11905 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
11910 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11911 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11912 if (np
->tiers
.count(tierpool_id
) == 0 ||
11913 ntp
->tier_of
!= pool_id
||
11914 np
->read_tier
== tierpool_id
) {
11915 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11918 np
->tiers
.erase(tierpool_id
);
11920 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
11921 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11922 get_last_committed() + 1));
11924 } else if (prefix
== "osd tier set-overlay") {
11925 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11926 if (err
== -EAGAIN
)
11931 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11932 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11934 ss
<< "unrecognized pool '" << poolstr
<< "'";
11938 string overlaypoolstr
;
11939 cmd_getval(cct
, cmdmap
, "overlaypool", overlaypoolstr
);
11940 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
11941 if (overlaypool_id
< 0) {
11942 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
11946 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11948 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
11949 ceph_assert(overlay_p
);
11950 if (p
->tiers
.count(overlaypool_id
) == 0) {
11951 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
11955 if (p
->read_tier
== overlaypool_id
) {
11957 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
11960 if (p
->has_read_tier()) {
11961 ss
<< "pool '" << poolstr
<< "' has overlay '"
11962 << osdmap
.get_pool_name(p
->read_tier
)
11963 << "'; please remove-overlay first";
11969 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11970 np
->read_tier
= overlaypool_id
;
11971 np
->write_tier
= overlaypool_id
;
11972 np
->set_last_force_op_resend(pending_inc
.epoch
);
11973 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
11974 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
11975 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
11976 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
11977 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
11978 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11979 get_last_committed() + 1));
11981 } else if (prefix
== "osd tier remove-overlay" ||
11982 prefix
== "osd tier rm-overlay") {
11984 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
11985 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11987 ss
<< "unrecognized pool '" << poolstr
<< "'";
11991 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11993 if (!p
->has_read_tier()) {
11995 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
11999 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
12004 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12005 if (np
->has_read_tier()) {
12006 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
12007 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
12008 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12010 if (np
->has_write_tier()) {
12011 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
12012 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
12013 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12015 np
->clear_read_tier();
12016 np
->clear_write_tier();
12017 np
->set_last_force_op_resend(pending_inc
.epoch
);
12018 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12019 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12020 get_last_committed() + 1));
12022 } else if (prefix
== "osd tier cache-mode") {
12023 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12024 if (err
== -EAGAIN
)
12029 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
12030 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12032 ss
<< "unrecognized pool '" << poolstr
<< "'";
12036 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12038 if (!p
->is_tier()) {
12039 ss
<< "pool '" << poolstr
<< "' is not a tier";
12044 cmd_getval(cct
, cmdmap
, "mode", modestr
);
12045 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
12047 ss
<< "'" << modestr
<< "' is not a valid cache mode";
12053 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
12055 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12056 mode
!= pg_pool_t::CACHEMODE_NONE
&&
12057 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12058 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
12060 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
12061 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12066 // pool already has this cache-mode set and there are no pending changes
12067 if (p
->cache_mode
== mode
&&
12068 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
12069 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
12070 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
12071 << " to " << pg_pool_t::get_cache_mode_name(mode
);
12076 /* Mode description:
12078 * none: No cache-mode defined
12079 * forward: Forward all reads and writes to base pool
12080 * writeback: Cache writes, promote reads from base pool
12081 * readonly: Forward writes to base pool
12082 * readforward: Writes are in writeback mode, Reads are in forward mode
12083 * proxy: Proxy all reads and writes to base pool
12084 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12086 * Hence, these are the allowed transitions:
12089 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12090 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12091 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12092 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
12093 * writeback -> readforward || readproxy || forward || proxy
12097 // We check if the transition is valid against the current pool mode, as
12098 // it is the only committed state thus far. We will blantly squash
12099 // whatever mode is on the pending state.
12101 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
12102 (mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12103 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12104 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12105 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
12106 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
12107 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
12108 << "' pool; only '"
12109 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD
)
12111 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
12113 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD
)
12115 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
12120 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
12121 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12122 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12123 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12124 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
12126 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
12127 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12128 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12129 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12130 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
12132 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
12133 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12134 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
12135 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12136 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
12138 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
12139 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12140 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
12141 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12142 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
12144 const pool_stat_t
* pstats
=
12145 mon
->mgrstatmon()->get_pool_stat(pool_id
);
12147 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
12148 ss
<< "unable to set cache-mode '"
12149 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
12150 << "': dirty objects found";
12156 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12157 np
->cache_mode
= mode
;
12158 // set this both when moving to and from cache_mode NONE. this is to
12159 // capture legacy pools that were set up before this flag existed.
12160 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
12161 ss
<< "set cache-mode for pool '" << poolstr
12162 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
12163 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
12164 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
12165 ceph_assert(base_pool
);
12166 if (base_pool
->read_tier
== pool_id
||
12167 base_pool
->write_tier
== pool_id
)
12168 ss
<<" (WARNING: pool is still configured as read or write tier)";
12170 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12171 get_last_committed() + 1));
12173 } else if (prefix
== "osd tier add-cache") {
12174 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12175 if (err
== -EAGAIN
)
12180 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
12181 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12183 ss
<< "unrecognized pool '" << poolstr
<< "'";
12187 string tierpoolstr
;
12188 cmd_getval(cct
, cmdmap
, "tierpool", tierpoolstr
);
12189 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12190 if (tierpool_id
< 0) {
12191 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12195 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12197 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12200 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
12205 if (!cmd_getval(cct
, cmdmap
, "size", size
)) {
12206 ss
<< "unable to parse 'size' value '"
12207 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
12211 // make sure new tier is empty
12212 const pool_stat_t
*pstats
=
12213 mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
12214 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
12215 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
12219 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
12220 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
12222 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
12226 HitSet::Params hsp
;
12227 auto& cache_hit_set_type
=
12228 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
12229 if (cache_hit_set_type
== "bloom") {
12230 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
12231 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
12232 hsp
= HitSet::Params(bsp
);
12233 } else if (cache_hit_set_type
== "explicit_hash") {
12234 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
12235 } else if (cache_hit_set_type
== "explicit_object") {
12236 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
12238 ss
<< "osd tier cache default hit set type '"
12239 << cache_hit_set_type
<< "' is not a known type";
12244 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12245 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12246 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
12247 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12250 np
->tiers
.insert(tierpool_id
);
12251 np
->read_tier
= np
->write_tier
= tierpool_id
;
12252 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
12253 np
->set_last_force_op_resend(pending_inc
.epoch
);
12254 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
12255 ntp
->tier_of
= pool_id
;
12256 ntp
->cache_mode
= mode
;
12257 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
12258 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
12259 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
12260 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
12261 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
12262 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
12263 ntp
->hit_set_params
= hsp
;
12264 ntp
->target_max_bytes
= size
;
12265 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
12266 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12267 get_last_committed() + 1));
12269 } else if (prefix
== "osd pool set-quota") {
12271 cmd_getval(cct
, cmdmap
, "pool", poolstr
);
12272 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12274 ss
<< "unrecognized pool '" << poolstr
<< "'";
12280 cmd_getval(cct
, cmdmap
, "field", field
);
12281 if (field
!= "max_objects" && field
!= "max_bytes") {
12282 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
12287 // val could contain unit designations, so we treat as a string
12289 cmd_getval(cct
, cmdmap
, "val", val
);
12292 if (field
== "max_objects") {
12293 value
= strict_sistrtoll(val
.c_str(), &tss
);
12294 } else if (field
== "max_bytes") {
12295 value
= strict_iecstrtoll(val
.c_str(), &tss
);
12297 ceph_abort_msg("unrecognized option");
12299 if (!tss
.empty()) {
12300 ss
<< "error parsing value '" << val
<< "': " << tss
;
12305 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
12306 if (field
== "max_objects") {
12307 pi
->quota_max_objects
= value
;
12308 } else if (field
== "max_bytes") {
12309 pi
->quota_max_bytes
= value
;
12311 ceph_abort_msg("unrecognized option");
12313 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
12315 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12316 get_last_committed() + 1));
12318 } else if (prefix
== "osd pool application enable" ||
12319 prefix
== "osd pool application disable" ||
12320 prefix
== "osd pool application set" ||
12321 prefix
== "osd pool application rm") {
12322 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
12323 if (err
== -EAGAIN
) {
12325 } else if (err
< 0) {
12330 } else if (prefix
== "osd force-create-pg") {
12333 cmd_getval(cct
, cmdmap
, "pgid", pgidstr
);
12334 if (!pgid
.parse(pgidstr
.c_str())) {
12335 ss
<< "invalid pgid '" << pgidstr
<< "'";
12339 if (!osdmap
.pg_exists(pgid
)) {
12340 ss
<< "pg " << pgid
<< " should not exist";
12345 cmd_getval(cct
, cmdmap
, "yes_i_really_mean_it", sure
);
12347 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
12348 << "that the cluster will give up ever trying to recover the lost data. Do this "
12349 << "only if you are certain that all copies of the PG are in fact lost and you are "
12350 << "willing to accept that the data is permanently destroyed. Pass "
12351 << "--yes-i-really-mean-it to proceed.";
12357 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
12358 auto emplaced
= creating_pgs
.pgs
.emplace(pgid
,
12359 make_pair(osdmap
.get_epoch(),
12360 ceph_clock_now()));
12361 creating_now
= emplaced
.second
;
12363 if (creating_now
) {
12364 ss
<< "pg " << pgidstr
<< " now creating, ok";
12365 // set the pool's CREATING flag so that (1) the osd won't ignore our
12366 // create message and (2) we won't propose any future pg_num changes
12367 // until after the PG has been instantiated.
12368 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
12369 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
12371 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
12375 ss
<< "pg " << pgid
<< " already creating";
12385 if (err
< 0 && rs
.length() == 0)
12386 rs
= cpp_strerror(err
);
12387 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
12392 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12393 get_last_committed() + 1));
12397 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12401 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
12403 op
->mark_osdmon_event(__func__
);
12405 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12406 MonSession
*session
= op
->get_session();
12408 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
12413 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12414 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12416 const std::string
* pool_name
= nullptr;
12417 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
12418 if (pg_pool
!= nullptr) {
12419 pool_name
= &osdmap
.get_pool_name(m
->pool
);
12422 if (!is_unmanaged_snap_op_permitted(cct
, mon
->key_server
,
12423 session
->entity_name
, session
->caps
,
12424 session
->get_peer_socket_addr(),
12426 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
12427 << "privileges. message: " << *m
<< std::endl
12428 << "caps: " << session
->caps
<< dendl
;
12429 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
12435 if (!session
->is_capable("osd", MON_CAP_W
)) {
12436 dout(0) << "got pool op from entity with insufficient privileges. "
12437 << "message: " << *m
<< std::endl
12438 << "caps: " << session
->caps
<< dendl
;
12439 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
12448 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
12450 op
->mark_osdmon_event(__func__
);
12451 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12453 if (enforce_pool_op_caps(op
)) {
12457 if (m
->fsid
!= mon
->monmap
->fsid
) {
12458 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
12459 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
12460 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12464 if (m
->op
== POOL_OP_CREATE
)
12465 return preprocess_pool_op_create(op
);
12467 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
12468 if (p
== nullptr) {
12469 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
12470 if (m
->op
== POOL_OP_DELETE
) {
12471 _pool_op_reply(op
, 0, osdmap
.get_epoch());
12473 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
12478 // check if the snap and snapname exist
12479 bool snap_exists
= false;
12480 if (p
->snap_exists(m
->name
.c_str()))
12481 snap_exists
= true;
12484 case POOL_OP_CREATE_SNAP
:
12485 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
12486 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12490 _pool_op_reply(op
, 0, osdmap
.get_epoch());
12494 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12495 if (p
->is_pool_snaps_mode()) {
12496 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12500 case POOL_OP_DELETE_SNAP
:
12501 if (p
->is_unmanaged_snaps_mode()) {
12502 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12505 if (!snap_exists
) {
12506 _pool_op_reply(op
, 0, osdmap
.get_epoch());
12510 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12511 if (p
->is_pool_snaps_mode()) {
12512 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12515 if (p
->is_removed_snap(m
->snapid
)) {
12516 _pool_op_reply(op
, 0, osdmap
.get_epoch());
12520 case POOL_OP_DELETE
:
12521 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
12522 _pool_op_reply(op
, 0, osdmap
.get_epoch());
12526 case POOL_OP_AUID_CHANGE
:
12536 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
12538 op
->mark_osdmon_event(__func__
);
12539 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12540 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
12542 _pool_op_reply(op
, 0, osdmap
.get_epoch());
12549 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
12551 op
->mark_osdmon_event(__func__
);
12552 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12553 dout(10) << "prepare_pool_op " << *m
<< dendl
;
12554 if (m
->op
== POOL_OP_CREATE
) {
12555 return prepare_pool_op_create(op
);
12556 } else if (m
->op
== POOL_OP_DELETE
) {
12557 return prepare_pool_op_delete(op
);
12561 bool changed
= false;
12563 if (!osdmap
.have_pg_pool(m
->pool
)) {
12564 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
12568 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
12571 case POOL_OP_CREATE_SNAP
:
12572 if (pool
->is_tier()) {
12574 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
12576 } // else, fall through
12577 case POOL_OP_DELETE_SNAP
:
12578 if (!pool
->is_unmanaged_snaps_mode()) {
12579 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
12580 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
12581 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
12589 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
12592 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12593 // we won't allow removal of an unmanaged snapshot from a pool
12594 // not in unmanaged snaps mode.
12595 if (!pool
->is_unmanaged_snaps_mode()) {
12596 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
12600 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12601 // but we will allow creating an unmanaged snapshot on any pool
12602 // as long as it is not in 'pool' snaps mode.
12603 if (pool
->is_pool_snaps_mode()) {
12604 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
12609 // projected pool info
12611 if (pending_inc
.new_pools
.count(m
->pool
))
12612 pp
= pending_inc
.new_pools
[m
->pool
];
12614 pp
= *osdmap
.get_pg_pool(m
->pool
);
12616 bufferlist reply_data
;
12618 // pool snaps vs unmanaged snaps are mutually exclusive
12620 case POOL_OP_CREATE_SNAP
:
12621 case POOL_OP_DELETE_SNAP
:
12622 if (pp
.is_unmanaged_snaps_mode()) {
12628 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12629 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12630 if (pp
.is_pool_snaps_mode()) {
12637 case POOL_OP_CREATE_SNAP
:
12638 if (!pp
.snap_exists(m
->name
.c_str())) {
12639 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
12640 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
12641 << " seq " << pp
.get_snap_epoch() << dendl
;
12646 case POOL_OP_DELETE_SNAP
:
12648 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
12651 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
12657 case POOL_OP_CREATE_UNMANAGED_SNAP
:
12660 pp
.add_unmanaged_snap(snapid
);
12661 encode(snapid
, reply_data
);
12666 case POOL_OP_DELETE_UNMANAGED_SNAP
:
12667 if (!pp
.is_removed_snap(m
->snapid
)) {
12668 if (m
->snapid
> pp
.get_snap_seq()) {
12669 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
12672 pp
.remove_unmanaged_snap(m
->snapid
);
12673 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
12678 case POOL_OP_AUID_CHANGE
:
12679 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
12688 pp
.set_snap_epoch(pending_inc
.epoch
);
12689 pending_inc
.new_pools
[m
->pool
] = pp
;
12693 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
12697 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
12699 op
->mark_osdmon_event(__func__
);
12700 int err
= prepare_new_pool(op
);
12701 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
12705 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
12708 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
12710 // If the Pool is in use by CephFS, refuse to delete it
12711 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12712 if (pending_fsmap
.pool_in_use(pool_id
)) {
12713 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
12717 if (pool
.tier_of
>= 0) {
12718 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
12719 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
12722 if (!pool
.tiers
.empty()) {
12723 *ss
<< "pool '" << poolstr
<< "' has tiers";
12724 for(auto tier
: pool
.tiers
) {
12725 *ss
<< " " << osdmap
.get_pool_name(tier
);
12730 if (!g_conf()->mon_allow_pool_delete
) {
12731 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
12735 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
12736 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
12740 *ss
<< "pool '" << poolstr
<< "' removed";
12745 * Check if it is safe to add a tier to a base pool
12748 * True if the operation should proceed, false if we should abort here
12749 * (abort doesn't necessarily mean error, could be idempotency)
12751 bool OSDMonitor::_check_become_tier(
12752 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
12753 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
12757 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
12758 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
12760 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12761 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
12762 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
12767 if (base_pool
->tiers
.count(tier_pool_id
)) {
12768 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
12770 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
12771 << base_pool_name
<< "'";
12775 if (base_pool
->is_tier()) {
12776 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
12777 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
12778 << "multiple tiers are not yet supported.";
12783 if (tier_pool
->has_tiers()) {
12784 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
12785 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
12786 it
!= tier_pool
->tiers
.end(); ++it
)
12787 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
12788 *ss
<< " multiple tiers are not yet supported.";
12793 if (tier_pool
->is_tier()) {
12794 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
12795 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
12806 * Check if it is safe to remove a tier from this base pool
12809 * True if the operation should proceed, false if we should abort here
12810 * (abort doesn't necessarily mean error, could be idempotency)
12812 bool OSDMonitor::_check_remove_tier(
12813 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
12814 const pg_pool_t
*tier_pool
,
12815 int *err
, ostream
*ss
) const
12817 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
12819 // Apply CephFS-specific checks
12820 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12821 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
12822 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
12823 // If the underlying pool is erasure coded and does not allow EC
12824 // overwrites, we can't permit the removal of the replicated tier that
12825 // CephFS relies on to access it
12826 *ss
<< "pool '" << base_pool_name
<<
12827 "' does not allow EC overwrites and is in use by CephFS"
12833 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
12834 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
12835 "tier is still in use as a writeback cache. Change the cache "
12836 "mode and flush the cache before removing it";
12846 int OSDMonitor::_prepare_remove_pool(
12847 int64_t pool
, ostream
*ss
, bool no_fake
)
12849 dout(10) << __func__
<< " " << pool
<< dendl
;
12850 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12851 int r
= _check_remove_pool(pool
, *p
, ss
);
12855 auto new_pool
= pending_inc
.new_pools
.find(pool
);
12856 if (new_pool
!= pending_inc
.new_pools
.end()) {
12857 // if there is a problem with the pending info, wait and retry
12859 const auto& p
= new_pool
->second
;
12860 int r
= _check_remove_pool(pool
, p
, ss
);
12865 if (pending_inc
.old_pools
.count(pool
)) {
12866 dout(10) << __func__
<< " " << pool
<< " already pending removal"
12871 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
12872 string old_name
= osdmap
.get_pool_name(pool
);
12873 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
12874 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
12875 << old_name
<< " -> " << new_name
<< dendl
;
12876 pending_inc
.new_pool_names
[pool
] = new_name
;
12881 pending_inc
.old_pools
.insert(pool
);
12883 // remove any pg_temp mappings for this pool
12884 for (auto p
= osdmap
.pg_temp
->begin();
12885 p
!= osdmap
.pg_temp
->end();
12887 if (p
->first
.pool() == pool
) {
12888 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
12889 << p
->first
<< dendl
;
12890 pending_inc
.new_pg_temp
[p
->first
].clear();
12893 // remove any primary_temp mappings for this pool
12894 for (auto p
= osdmap
.primary_temp
->begin();
12895 p
!= osdmap
.primary_temp
->end();
12897 if (p
->first
.pool() == pool
) {
12898 dout(10) << __func__
<< " " << pool
12899 << " removing obsolete primary_temp" << p
->first
<< dendl
;
12900 pending_inc
.new_primary_temp
[p
->first
] = -1;
12903 // remove any pg_upmap mappings for this pool
12904 for (auto& p
: osdmap
.pg_upmap
) {
12905 if (p
.first
.pool() == pool
) {
12906 dout(10) << __func__
<< " " << pool
12907 << " removing obsolete pg_upmap "
12908 << p
.first
<< dendl
;
12909 pending_inc
.old_pg_upmap
.insert(p
.first
);
12912 // remove any pending pg_upmap mappings for this pool
12914 auto it
= pending_inc
.new_pg_upmap
.begin();
12915 while (it
!= pending_inc
.new_pg_upmap
.end()) {
12916 if (it
->first
.pool() == pool
) {
12917 dout(10) << __func__
<< " " << pool
12918 << " removing pending pg_upmap "
12919 << it
->first
<< dendl
;
12920 it
= pending_inc
.new_pg_upmap
.erase(it
);
12926 // remove any pg_upmap_items mappings for this pool
12927 for (auto& p
: osdmap
.pg_upmap_items
) {
12928 if (p
.first
.pool() == pool
) {
12929 dout(10) << __func__
<< " " << pool
12930 << " removing obsolete pg_upmap_items " << p
.first
12932 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
12935 // remove any pending pg_upmap mappings for this pool
12937 auto it
= pending_inc
.new_pg_upmap_items
.begin();
12938 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
12939 if (it
->first
.pool() == pool
) {
12940 dout(10) << __func__
<< " " << pool
12941 << " removing pending pg_upmap_items "
12942 << it
->first
<< dendl
;
12943 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
12950 // remove any choose_args for this pool
12951 CrushWrapper newcrush
;
12952 _get_pending_crush(newcrush
);
12953 if (newcrush
.have_choose_args(pool
)) {
12954 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
12955 newcrush
.rm_choose_args(pool
);
12956 pending_inc
.crush
.clear();
12957 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
12962 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
12964 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
12965 if (pending_inc
.old_pools
.count(pool
)) {
12966 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
12969 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
12970 p
!= pending_inc
.new_pool_names
.end();
12972 if (p
->second
== newname
&& p
->first
!= pool
) {
12977 pending_inc
.new_pool_names
[pool
] = newname
;
12981 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
12983 op
->mark_osdmon_event(__func__
);
12984 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12986 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
12987 if (ret
== -EAGAIN
) {
12988 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12992 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
12993 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
12994 pending_inc
.epoch
));
12998 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
12999 int ret
, epoch_t epoch
, bufferlist
*blp
)
13001 op
->mark_osdmon_event(__func__
);
13002 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
13003 dout(20) << "_pool_op_reply " << ret
<< dendl
;
13004 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
13005 ret
, epoch
, get_last_committed(), blp
);
13006 mon
->send_reply(op
, reply
);
13009 void OSDMonitor::convert_pool_priorities(void)
13011 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
13012 int64_t max_prio
= 0;
13013 int64_t min_prio
= 0;
13014 for (const auto &i
: osdmap
.get_pools()) {
13015 const auto &pool
= i
.second
;
13017 if (pool
.opts
.is_set(key
)) {
13019 pool
.opts
.get(key
, &prio
);
13020 if (prio
> max_prio
)
13022 if (prio
< min_prio
)
13026 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
13027 dout(20) << __func__
<< " nothing to fix" << dendl
;
13030 // Current pool priorities exceeds new maximum
13031 for (const auto &i
: osdmap
.get_pools()) {
13032 const auto pool_id
= i
.first
;
13033 pg_pool_t pool
= i
.second
;
13036 pool
.opts
.get(key
, &prio
);
13039 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
13040 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13041 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
13042 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
13043 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13044 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
13049 pool
.opts
.unset(key
);
13051 pool
.opts
.set(key
, static_cast<int64_t>(n
));
13053 dout(10) << __func__
<< " pool " << pool_id
13054 << " recovery_priority adjusted "
13055 << prio
<< " to " << n
<< dendl
;
13056 pool
.last_change
= pending_inc
.epoch
;
13057 pending_inc
.new_pools
[pool_id
] = pool
;