// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
20 #include <boost/algorithm/string.hpp>
24 #include "mon/OSDMonitor.h"
25 #include "mon/Monitor.h"
26 #include "mon/MDSMonitor.h"
27 #include "mon/PGMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreated.h"
51 #include "messages/MOSDPGTemp.h"
52 #include "messages/MMonCommand.h"
53 #include "messages/MRemoveSnaps.h"
54 #include "messages/MOSDScrub.h"
55 #include "messages/MRoute.h"
57 #include "common/TextTable.h"
58 #include "common/Timer.h"
59 #include "common/ceph_argparse.h"
60 #include "common/perf_counters.h"
61 #include "common/strtol.h"
63 #include "common/config.h"
64 #include "common/errno.h"
66 #include "erasure-code/ErasureCodePlugin.h"
67 #include "compressor/Compressor.h"
68 #include "common/Checksummer.h"
70 #include "include/compat.h"
71 #include "include/assert.h"
72 #include "include/stringify.h"
73 #include "include/util.h"
74 #include "common/cmdparse.h"
75 #include "include/str_list.h"
76 #include "include/str_map.h"
77 #include "include/scope_guard.h"
79 #include "auth/cephx/CephxKeyServer.h"
80 #include "osd/OSDCap.h"
82 #include "json_spirit/json_spirit_reader.h"
84 #include <boost/algorithm/string/predicate.hpp>
86 #define dout_subsys ceph_subsys_mon
87 static const string
OSD_PG_CREATING_PREFIX("osd_pg_creating");
88 static const string
OSD_METADATA_PREFIX("osd_metadata");
92 const uint32_t MAX_POOL_APPLICATIONS
= 4;
93 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
94 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
96 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
97 // Note: this doesn't include support for the application tag match
98 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
99 auto& match
= grant
.match
;
100 if (match
.is_match_all()) {
102 } else if (pool_name
!= nullptr && match
.auid
< 0 &&
103 !match
.pool_namespace
.pool_name
.empty() &&
104 match
.pool_namespace
.pool_name
== *pool_name
) {
111 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
112 const KeyServer
& key_server
,
113 const EntityName
& entity_name
,
114 const MonCap
& mon_caps
,
115 const std::string
* pool_name
)
117 typedef std::map
<std::string
, std::string
> CommandArgs
;
119 if (mon_caps
.is_capable(cct
, CEPH_ENTITY_TYPE_MON
,
121 "osd pool op unmanaged-snap",
122 (pool_name
== nullptr ?
123 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
124 CommandArgs
{{"poolname", *pool_name
}}),
125 false, true, false)) {
129 AuthCapsInfo caps_info
;
130 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
132 dout(10) << "unable to locate OSD cap data for " << entity_name
133 << " in auth db" << dendl
;
138 if (caps_info
.caps
.length() > 0) {
139 auto p
= caps_info
.caps
.begin();
142 } catch (const buffer::error
&err
) {
143 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
150 if (!osd_cap
.parse(caps_str
, nullptr)) {
151 dout(10) << "unable to parse OSD cap data for " << entity_name
152 << " in auth db" << dendl
;
156 // if the entity has write permissions in one or all pools, permit
157 // usage of unmanaged-snapshots
158 if (osd_cap
.allow_all()) {
162 for (auto& grant
: osd_cap
.grants
) {
163 if (grant
.profile
.is_valid()) {
164 for (auto& profile_grant
: grant
.profile_grants
) {
165 if (is_osd_writable(profile_grant
, pool_name
)) {
169 } else if (is_osd_writable(grant
, pool_name
)) {
177 } // anonymous namespace
179 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
181 if (epoch_by_pg
.size() <= ps
) {
182 epoch_by_pg
.resize(ps
+ 1, 0);
184 const auto old_lec
= epoch_by_pg
[ps
];
185 if (old_lec
>= last_epoch_clean
) {
189 epoch_by_pg
[ps
] = last_epoch_clean
;
190 if (last_epoch_clean
< floor
) {
191 floor
= last_epoch_clean
;
192 } else if (last_epoch_clean
> floor
) {
193 if (old_lec
== floor
) {
194 // probably should increase floor?
195 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
196 std::end(epoch_by_pg
));
200 if (ps
!= next_missing
) {
203 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
204 if (epoch_by_pg
[next_missing
] == 0) {
210 void LastEpochClean::remove_pool(uint64_t pool
)
212 report_by_pool
.erase(pool
);
215 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
217 auto& lec
= report_by_pool
[pg
.pool()];
218 return lec
.report(pg
.ps(), last_epoch_clean
);
221 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
223 auto floor
= latest
.get_epoch();
224 for (auto& pool
: latest
.get_pools()) {
225 auto reported
= report_by_pool
.find(pool
.first
);
226 if (reported
== report_by_pool
.end()) {
229 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
232 if (reported
->second
.floor
< floor
) {
233 floor
= reported
->second
.floor
;
240 struct C_UpdateCreatingPGs
: public Context
{
244 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
245 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
246 void finish(int r
) override
{
248 utime_t end
= ceph_clock_now();
249 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
250 << (end
- start
) << " seconds" << dendl
;
251 osdmon
->update_creating_pgs();
252 osdmon
->check_pg_creates_subs();
258 #define dout_prefix _prefix(_dout, mon, osdmap)
259 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
260 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
261 << "(" << mon
->get_state_name()
262 << ").osd e" << osdmap
.get_epoch() << " ";
265 OSDMonitor::OSDMonitor(
269 const string
& service_name
)
270 : PaxosService(mn
, p
, service_name
),
272 inc_osd_cache(g_conf
->mon_osd_cache_size
),
273 full_osd_cache(g_conf
->mon_osd_cache_size
),
274 mapper(mn
->cct
, &mn
->cpu_tp
),
275 op_tracker(cct
, true, 1)
278 bool OSDMonitor::_have_pending_crush()
280 return pending_inc
.crush
.length() > 0;
283 CrushWrapper
&OSDMonitor::_get_stable_crush()
285 return *osdmap
.crush
;
288 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
291 if (pending_inc
.crush
.length())
292 bl
= pending_inc
.crush
;
294 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
296 bufferlist::iterator p
= bl
.begin();
300 void OSDMonitor::create_initial()
302 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
307 mon
->store
->get("mkfs", "osdmap", bl
);
311 newmap
.set_fsid(mon
->monmap
->fsid
);
313 newmap
.build_simple(g_ceph_context
, 0, mon
->monmap
->fsid
, 0);
316 newmap
.created
= newmap
.modified
= ceph_clock_now();
318 // new clusters should sort bitwise by default.
319 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
321 // new cluster should require latest by default
322 if (g_conf
->mon_debug_no_require_luminous
) {
323 newmap
.require_osd_release
= CEPH_RELEASE_KRAKEN
;
324 derr
<< __func__
<< " mon_debug_no_require_luminous=true" << dendl
;
326 newmap
.require_osd_release
= CEPH_RELEASE_LUMINOUS
;
328 CEPH_OSDMAP_RECOVERY_DELETES
|
329 CEPH_OSDMAP_PURGED_SNAPDIRS
;
330 newmap
.full_ratio
= g_conf
->mon_osd_full_ratio
;
331 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
332 newmap
.backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
333 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
334 newmap
.nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
335 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
336 int r
= ceph_release_from_name(
337 g_conf
->mon_osd_initial_require_min_compat_client
.c_str());
339 assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
341 newmap
.require_min_compat_client
= r
;
344 // encode into pending incremental
345 uint64_t features
= newmap
.get_encoding_features();
346 newmap
.encode(pending_inc
.fullmap
,
347 features
| CEPH_FEATURE_RESERVED
);
348 pending_inc
.full_crc
= newmap
.get_crc();
349 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
352 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
)
354 s
.insert(service_name
);
355 s
.insert(OSD_PG_CREATING_PREFIX
);
356 s
.insert(OSD_METADATA_PREFIX
);
359 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
361 version_t version
= get_last_committed();
362 if (version
== osdmap
.epoch
)
364 assert(version
> osdmap
.epoch
);
366 dout(15) << "update_from_paxos paxos e " << version
367 << ", my e " << osdmap
.epoch
<< dendl
;
370 if (!mapping_job
->is_done()) {
371 dout(1) << __func__
<< " mapping job "
372 << mapping_job
.get() << " did not complete, "
373 << mapping_job
->shards
<< " left, canceling" << dendl
;
374 mapping_job
->abort();
382 * We will possibly have a stashed latest that *we* wrote, and we will
383 * always be sure to have the oldest full map in the first..last range
384 * due to encode_trim_extra(), which includes the oldest full map in the trim
387 * encode_trim_extra() does not however write the full map's
388 * version to 'full_latest'. This is only done when we are building the
389 * full maps from the incremental versions. But don't panic! We make sure
390 * that the following conditions find whichever full map version is newer.
392 version_t latest_full
= get_version_latest_full();
393 if (latest_full
== 0 && get_first_committed() > 1)
394 latest_full
= get_first_committed();
396 if (get_first_committed() > 1 &&
397 latest_full
< get_first_committed()) {
398 // the monitor could be just sync'ed with its peer, and the latest_full key
399 // is not encoded in the paxos commits in encode_pending(), so we need to
400 // make sure we get it pointing to a proper version.
401 version_t lc
= get_last_committed();
402 version_t fc
= get_first_committed();
404 dout(10) << __func__
<< " looking for valid full map in interval"
405 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
408 for (version_t v
= lc
; v
>= fc
; v
--) {
409 string full_key
= "full_" + stringify(v
);
410 if (mon
->store
->exists(get_service_name(), full_key
)) {
411 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
417 assert(latest_full
> 0);
418 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
419 put_version_latest_full(t
, latest_full
);
420 mon
->store
->apply_transaction(t
);
421 dout(10) << __func__
<< " updated the on-disk full map version to "
422 << latest_full
<< dendl
;
425 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
426 bufferlist latest_bl
;
427 get_version_full(latest_full
, latest_bl
);
428 assert(latest_bl
.length() != 0);
429 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
430 osdmap
.decode(latest_bl
);
433 if (mon
->monmap
->get_required_features().contains_all(
434 ceph::features::mon::FEATURE_LUMINOUS
)) {
436 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
438 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
439 creating_pgs
.decode(p
);
440 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
441 << creating_pgs
.last_scan_epoch
442 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
444 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
449 // make sure we're using the right pg service.. remove me post-luminous!
450 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
451 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
452 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
454 dout(10) << __func__
<< " pgservice is pg" << dendl
;
455 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
458 // walk through incrementals
459 MonitorDBStore::TransactionRef t
;
461 while (version
> osdmap
.epoch
) {
463 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
465 assert(inc_bl
.length());
467 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
469 OSDMap::Incremental
inc(inc_bl
);
470 err
= osdmap
.apply_incremental(inc
);
474 t
.reset(new MonitorDBStore::Transaction
);
476 // Write out the full map for all past epochs. Encode the full
477 // map with the same features as the incremental. If we don't
478 // know, use the quorum features. If we don't know those either,
479 // encode with all features.
480 uint64_t f
= inc
.encode_features
;
482 f
= mon
->get_quorum_con_features();
486 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
487 tx_size
+= full_bl
.length();
489 bufferlist orig_full_bl
;
490 get_version_full(osdmap
.epoch
, orig_full_bl
);
491 if (orig_full_bl
.length()) {
492 // the primary provided the full map
493 assert(inc
.have_crc
);
494 if (inc
.full_crc
!= osdmap
.crc
) {
495 // This will happen if the mons were running mixed versions in
496 // the past or some other circumstance made the full encoded
497 // maps divergent. Reloading here will bring us back into
498 // sync with the primary for this and all future maps. OSDs
499 // will also be brought back into sync when they discover the
500 // crc mismatch and request a full map from a mon.
501 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
504 osdmap
.decode(orig_full_bl
);
507 assert(!inc
.have_crc
);
508 put_version_full(t
, osdmap
.epoch
, full_bl
);
510 put_version_latest_full(t
, osdmap
.epoch
);
513 dout(1) << osdmap
<< dendl
;
515 if (osdmap
.epoch
== 1) {
516 t
->erase("mkfs", "osdmap");
519 // make sure we're using the right pg service.. remove me post-luminous!
520 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
521 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
522 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
524 dout(10) << __func__
<< " pgservice is pg" << dendl
;
525 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
528 if (tx_size
> g_conf
->mon_sync_max_payload_size
*2) {
529 mon
->store
->apply_transaction(t
);
530 t
= MonitorDBStore::TransactionRef();
533 if (mon
->monmap
->get_required_features().contains_all(
534 ceph::features::mon::FEATURE_LUMINOUS
)) {
535 for (const auto &osd_state
: inc
.new_state
) {
536 if (osd_state
.second
& CEPH_OSD_UP
) {
537 // could be marked up *or* down, but we're too lazy to check which
538 last_osd_report
.erase(osd_state
.first
);
540 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
541 // could be created *or* destroyed, but we can safely drop it
542 osd_epochs
.erase(osd_state
.first
);
549 mon
->store
->apply_transaction(t
);
552 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
553 if (osdmap
.is_out(o
))
555 auto found
= down_pending_out
.find(o
);
556 if (osdmap
.is_down(o
)) {
557 // populate down -> out map
558 if (found
== down_pending_out
.end()) {
559 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
560 down_pending_out
[o
] = ceph_clock_now();
563 if (found
!= down_pending_out
.end()) {
564 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
565 down_pending_out
.erase(found
);
569 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
571 if (mon
->is_leader()) {
572 // kick pgmon, make sure it's seen the latest map
573 mon
->pgmon()->check_osd_map(osdmap
.epoch
);
577 check_pg_creates_subs();
579 share_map_with_random_osd();
584 // make sure our feature bits reflect the latest map
585 update_msgr_features();
587 if (!mon
->is_leader()) {
588 // will be called by on_active() on the leader, avoid doing so twice
593 void OSDMonitor::start_mapping()
595 // initiate mapping job
597 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
599 mapping_job
->abort();
601 if (!osdmap
.get_pools().empty()) {
602 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
603 mapping_job
= mapping
.start_update(osdmap
, mapper
,
604 g_conf
->mon_osd_mapping_pgs_per_chunk
);
605 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
606 << " at " << fin
->start
<< dendl
;
607 mapping_job
->set_finish_event(fin
);
609 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
610 mapping_job
= nullptr;
614 void OSDMonitor::update_msgr_features()
617 types
.insert((int)entity_name_t::TYPE_OSD
);
618 types
.insert((int)entity_name_t::TYPE_CLIENT
);
619 types
.insert((int)entity_name_t::TYPE_MDS
);
620 types
.insert((int)entity_name_t::TYPE_MON
);
621 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
623 uint64_t features
= osdmap
.get_features(*q
, &mask
);
624 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
625 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
626 Messenger::Policy p
= mon
->messenger
->get_policy(*q
);
627 p
.features_required
= (p
.features_required
& ~mask
) | features
;
628 mon
->messenger
->set_policy(*q
, p
);
633 void OSDMonitor::on_active()
637 if (mon
->is_leader()) {
638 mon
->clog
->debug() << "osdmap " << osdmap
;
640 list
<MonOpRequestRef
> ls
;
641 take_all_failures(ls
);
642 while (!ls
.empty()) {
643 MonOpRequestRef op
= ls
.front();
644 op
->mark_osdmon_event(__func__
);
652 void OSDMonitor::on_restart()
654 last_osd_report
.clear();
657 void OSDMonitor::on_shutdown()
659 dout(10) << __func__
<< dendl
;
661 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
663 mapping_job
->abort();
666 // discard failure info, waiters
667 list
<MonOpRequestRef
> ls
;
668 take_all_failures(ls
);
672 void OSDMonitor::update_logger()
674 dout(10) << "update_logger" << dendl
;
676 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
677 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
678 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
679 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
682 void OSDMonitor::create_pending()
684 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
685 pending_inc
.fsid
= mon
->monmap
->fsid
;
687 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
689 // clean up pg_temp, primary_temp
690 OSDMap::clean_temps(g_ceph_context
, osdmap
, &pending_inc
);
691 dout(10) << "create_pending did clean_temps" << dendl
;
693 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
694 // instead of osd_backfill_full_ratio config
695 if (osdmap
.backfillfull_ratio
<= 0) {
696 pending_inc
.new_backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
697 if (pending_inc
.new_backfillfull_ratio
> 1.0)
698 pending_inc
.new_backfillfull_ratio
/= 100;
699 dout(1) << __func__
<< " setting backfillfull_ratio = "
700 << pending_inc
.new_backfillfull_ratio
<< dendl
;
702 if (osdmap
.get_epoch() > 0 &&
703 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
704 // transition full ratios from PGMap to OSDMap (on upgrade)
705 float full_ratio
= mon
->pgservice
->get_full_ratio();
706 float nearfull_ratio
= mon
->pgservice
->get_nearfull_ratio();
707 if (osdmap
.full_ratio
!= full_ratio
) {
708 dout(10) << __func__
<< " full_ratio " << osdmap
.full_ratio
709 << " -> " << full_ratio
<< " (from pgmap)" << dendl
;
710 pending_inc
.new_full_ratio
= full_ratio
;
712 if (osdmap
.nearfull_ratio
!= nearfull_ratio
) {
713 dout(10) << __func__
<< " nearfull_ratio " << osdmap
.nearfull_ratio
714 << " -> " << nearfull_ratio
<< " (from pgmap)" << dendl
;
715 pending_inc
.new_nearfull_ratio
= nearfull_ratio
;
718 // safety check (this shouldn't really happen)
719 if (osdmap
.full_ratio
<= 0) {
720 pending_inc
.new_full_ratio
= g_conf
->mon_osd_full_ratio
;
721 if (pending_inc
.new_full_ratio
> 1.0)
722 pending_inc
.new_full_ratio
/= 100;
723 dout(1) << __func__
<< " setting full_ratio = "
724 << pending_inc
.new_full_ratio
<< dendl
;
726 if (osdmap
.nearfull_ratio
<= 0) {
727 pending_inc
.new_nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
728 if (pending_inc
.new_nearfull_ratio
> 1.0)
729 pending_inc
.new_nearfull_ratio
/= 100;
730 dout(1) << __func__
<< " setting nearfull_ratio = "
731 << pending_inc
.new_nearfull_ratio
<< dendl
;
735 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
737 if (osdmap
.crush
->has_legacy_rule_ids()) {
738 CrushWrapper newcrush
;
739 _get_pending_crush(newcrush
);
741 // First, for all pools, work out which rule they really used
742 // by resolving ruleset to rule.
743 for (const auto &i
: osdmap
.get_pools()) {
744 const auto pool_id
= i
.first
;
745 const auto &pool
= i
.second
;
746 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
747 pool
.type
, pool
.size
);
749 dout(1) << __func__
<< " rewriting pool "
750 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
751 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
752 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
753 pending_inc
.new_pools
[pool_id
] = pool
;
755 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
758 // Now, go ahead and renumber all the rules so that their
759 // rule_id field corresponds to their position in the array
760 auto old_to_new
= newcrush
.renumber_rules();
761 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
762 for (const auto &i
: old_to_new
) {
763 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
765 pending_inc
.crush
.clear();
766 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
771 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
772 const OSDMap
& nextmap
)
774 dout(10) << __func__
<< dendl
;
775 creating_pgs_t pending_creatings
;
777 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
778 pending_creatings
= creating_pgs
;
780 // check for new or old pools
781 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
782 if (osdmap
.get_epoch() &&
783 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
785 mon
->pgservice
->maybe_add_creating_pgs(creating_pgs
.last_scan_epoch
,
788 dout(7) << __func__
<< " " << added
<< " pgs added from pgmap" << dendl
;
791 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
795 queued
+= scan_for_creating_pgs(inc
.new_pools
,
799 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
800 for (auto deleted_pool
: inc
.old_pools
) {
801 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
802 dout(10) << __func__
<< " " << removed
803 << " pg removed because containing pool deleted: "
804 << deleted_pool
<< dendl
;
805 last_epoch_clean
.remove_pool(deleted_pool
);
807 // pgmon updates its creating_pgs in check_osd_map() which is called by
808 // on_active() and check_osd_map() could be delayed if lease expires, so its
809 // creating_pgs could be stale in comparison with the one of osdmon. let's
810 // trim them here. otherwise, they will be added back after being erased.
811 unsigned removed
= 0;
812 for (auto& pg
: pending_created_pgs
) {
813 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
814 pending_creatings
.created_pools
.insert(pg
.pool());
815 removed
+= pending_creatings
.pgs
.erase(pg
);
817 pending_created_pgs
.clear();
818 dout(10) << __func__
<< " " << removed
819 << " pgs removed because they're created" << dendl
;
820 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
823 // filter out any pgs that shouldn't exist.
825 auto i
= pending_creatings
.pgs
.begin();
826 while (i
!= pending_creatings
.pgs
.end()) {
827 if (!nextmap
.pg_exists(i
->first
)) {
828 dout(10) << __func__
<< " removing pg " << i
->first
829 << " which should not exist" << dendl
;
830 i
= pending_creatings
.pgs
.erase(i
);
838 unsigned max
= MAX(1, g_conf
->mon_osd_max_creating_pgs
);
839 const auto total
= pending_creatings
.pgs
.size();
840 while (pending_creatings
.pgs
.size() < max
&&
841 !pending_creatings
.queue
.empty()) {
842 auto p
= pending_creatings
.queue
.begin();
843 int64_t poolid
= p
->first
;
844 dout(10) << __func__
<< " pool " << poolid
845 << " created " << p
->second
.created
846 << " modified " << p
->second
.modified
847 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
849 int n
= MIN(max
- pending_creatings
.pgs
.size(),
850 p
->second
.end
- p
->second
.start
);
851 ps_t first
= p
->second
.start
;
852 ps_t end
= first
+ n
;
853 for (ps_t ps
= first
; ps
< end
; ++ps
) {
854 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
855 // NOTE: use the *current* epoch as the PG creation epoch so that the
856 // OSD does not have to generate a long set of PastIntervals.
857 pending_creatings
.pgs
.emplace(pgid
, make_pair(inc
.epoch
,
858 p
->second
.modified
));
859 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
861 p
->second
.start
= end
;
862 if (p
->second
.done()) {
863 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
864 pending_creatings
.queue
.erase(p
);
866 dout(10) << __func__
<< " pool " << poolid
867 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
871 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
872 << " pools" << dendl
;
874 << " " << (pending_creatings
.pgs
.size() - total
)
875 << "/" << pending_creatings
.pgs
.size()
876 << " pgs added from queued pools" << dendl
;
877 return pending_creatings
;
880 void OSDMonitor::maybe_prime_pg_temp()
883 if (pending_inc
.crush
.length()) {
884 dout(10) << __func__
<< " new crush map, all" << dendl
;
888 if (!pending_inc
.new_up_client
.empty()) {
889 dout(10) << __func__
<< " new up osds, all" << dendl
;
893 // check for interesting OSDs
895 for (auto p
= pending_inc
.new_state
.begin();
896 !all
&& p
!= pending_inc
.new_state
.end();
898 if ((p
->second
& CEPH_OSD_UP
) &&
899 osdmap
.is_up(p
->first
)) {
900 osds
.insert(p
->first
);
903 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
904 !all
&& p
!= pending_inc
.new_weight
.end();
906 if (p
->second
< osdmap
.get_weight(p
->first
)) {
908 osds
.insert(p
->first
);
910 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
916 if (!all
&& osds
.empty())
921 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
922 if (estimate
> mapping
.get_num_pgs() *
923 g_conf
->mon_osd_prime_pg_temp_max_estimate
) {
924 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
925 << osds
.size() << " osds >= "
926 << g_conf
->mon_osd_prime_pg_temp_max_estimate
<< " of total "
927 << mapping
.get_num_pgs() << " pgs, all"
931 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
932 << osds
.size() << " osds" << dendl
;
937 next
.deepish_copy_from(osdmap
);
938 next
.apply_incremental(pending_inc
);
940 if (next
.get_pools().empty()) {
941 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
943 PrimeTempJob
job(next
, this);
944 mapper
.queue(&job
, g_conf
->mon_osd_mapping_pgs_per_chunk
);
945 if (job
.wait_for(g_conf
->mon_osd_prime_pg_temp_max_time
)) {
946 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
948 dout(10) << __func__
<< " did not finish in "
949 << g_conf
->mon_osd_prime_pg_temp_max_time
950 << ", stopping" << dendl
;
954 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
955 utime_t stop
= ceph_clock_now();
956 stop
+= g_conf
->mon_osd_prime_pg_temp_max_time
;
957 const int chunk
= 1000;
959 std::unordered_set
<pg_t
> did_pgs
;
960 for (auto osd
: osds
) {
961 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
962 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
963 for (auto pgid
: pgs
) {
964 if (!did_pgs
.insert(pgid
).second
) {
967 prime_pg_temp(next
, pgid
);
970 if (ceph_clock_now() > stop
) {
971 dout(10) << __func__
<< " consumed more than "
972 << g_conf
->mon_osd_prime_pg_temp_max_time
973 << " seconds, stopping"
983 void OSDMonitor::prime_pg_temp(
987 if (mon
->monmap
->get_required_features().contains_all(
988 ceph::features::mon::FEATURE_LUMINOUS
)) {
989 // TODO: remove this creating_pgs direct access?
990 if (creating_pgs
.pgs
.count(pgid
)) {
994 if (mon
->pgservice
->is_creating_pg(pgid
)) {
998 if (!osdmap
.pg_exists(pgid
)) {
1002 vector
<int> up
, acting
;
1003 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1005 vector
<int> next_up
, next_acting
;
1006 int next_up_primary
, next_acting_primary
;
1007 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1008 &next_acting
, &next_acting_primary
);
1009 if (acting
== next_acting
&&
1010 !(up
!= acting
&& next_up
== next_acting
))
1011 return; // no change since last epoch
1014 return; // if previously empty now we can be no worse off
1015 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1016 if (pool
&& acting
.size() < pool
->min_size
)
1017 return; // can be no worse off than before
1019 if (next_up
== next_acting
) {
1021 dout(20) << __func__
<< "next_up === next_acting now, clear pg_temp"
1025 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1026 << " -> " << next_up
<< "/" << next_acting
1027 << ", priming " << acting
1030 Mutex::Locker
l(prime_pg_temp_lock
);
1031 // do not touch a mapping if a change is pending
1032 pending_inc
.new_pg_temp
.emplace(
1034 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1039 * @note receiving a transaction in this function gives a fair amount of
1040 * freedom to the service implementation if it does need it. It shouldn't.
1042 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1044 dout(10) << "encode_pending e " << pending_inc
.epoch
1047 // finalize up pending_inc
1048 pending_inc
.modified
= ceph_clock_now();
1050 int r
= pending_inc
.propagate_snaps_to_tiers(g_ceph_context
, osdmap
);
1054 if (!mapping_job
->is_done()) {
1055 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1056 << mapping_job
.get() << " did not complete, "
1057 << mapping_job
->shards
<< " left" << dendl
;
1058 mapping_job
->abort();
1059 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1060 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1061 << mapping_job
.get() << " is prior epoch "
1062 << mapping
.get_epoch() << dendl
;
1064 if (g_conf
->mon_osd_prime_pg_temp
) {
1065 maybe_prime_pg_temp();
1068 } else if (g_conf
->mon_osd_prime_pg_temp
) {
1069 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1072 mapping_job
.reset();
1074 // ensure we don't have blank new_state updates. these are interrpeted as
1075 // CEPH_OSD_UP (and almost certainly not what we want!).
1076 auto p
= pending_inc
.new_state
.begin();
1077 while (p
!= pending_inc
.new_state
.end()) {
1078 if (p
->second
== 0) {
1079 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1080 p
= pending_inc
.new_state
.erase(p
);
1090 tmp
.deepish_copy_from(osdmap
);
1091 tmp
.apply_incremental(pending_inc
);
1093 if (tmp
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1094 // remove any legacy osdmap nearfull/full flags
1096 if (tmp
.test_flag(CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
1097 dout(10) << __func__
<< " clearing legacy osdmap nearfull/full flag"
1099 remove_flag(CEPH_OSDMAP_NEARFULL
);
1100 remove_flag(CEPH_OSDMAP_FULL
);
1103 // collect which pools are currently affected by
1104 // the near/backfill/full osd(s),
1105 // and set per-pool near/backfill/full flag instead
1106 set
<int64_t> full_pool_ids
;
1107 set
<int64_t> backfillfull_pool_ids
;
1108 set
<int64_t> nearfull_pool_ids
;
1109 tmp
.get_full_pools(g_ceph_context
,
1111 &backfillfull_pool_ids
,
1112 &nearfull_pool_ids
);
1113 if (full_pool_ids
.empty() ||
1114 backfillfull_pool_ids
.empty() ||
1115 nearfull_pool_ids
.empty()) {
1116 // normal case - no nearfull, backfillfull or full osds
1117 // try cancel any improper nearfull/backfillfull/full pool
1119 for (auto &pool
: tmp
.get_pools()) {
1120 auto p
= pool
.first
;
1121 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1122 nearfull_pool_ids
.empty()) {
1123 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1124 << "'s nearfull flag" << dendl
;
1125 if (pending_inc
.new_pools
.count(p
) == 0) {
1126 // load original pool info first!
1127 pending_inc
.new_pools
[p
] = pool
.second
;
1129 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1131 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1132 backfillfull_pool_ids
.empty()) {
1133 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1134 << "'s backfillfull flag" << dendl
;
1135 if (pending_inc
.new_pools
.count(p
) == 0) {
1136 pending_inc
.new_pools
[p
] = pool
.second
;
1138 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1140 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1141 full_pool_ids
.empty()) {
1142 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1143 // set by EQUOTA, skipping
1146 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1147 << "'s full flag" << dendl
;
1148 if (pending_inc
.new_pools
.count(p
) == 0) {
1149 pending_inc
.new_pools
[p
] = pool
.second
;
1151 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1155 if (!full_pool_ids
.empty()) {
1156 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1157 << " as full" << dendl
;
1158 for (auto &p
: full_pool_ids
) {
1159 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1162 if (pending_inc
.new_pools
.count(p
) == 0) {
1163 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1165 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1166 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1167 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1169 // cancel FLAG_FULL for pools which are no longer full too
1170 for (auto &pool
: tmp
.get_pools()) {
1171 auto p
= pool
.first
;
1172 if (full_pool_ids
.count(p
)) {
1173 // skip pools we have just marked as full above
1176 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1177 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1178 // don't touch if currently is not full
1179 // or is running out of quota (and hence considered as full)
1182 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1183 << "'s full flag" << dendl
;
1184 if (pending_inc
.new_pools
.count(p
) == 0) {
1185 pending_inc
.new_pools
[p
] = pool
.second
;
1187 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1190 if (!backfillfull_pool_ids
.empty()) {
1191 for (auto &p
: backfillfull_pool_ids
) {
1192 if (full_pool_ids
.count(p
)) {
1193 // skip pools we have already considered as full above
1196 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1197 // make sure FLAG_FULL is truly set, so we are safe not
1198 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1199 assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1202 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1203 // don't bother if pool is already marked as backfillfull
1206 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1207 << "'s as backfillfull" << dendl
;
1208 if (pending_inc
.new_pools
.count(p
) == 0) {
1209 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1211 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1212 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1214 // cancel FLAG_BACKFILLFULL for pools
1215 // which are no longer backfillfull too
1216 for (auto &pool
: tmp
.get_pools()) {
1217 auto p
= pool
.first
;
1218 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1219 // skip pools we have just marked as backfillfull/full above
1222 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1223 // and don't touch if currently is not backfillfull
1226 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1227 << "'s backfillfull flag" << dendl
;
1228 if (pending_inc
.new_pools
.count(p
) == 0) {
1229 pending_inc
.new_pools
[p
] = pool
.second
;
1231 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1234 if (!nearfull_pool_ids
.empty()) {
1235 for (auto &p
: nearfull_pool_ids
) {
1236 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1239 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1240 // make sure FLAG_FULL is truly set, so we are safe not
1241 // to set a extra (redundant) FLAG_NEARFULL flag
1242 assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1245 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1246 // don't bother if pool is already marked as nearfull
1249 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1250 << "'s as nearfull" << dendl
;
1251 if (pending_inc
.new_pools
.count(p
) == 0) {
1252 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1254 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1256 // cancel FLAG_NEARFULL for pools
1257 // which are no longer nearfull too
1258 for (auto &pool
: tmp
.get_pools()) {
1259 auto p
= pool
.first
;
1260 if (full_pool_ids
.count(p
) ||
1261 backfillfull_pool_ids
.count(p
) ||
1262 nearfull_pool_ids
.count(p
)) {
1263 // skip pools we have just marked as
1264 // nearfull/backfillfull/full above
1267 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1268 // and don't touch if currently is not nearfull
1271 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1272 << "'s nearfull flag" << dendl
;
1273 if (pending_inc
.new_pools
.count(p
) == 0) {
1274 pending_inc
.new_pools
[p
] = pool
.second
;
1276 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1280 // min_compat_client?
1281 if (tmp
.require_min_compat_client
== 0) {
1282 auto mv
= tmp
.get_min_compat_client();
1283 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1284 << "required " << ceph_release_name(mv
) << dendl
;
1285 mon
->clog
->info() << "setting require_min_compat_client to currently "
1286 << "required " << ceph_release_name(mv
);
1287 pending_inc
.new_require_min_compat_client
= mv
;
1290 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1291 // convert ec profile ruleset-* -> crush-*
1292 for (auto& p
: tmp
.erasure_code_profiles
) {
1293 bool changed
= false;
1294 map
<string
,string
> newprofile
;
1295 for (auto& q
: p
.second
) {
1296 if (q
.first
.find("ruleset-") == 0) {
1297 string key
= "crush-";
1298 key
+= q
.first
.substr(8);
1299 newprofile
[key
] = q
.second
;
1301 dout(20) << " updating ec profile " << p
.first
1302 << " key " << q
.first
<< " -> " << key
<< dendl
;
1304 newprofile
[q
.first
] = q
.second
;
1308 dout(10) << " updated ec profile " << p
.first
<< ": "
1309 << newprofile
<< dendl
;
1310 pending_inc
.new_erasure_code_profiles
[p
.first
] = newprofile
;
1314 // auto-enable pool applications upon upgrade
1315 // NOTE: this can be removed post-Luminous assuming upgrades need to
1316 // proceed through Luminous
1317 for (auto &pool_pair
: tmp
.pools
) {
1318 int64_t pool_id
= pool_pair
.first
;
1319 pg_pool_t pg_pool
= pool_pair
.second
;
1320 if (pg_pool
.is_tier()) {
1324 std::string pool_name
= tmp
.get_pool_name(pool_id
);
1325 uint32_t match_count
= 0;
1328 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
1329 if (pending_fsmap
.pool_in_use(pool_id
)) {
1330 dout(10) << __func__
<< " auto-enabling CephFS on pool '"
1331 << pool_name
<< "'" << dendl
;
1332 pg_pool
.application_metadata
.insert(
1333 {pg_pool_t::APPLICATION_NAME_CEPHFS
, {}});
1337 // RBD heuristics (default OpenStack pool names from docs and
1339 if (boost::algorithm::contains(pool_name
, "rbd") ||
1340 pool_name
== "images" || pool_name
== "volumes" ||
1341 pool_name
== "backups" || pool_name
== "vms") {
1342 dout(10) << __func__
<< " auto-enabling RBD on pool '"
1343 << pool_name
<< "'" << dendl
;
1344 pg_pool
.application_metadata
.insert(
1345 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
1350 if (boost::algorithm::contains(pool_name
, ".rgw") ||
1351 boost::algorithm::contains(pool_name
, ".log") ||
1352 boost::algorithm::contains(pool_name
, ".intent-log") ||
1353 boost::algorithm::contains(pool_name
, ".usage") ||
1354 boost::algorithm::contains(pool_name
, ".users")) {
1355 dout(10) << __func__
<< " auto-enabling RGW on pool '"
1356 << pool_name
<< "'" << dendl
;
1357 pg_pool
.application_metadata
.insert(
1358 {pg_pool_t::APPLICATION_NAME_RGW
, {}});
1362 // OpenStack gnocchi (from ceph-ansible)
1363 if (pool_name
== "metrics" && match_count
== 0) {
1364 dout(10) << __func__
<< " auto-enabling OpenStack Gnocchi on pool '"
1365 << pool_name
<< "'" << dendl
;
1366 pg_pool
.application_metadata
.insert({"openstack_gnocchi", {}});
1370 if (match_count
== 1) {
1371 pg_pool
.last_change
= pending_inc
.epoch
;
1372 pending_inc
.new_pools
[pool_id
] = pg_pool
;
1373 } else if (match_count
> 1) {
1374 auto pstat
= mon
->pgservice
->get_pool_stat(pool_id
);
1375 if (pstat
!= nullptr && pstat
->stats
.sum
.num_objects
> 0) {
1376 mon
->clog
->info() << "unable to auto-enable application for pool "
1377 << "'" << pool_name
<< "'";
1386 for (auto i
= pending_inc
.new_state
.begin();
1387 i
!= pending_inc
.new_state
.end();
1389 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1390 if (s
& CEPH_OSD_UP
)
1391 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1392 if (s
& CEPH_OSD_EXISTS
)
1393 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1395 for (map
<int32_t,entity_addr_t
>::iterator i
= pending_inc
.new_up_client
.begin();
1396 i
!= pending_inc
.new_up_client
.end();
1398 //FIXME: insert cluster addresses too
1399 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1401 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1402 i
!= pending_inc
.new_weight
.end();
1404 if (i
->second
== CEPH_OSD_OUT
) {
1405 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1406 } else if (i
->second
== CEPH_OSD_IN
) {
1407 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1409 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1413 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1414 osdmap
.maybe_remove_pg_upmaps(cct
, osdmap
, &pending_inc
);
1416 // features for osdmap and its incremental
1419 // encode full map and determine its crc
1422 tmp
.deepish_copy_from(osdmap
);
1423 tmp
.apply_incremental(pending_inc
);
1425 // determine appropriate features
1426 features
= tmp
.get_encoding_features();
1427 dout(10) << __func__
<< " encoding full map with "
1428 << ceph_release_name(tmp
.require_osd_release
)
1429 << " features " << features
<< dendl
;
1431 // the features should be a subset of the mon quorum's features!
1432 assert((features
& ~mon
->get_quorum_con_features()) == 0);
1435 ::encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1436 pending_inc
.full_crc
= tmp
.get_crc();
1438 // include full map in the txn. note that old monitors will
1439 // overwrite this. new ones will now skip the local full map
1440 // encode and reload from this.
1441 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1445 assert(get_last_committed() + 1 == pending_inc
.epoch
);
1446 ::encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1448 dout(20) << " full_crc " << tmp
.get_crc()
1449 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1451 /* put everything in the transaction */
1452 put_version(t
, pending_inc
.epoch
, bl
);
1453 put_last_committed(t
, pending_inc
.epoch
);
1456 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1457 p
!= pending_metadata
.end();
1459 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1460 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1461 p
!= pending_metadata_rm
.end();
1463 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1464 pending_metadata
.clear();
1465 pending_metadata_rm
.clear();
1467 // and pg creating, also!
1468 if (mon
->monmap
->get_required_features().contains_all(
1469 ceph::features::mon::FEATURE_LUMINOUS
)) {
1470 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1471 if (osdmap
.get_epoch() &&
1472 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1473 dout(7) << __func__
<< " in the middle of upgrading, "
1474 << " trimming pending creating_pgs using pgmap" << dendl
;
1475 mon
->pgservice
->maybe_trim_creating_pgs(&pending_creatings
);
1477 bufferlist creatings_bl
;
1478 ::encode(pending_creatings
, creatings_bl
);
1479 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1483 health_check_map_t next
;
1484 tmp
.check_health(&next
);
1485 encode_health(next
, t
);
1488 void OSDMonitor::trim_creating_pgs(creating_pgs_t
* creating_pgs
,
1489 const ceph::unordered_map
<pg_t
,pg_stat_t
>& pg_stat
)
1491 auto p
= creating_pgs
->pgs
.begin();
1492 while (p
!= creating_pgs
->pgs
.end()) {
1493 auto q
= pg_stat
.find(p
->first
);
1494 if (q
!= pg_stat
.end() &&
1495 !(q
->second
.state
& PG_STATE_CREATING
)) {
1496 dout(20) << __func__
<< " pgmap shows " << p
->first
<< " is created"
1498 p
= creating_pgs
->pgs
.erase(p
);
1505 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
1508 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
1512 bufferlist::iterator p
= bl
.begin();
1515 catch (buffer::error
& e
) {
1517 *err
<< "osd." << osd
<< " metadata is corrupt";
1523 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
1525 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
1526 if (osdmap
.is_up(osd
)) {
1527 map
<string
,string
> meta
;
1528 load_metadata(osd
, meta
, nullptr);
1529 auto p
= meta
.find(field
);
1530 if (p
== meta
.end()) {
1531 (*out
)["unknown"]++;
1533 (*out
)[p
->second
]++;
1539 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
1541 map
<string
,int> by_val
;
1542 count_metadata(field
, &by_val
);
1543 f
->open_object_section(field
.c_str());
1544 for (auto& p
: by_val
) {
1545 f
->dump_int(p
.first
.c_str(), p
.second
);
1550 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
1552 map
<string
, string
> metadata
;
1553 int r
= load_metadata(osd
, metadata
, nullptr);
1557 auto it
= metadata
.find("osd_objectstore");
1558 if (it
== metadata
.end())
1564 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
1565 const pg_pool_t
&pool
,
1568 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1569 // since filestore osds could always join the pool later
1570 set
<int> checked_osds
;
1571 for (unsigned ps
= 0; ps
< MIN(8, pool
.get_pg_num()); ++ps
) {
1572 vector
<int> up
, acting
;
1573 pg_t
pgid(ps
, pool_id
, -1);
1574 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
1575 for (int osd
: up
) {
1576 if (checked_osds
.find(osd
) != checked_osds
.end())
1578 string objectstore_type
;
1579 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
1580 // allow with missing metadata, e.g. due to an osd never booting yet
1581 if (r
< 0 || objectstore_type
== "bluestore") {
1582 checked_osds
.insert(osd
);
1585 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
1592 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
1594 map
<string
,string
> m
;
1595 if (int r
= load_metadata(osd
, m
, err
))
1597 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
1598 f
->dump_string(p
->first
.c_str(), p
->second
);
1602 void OSDMonitor::print_nodes(Formatter
*f
)
1604 // group OSDs by their hosts
1605 map
<string
, list
<int> > osds
; // hostname => osd
1606 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
1607 map
<string
, string
> m
;
1608 if (load_metadata(osd
, m
, NULL
)) {
1611 map
<string
, string
>::iterator hostname
= m
.find("hostname");
1612 if (hostname
== m
.end()) {
1613 // not likely though
1616 osds
[hostname
->second
].push_back(osd
);
1619 dump_services(f
, osds
, "osd");
1622 void OSDMonitor::share_map_with_random_osd()
1624 if (osdmap
.get_num_up_osds() == 0) {
1625 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
1629 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
1631 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
1635 dout(10) << "committed, telling random " << s
->inst
<< " all about it" << dendl
;
1637 // get feature of the peer
1638 // use quorum_con_features, if it's an anonymous connection.
1639 uint64_t features
= s
->con_features
? s
->con_features
:
1640 mon
->get_quorum_con_features();
1641 // whatev, they'll request more if they need it
1642 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
1643 s
->con
->send_message(m
);
1644 // NOTE: do *not* record osd has up to this epoch (as we do
1645 // elsewhere) as they may still need to request older values.
1648 version_t
OSDMonitor::get_trim_to()
1650 if (mon
->get_quorum().empty()) {
1651 dout(10) << __func__
<< ": quorum not formed" << dendl
;
1656 if (mon
->monmap
->get_required_features().contains_all(
1657 ceph::features::mon::FEATURE_LUMINOUS
)) {
1659 // TODO: Get this hidden in PGStatService
1660 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1661 if (!creating_pgs
.pgs
.empty()) {
1665 floor
= get_min_last_epoch_clean();
1667 if (!mon
->pgservice
->is_readable())
1669 if (mon
->pgservice
->have_creating_pgs()) {
1672 floor
= mon
->pgservice
->get_min_last_epoch_clean();
1675 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
1676 if (g_conf
->mon_osd_force_trim_to
> 0 &&
1677 g_conf
->mon_osd_force_trim_to
< (int)get_last_committed()) {
1678 floor
= g_conf
->mon_osd_force_trim_to
;
1679 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
1681 unsigned min
= g_conf
->mon_min_osdmap_epochs
;
1682 if (floor
+ min
> get_last_committed()) {
1683 if (min
< get_last_committed())
1684 floor
= get_last_committed() - min
;
1688 if (floor
> get_first_committed())
1694 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
1696 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
1697 // also scan osd epochs
1698 // don't trim past the oldest reported osd epoch
1699 for (auto& osd_epoch
: osd_epochs
) {
1700 if (osd_epoch
.second
< floor
) {
1701 floor
= osd_epoch
.second
;
1707 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
1710 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
1712 get_version_full(first
, bl
);
1713 put_version_full(tx
, first
, bl
);
1718 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
1720 op
->mark_osdmon_event(__func__
);
1721 Message
*m
= op
->get_req();
1722 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1724 switch (m
->get_type()) {
1726 case MSG_MON_COMMAND
:
1728 return preprocess_command(op
);
1730 catch (const bad_cmd_get
& e
) {
1732 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
1735 case CEPH_MSG_MON_GET_OSDMAP
:
1736 return preprocess_get_osdmap(op
);
1739 case MSG_OSD_MARK_ME_DOWN
:
1740 return preprocess_mark_me_down(op
);
1742 return preprocess_full(op
);
1743 case MSG_OSD_FAILURE
:
1744 return preprocess_failure(op
);
1746 return preprocess_boot(op
);
1748 return preprocess_alive(op
);
1749 case MSG_OSD_PG_CREATED
:
1750 return preprocess_pg_created(op
);
1751 case MSG_OSD_PGTEMP
:
1752 return preprocess_pgtemp(op
);
1753 case MSG_OSD_BEACON
:
1754 return preprocess_beacon(op
);
1756 case CEPH_MSG_POOLOP
:
1757 return preprocess_pool_op(op
);
1759 case MSG_REMOVE_SNAPS
:
1760 return preprocess_remove_snaps(op
);
1768 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
1770 op
->mark_osdmon_event(__func__
);
1771 Message
*m
= op
->get_req();
1772 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1774 switch (m
->get_type()) {
1776 case MSG_OSD_MARK_ME_DOWN
:
1777 return prepare_mark_me_down(op
);
1779 return prepare_full(op
);
1780 case MSG_OSD_FAILURE
:
1781 return prepare_failure(op
);
1783 return prepare_boot(op
);
1785 return prepare_alive(op
);
1786 case MSG_OSD_PG_CREATED
:
1787 return prepare_pg_created(op
);
1788 case MSG_OSD_PGTEMP
:
1789 return prepare_pgtemp(op
);
1790 case MSG_OSD_BEACON
:
1791 return prepare_beacon(op
);
1793 case MSG_MON_COMMAND
:
1795 return prepare_command(op
);
1797 catch (const bad_cmd_get
& e
) {
1799 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
1803 case CEPH_MSG_POOLOP
:
1804 return prepare_pool_op(op
);
1806 case MSG_REMOVE_SNAPS
:
1807 return prepare_remove_snaps(op
);
1817 bool OSDMonitor::should_propose(double& delay
)
1819 dout(10) << "should_propose" << dendl
;
1821 // if full map, propose immediately! any subsequent changes will be clobbered.
1822 if (pending_inc
.fullmap
.length())
1825 // adjust osd weights?
1826 if (!osd_weight
.empty() &&
1827 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
1828 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
1829 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
1835 return PaxosService::should_propose(delay
);
1840 // ---------------------------
1843 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
1845 op
->mark_osdmon_event(__func__
);
1846 MMonGetOSDMap
*m
= static_cast<MMonGetOSDMap
*>(op
->get_req());
1848 uint64_t features
= mon
->get_quorum_con_features();
1849 if (m
->get_session() && m
->get_session()->con_features
)
1850 features
= m
->get_session()->con_features
;
1852 dout(10) << __func__
<< " " << *m
<< dendl
;
1853 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
, features
);
1854 epoch_t first
= get_first_committed();
1855 epoch_t last
= osdmap
.get_epoch();
1856 int max
= g_conf
->osd_map_message_max
;
1857 for (epoch_t e
= MAX(first
, m
->get_full_first());
1858 e
<= MIN(last
, m
->get_full_last()) && max
> 0;
1860 int r
= get_version_full(e
, features
, reply
->maps
[e
]);
1863 for (epoch_t e
= MAX(first
, m
->get_inc_first());
1864 e
<= MIN(last
, m
->get_inc_last()) && max
> 0;
1866 int r
= get_version(e
, features
, reply
->incremental_maps
[e
]);
1869 reply
->oldest_map
= first
;
1870 reply
->newest_map
= last
;
1871 mon
->send_reply(op
, reply
);
1876 // ---------------------------
1881 bool OSDMonitor::check_source(PaxosServiceMessage
*m
, uuid_d fsid
) {
1882 // check permissions
1883 MonSession
*session
= m
->get_session();
1886 if (!session
->is_capable("osd", MON_CAP_X
)) {
1887 dout(0) << "got MOSDFailure from entity with insufficient caps "
1888 << session
->caps
<< dendl
;
1891 if (fsid
!= mon
->monmap
->fsid
) {
1892 dout(0) << "check_source: on fsid " << fsid
1893 << " != " << mon
->monmap
->fsid
<< dendl
;
1900 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
1902 op
->mark_osdmon_event(__func__
);
1903 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1904 // who is target_osd
1905 int badboy
= m
->get_target().name
.num();
1907 // check permissions
1908 if (check_source(m
, m
->fsid
))
1911 // first, verify the reporting host is valid
1912 if (m
->get_orig_source().is_osd()) {
1913 int from
= m
->get_orig_source().num();
1914 if (!osdmap
.exists(from
) ||
1915 osdmap
.get_addr(from
) != m
->get_orig_source_inst().addr
||
1916 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
1917 dout(5) << "preprocess_failure from dead osd." << from
<< ", ignoring" << dendl
;
1918 send_incremental(op
, m
->get_epoch()+1);
1925 if (osdmap
.is_down(badboy
)) {
1926 dout(5) << "preprocess_failure dne(/dup?): " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1927 if (m
->get_epoch() < osdmap
.get_epoch())
1928 send_incremental(op
, m
->get_epoch()+1);
1931 if (osdmap
.get_inst(badboy
) != m
->get_target()) {
1932 dout(5) << "preprocess_failure wrong osd: report " << m
->get_target() << " != map's " << osdmap
.get_inst(badboy
)
1933 << ", from " << m
->get_orig_source_inst() << dendl
;
1934 if (m
->get_epoch() < osdmap
.get_epoch())
1935 send_incremental(op
, m
->get_epoch()+1);
1939 // already reported?
1940 if (osdmap
.is_down(badboy
) ||
1941 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
1942 dout(5) << "preprocess_failure dup/old: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1943 if (m
->get_epoch() < osdmap
.get_epoch())
1944 send_incremental(op
, m
->get_epoch()+1);
1948 if (!can_mark_down(badboy
)) {
1949 dout(5) << "preprocess_failure ignoring report of " << m
->get_target() << " from " << m
->get_orig_source_inst() << dendl
;
1953 dout(10) << "preprocess_failure new: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1961 class C_AckMarkedDown
: public C_MonOp
{
1967 : C_MonOp(op
), osdmon(osdmon
) {}
1969 void _finish(int) override
{
1970 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1971 osdmon
->mon
->send_reply(
1977 false)); // ACK itself does not request an ack
1979 ~C_AckMarkedDown() override
{
1983 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
1985 op
->mark_osdmon_event(__func__
);
1986 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1987 int requesting_down
= m
->get_target().name
.num();
1988 int from
= m
->get_orig_source().num();
1990 // check permissions
1991 if (check_source(m
, m
->fsid
))
1994 // first, verify the reporting host is valid
1995 if (!m
->get_orig_source().is_osd())
1998 if (!osdmap
.exists(from
) ||
1999 osdmap
.is_down(from
) ||
2000 osdmap
.get_addr(from
) != m
->get_target().addr
) {
2001 dout(5) << "preprocess_mark_me_down from dead osd."
2002 << from
<< ", ignoring" << dendl
;
2003 send_incremental(op
, m
->get_epoch()+1);
2007 // no down might be set
2008 if (!can_mark_down(requesting_down
))
2011 dout(10) << "MOSDMarkMeDown for: " << m
->get_target() << dendl
;
2015 if (m
->request_ack
) {
2016 Context
*c(new C_AckMarkedDown(this, op
));
2022 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
2024 op
->mark_osdmon_event(__func__
);
2025 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
2026 int target_osd
= m
->get_target().name
.num();
2028 assert(osdmap
.is_up(target_osd
));
2029 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
2031 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
2032 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2034 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
2038 bool OSDMonitor::can_mark_down(int i
)
2040 if (osdmap
.test_flag(CEPH_OSDMAP_NODOWN
)) {
2041 dout(5) << __func__
<< " NODOWN flag set, will not mark osd." << i
2042 << " down" << dendl
;
2046 if (osdmap
.is_nodown(i
)) {
2047 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
2048 << "will not mark it down" << dendl
;
2052 int num_osds
= osdmap
.get_num_osds();
2053 if (num_osds
== 0) {
2054 dout(5) << __func__
<< " no osds" << dendl
;
2057 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
2058 float up_ratio
= (float)up
/ (float)num_osds
;
2059 if (up_ratio
< g_conf
->mon_osd_min_up_ratio
) {
2060 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
2061 << g_conf
->mon_osd_min_up_ratio
2062 << ", will not mark osd." << i
<< " down" << dendl
;
2068 bool OSDMonitor::can_mark_up(int i
)
2070 if (osdmap
.test_flag(CEPH_OSDMAP_NOUP
)) {
2071 dout(5) << __func__
<< " NOUP flag set, will not mark osd." << i
2076 if (osdmap
.is_noup(i
)) {
2077 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
2078 << "will not mark it up" << dendl
;
2086 * @note the parameter @p i apparently only exists here so we can output the
2087 * osd's id on messages.
2089 bool OSDMonitor::can_mark_out(int i
)
2091 if (osdmap
.test_flag(CEPH_OSDMAP_NOOUT
)) {
2092 dout(5) << __func__
<< " NOOUT flag set, will not mark osds out" << dendl
;
2096 if (osdmap
.is_noout(i
)) {
2097 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
2098 << "will not mark it out" << dendl
;
2102 int num_osds
= osdmap
.get_num_osds();
2103 if (num_osds
== 0) {
2104 dout(5) << __func__
<< " no osds" << dendl
;
2107 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
2108 float in_ratio
= (float)in
/ (float)num_osds
;
2109 if (in_ratio
< g_conf
->mon_osd_min_in_ratio
) {
2111 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
2112 << g_conf
->mon_osd_min_in_ratio
2113 << ", will not mark osd." << i
<< " out" << dendl
;
2115 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
2116 << g_conf
->mon_osd_min_in_ratio
2117 << ", will not mark osds out" << dendl
;
2124 bool OSDMonitor::can_mark_in(int i
)
2126 if (osdmap
.test_flag(CEPH_OSDMAP_NOIN
)) {
2127 dout(5) << __func__
<< " NOIN flag set, will not mark osd." << i
2132 if (osdmap
.is_noin(i
)) {
2133 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
2134 << "will not mark it in" << dendl
;
2141 bool OSDMonitor::check_failures(utime_t now
)
2143 bool found_failure
= false;
2144 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2145 p
!= failure_info
.end();
2147 if (can_mark_down(p
->first
)) {
2148 found_failure
|= check_failure(now
, p
->first
, p
->second
);
2151 return found_failure
;
2154 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
2156 // already pending failure?
2157 if (pending_inc
.new_state
.count(target_osd
) &&
2158 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
2159 dout(10) << " already pending failure" << dendl
;
2163 set
<string
> reporters_by_subtree
;
2164 string reporter_subtree_level
= g_conf
->mon_osd_reporter_subtree_level
;
2165 utime_t
orig_grace(g_conf
->osd_heartbeat_grace
, 0);
2166 utime_t max_failed_since
= fi
.get_failed_since();
2167 utime_t failed_for
= now
- max_failed_since
;
2169 utime_t grace
= orig_grace
;
2170 double my_grace
= 0, peer_grace
= 0;
2172 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
2173 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
2174 decay_k
= ::log(.5) / halflife
;
2176 // scale grace period based on historical probability of 'lagginess'
2177 // (false positive failures due to slowness).
2178 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
2179 double decay
= exp((double)failed_for
* decay_k
);
2180 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
2181 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
2182 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
2186 // consider the peers reporting a failure a proxy for a potential
2187 // 'subcluster' over the overall cluster that is similarly
2188 // laggy. this is clearly not true in all cases, but will sometimes
2189 // help us localize the grace correction to a subset of the system
2190 // (say, a rack with a bad switch) that is unhappy.
2191 assert(fi
.reporters
.size());
2192 for (map
<int,failure_reporter_t
>::iterator p
= fi
.reporters
.begin();
2193 p
!= fi
.reporters
.end();
2195 // get the parent bucket whose type matches with "reporter_subtree_level".
2196 // fall back to OSD if the level doesn't exist.
2197 map
<string
, string
> reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
2198 map
<string
, string
>::iterator iter
= reporter_loc
.find(reporter_subtree_level
);
2199 if (iter
== reporter_loc
.end()) {
2200 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
2202 reporters_by_subtree
.insert(iter
->second
);
2204 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
2205 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
2206 utime_t elapsed
= now
- xi
.down_stamp
;
2207 double decay
= exp((double)elapsed
* decay_k
);
2208 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
2212 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
2213 peer_grace
/= (double)fi
.reporters
.size();
2214 grace
+= peer_grace
;
2217 dout(10) << " osd." << target_osd
<< " has "
2218 << fi
.reporters
.size() << " reporters, "
2219 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
2220 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
2223 if (failed_for
>= grace
&&
2224 (int)reporters_by_subtree
.size() >= g_conf
->mon_osd_min_down_reporters
) {
2225 dout(1) << " we have enough reporters to mark osd." << target_osd
2226 << " down" << dendl
;
2227 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2229 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
2230 << osdmap
.crush
->get_full_location_ordered_string(
2233 << (int)reporters_by_subtree
.size()
2234 << " reporters from different "
2235 << reporter_subtree_level
<< " after "
2236 << failed_for
<< " >= grace " << grace
<< ")";
2242 void OSDMonitor::force_failure(int target_osd
, int by
)
2244 // already pending failure?
2245 if (pending_inc
.new_state
.count(target_osd
) &&
2246 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
2247 dout(10) << " already pending failure" << dendl
;
2251 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
2252 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2254 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
2255 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
2256 << ") (connection refused reported by osd." << by
<< ")";
2260 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
2262 op
->mark_osdmon_event(__func__
);
2263 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
2264 dout(1) << "prepare_failure " << m
->get_target()
2265 << " from " << m
->get_orig_source_inst()
2266 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
2268 int target_osd
= m
->get_target().name
.num();
2269 int reporter
= m
->get_orig_source().num();
2270 assert(osdmap
.is_up(target_osd
));
2271 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
2273 if (m
->if_osd_failed()) {
2274 // calculate failure time
2275 utime_t now
= ceph_clock_now();
2276 utime_t failed_since
=
2277 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
2280 if (m
->is_immediate()) {
2281 mon
->clog
->debug() << m
->get_target() << " reported immediately failed by "
2282 << m
->get_orig_source_inst();
2283 force_failure(target_osd
, reporter
);
2287 mon
->clog
->debug() << m
->get_target() << " reported failed by "
2288 << m
->get_orig_source_inst();
2290 failure_info_t
& fi
= failure_info
[target_osd
];
2291 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
2293 mon
->no_reply(old_op
);
2296 return check_failure(now
, target_osd
, fi
);
2298 // remove the report
2299 mon
->clog
->debug() << m
->get_target() << " failure report canceled by "
2300 << m
->get_orig_source_inst();
2301 if (failure_info
.count(target_osd
)) {
2302 failure_info_t
& fi
= failure_info
[target_osd
];
2303 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
2305 mon
->no_reply(report_op
);
2307 if (fi
.reporters
.empty()) {
2308 dout(10) << " removing last failure_info for osd." << target_osd
2310 failure_info
.erase(target_osd
);
2312 dout(10) << " failure_info for osd." << target_osd
<< " now "
2313 << fi
.reporters
.size() << " reporters" << dendl
;
2316 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
2324 void OSDMonitor::process_failures()
2326 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2327 while (p
!= failure_info
.end()) {
2328 if (osdmap
.is_up(p
->first
)) {
2331 dout(10) << "process_failures osd." << p
->first
<< dendl
;
2332 list
<MonOpRequestRef
> ls
;
2333 p
->second
.take_report_messages(ls
);
2334 failure_info
.erase(p
++);
2336 while (!ls
.empty()) {
2337 MonOpRequestRef o
= ls
.front();
2339 o
->mark_event(__func__
);
2340 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
2341 send_latest(o
, m
->get_epoch());
2350 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
2352 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
2354 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2355 p
!= failure_info
.end();
2357 p
->second
.take_report_messages(ls
);
2359 failure_info
.clear();
2365 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
2367 op
->mark_osdmon_event(__func__
);
2368 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2369 int from
= m
->get_orig_source_inst().name
.num();
2371 // check permissions, ignore if failed (no response expected)
2372 MonSession
*session
= m
->get_session();
2375 if (!session
->is_capable("osd", MON_CAP_X
)) {
2376 dout(0) << "got preprocess_boot message from entity with insufficient caps"
2377 << session
->caps
<< dendl
;
2381 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
2382 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
2383 << " != " << mon
->monmap
->fsid
<< dendl
;
2387 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
2388 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
2392 assert(m
->get_orig_source_inst().name
.is_osd());
2394 // check if osd has required features to boot
2395 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2396 CEPH_FEATURE_OSD_ERASURE_CODES
) &&
2397 !(m
->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES
)) {
2398 dout(0) << __func__
<< " osdmap requires erasure code but osd at "
2399 << m
->get_orig_source_inst()
2400 << " doesn't announce support -- ignore" << dendl
;
2404 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2405 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
) &&
2406 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
)) {
2407 dout(0) << __func__
<< " osdmap requires erasure code plugins v2 but osd at "
2408 << m
->get_orig_source_inst()
2409 << " doesn't announce support -- ignore" << dendl
;
2413 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2414 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
) &&
2415 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
)) {
2416 dout(0) << __func__
<< " osdmap requires erasure code plugins v3 but osd at "
2417 << m
->get_orig_source_inst()
2418 << " doesn't announce support -- ignore" << dendl
;
2422 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
2423 !HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
2424 mon
->clog
->info() << "disallowing boot of OSD "
2425 << m
->get_orig_source_inst()
2426 << " because the osdmap requires"
2427 << " CEPH_FEATURE_SERVER_LUMINOUS"
2428 << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
2432 if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
&&
2433 !(m
->osd_features
& CEPH_FEATURE_SERVER_JEWEL
)) {
2434 mon
->clog
->info() << "disallowing boot of OSD "
2435 << m
->get_orig_source_inst()
2436 << " because the osdmap requires"
2437 << " CEPH_FEATURE_SERVER_JEWEL"
2438 << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
2442 if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
&&
2443 !HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
2444 mon
->clog
->info() << "disallowing boot of OSD "
2445 << m
->get_orig_source_inst()
2446 << " because the osdmap requires"
2447 << " CEPH_FEATURE_SERVER_KRAKEN"
2448 << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
2452 if (osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
2453 !(m
->osd_features
& CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
2454 mon
->clog
->info() << "disallowing boot of OSD "
2455 << m
->get_orig_source_inst()
2456 << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
2460 if (osdmap
.test_flag(CEPH_OSDMAP_RECOVERY_DELETES
) &&
2461 !(m
->osd_features
& CEPH_FEATURE_OSD_RECOVERY_DELETES
)) {
2462 mon
->clog
->info() << "disallowing boot of OSD "
2463 << m
->get_orig_source_inst()
2464 << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
2468 if (any_of(osdmap
.get_pools().begin(),
2469 osdmap
.get_pools().end(),
2470 [](const std::pair
<int64_t,pg_pool_t
>& pool
)
2471 { return pool
.second
.use_gmt_hitset
; })) {
2472 assert(osdmap
.get_num_up_osds() == 0 ||
2473 osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
);
2474 if (!(m
->osd_features
& CEPH_FEATURE_OSD_HITSET_GMT
)) {
2475 dout(0) << __func__
<< " one or more pools uses GMT hitsets but osd at "
2476 << m
->get_orig_source_inst()
2477 << " doesn't announce support -- ignore" << dendl
;
2482 // make sure upgrades stop at luminous
2483 if (HAVE_FEATURE(m
->osd_features
, SERVER_M
) &&
2484 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
2485 mon
->clog
->info() << "disallowing boot of post-luminous OSD "
2486 << m
->get_orig_source_inst()
2487 << " because require_osd_release < luminous";
2491 // make sure upgrades stop at jewel
2492 if (HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
) &&
2493 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
2494 mon
->clog
->info() << "disallowing boot of post-jewel OSD "
2495 << m
->get_orig_source_inst()
2496 << " because require_osd_release < jewel";
2500 // make sure upgrades stop at hammer
2501 // * HAMMER_0_94_4 is the required hammer feature
2502 // * MON_METADATA is the first post-hammer feature
2503 if (osdmap
.get_num_up_osds() > 0) {
2504 if ((m
->osd_features
& CEPH_FEATURE_MON_METADATA
) &&
2505 !(osdmap
.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4
)) {
2506 mon
->clog
->info() << "disallowing boot of post-hammer OSD "
2507 << m
->get_orig_source_inst()
2508 << " because one or more up OSDs is pre-hammer v0.94.4";
2511 if (!(m
->osd_features
& CEPH_FEATURE_HAMMER_0_94_4
) &&
2512 (osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_METADATA
)) {
2513 mon
->clog
->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
2514 << m
->get_orig_source_inst()
2515 << " because all up OSDs are post-hammer";
2520 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
2521 // we are reusing a jewel feature bit that was retired in luminous.
2522 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
2523 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
2524 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
2525 mon
->clog
->info() << "disallowing boot of OSD "
2526 << m
->get_orig_source_inst()
2527 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
2532 if (osdmap
.is_up(from
) &&
2533 osdmap
.get_inst(from
) == m
->get_orig_source_inst() &&
2534 osdmap
.get_cluster_addr(from
) == m
->cluster_addr
) {
2536 dout(7) << "preprocess_boot dup from " << m
->get_orig_source_inst()
2537 << " == " << osdmap
.get_inst(from
) << dendl
;
2542 if (osdmap
.exists(from
) &&
2543 !osdmap
.get_uuid(from
).is_zero() &&
2544 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2545 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
2546 << " clashes with existing osd: different fsid"
2547 << " (ours: " << osdmap
.get_uuid(from
)
2548 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
2552 if (osdmap
.exists(from
) &&
2553 osdmap
.get_info(from
).up_from
> m
->version
&&
2554 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) {
2555 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
2556 send_latest(op
, m
->sb
.current_epoch
+1);
2561 if (!can_mark_up(from
)) {
2562 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
2563 send_latest(op
, m
->sb
.current_epoch
+1);
2567 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
2574 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
2576 op
->mark_osdmon_event(__func__
);
2577 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2578 dout(7) << __func__
<< " from " << m
->get_orig_source_inst() << " sb " << m
->sb
2579 << " cluster_addr " << m
->cluster_addr
2580 << " hb_back_addr " << m
->hb_back_addr
2581 << " hb_front_addr " << m
->hb_front_addr
2584 assert(m
->get_orig_source().is_osd());
2585 int from
= m
->get_orig_source().num();
2587 // does this osd exist?
2588 if (from
>= osdmap
.get_max_osd()) {
2589 dout(1) << "boot from osd." << from
<< " >= max_osd "
2590 << osdmap
.get_max_osd() << dendl
;
2594 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
2595 if (pending_inc
.new_state
.count(from
))
2596 oldstate
^= pending_inc
.new_state
[from
];
2598 // already up? mark down first?
2599 if (osdmap
.is_up(from
)) {
2600 dout(7) << __func__
<< " was up, first marking down "
2601 << osdmap
.get_inst(from
) << dendl
;
2602 // preprocess should have caught these; if not, assert.
2603 assert(osdmap
.get_inst(from
) != m
->get_orig_source_inst() ||
2604 osdmap
.get_cluster_addr(from
) != m
->cluster_addr
);
2605 assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
2607 if (pending_inc
.new_state
.count(from
) == 0 ||
2608 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
2609 // mark previous guy down
2610 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
2612 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2613 } else if (pending_inc
.new_up_client
.count(from
)) {
2614 // already prepared, just wait
2615 dout(7) << __func__
<< " already prepared, waiting on "
2616 << m
->get_orig_source_addr() << dendl
;
2617 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2620 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addr();
2621 if (!m
->cluster_addr
.is_blank_ip())
2622 pending_inc
.new_up_cluster
[from
] = m
->cluster_addr
;
2623 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addr
;
2624 if (!m
->hb_front_addr
.is_blank_ip())
2625 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addr
;
2627 down_pending_out
.erase(from
); // if any
2630 osd_weight
[from
] = m
->sb
.weight
;
2633 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
2635 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2636 // preprocess should have caught this; if not, assert.
2637 assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
2638 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
2642 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
2643 const osd_info_t
& i
= osdmap
.get_info(from
);
2644 if (i
.up_from
> i
.lost_at
) {
2645 dout(10) << " fresh osd; marking lost_at too" << dendl
;
2646 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
2651 bufferlist osd_metadata
;
2652 ::encode(m
->metadata
, osd_metadata
);
2653 pending_metadata
[from
] = osd_metadata
;
2654 pending_metadata_rm
.erase(from
);
2656 // adjust last clean unmount epoch?
2657 const osd_info_t
& info
= osdmap
.get_info(from
);
2658 dout(10) << " old osd_info: " << info
<< dendl
;
2659 if (m
->sb
.mounted
> info
.last_clean_begin
||
2660 (m
->sb
.mounted
== info
.last_clean_begin
&&
2661 m
->sb
.clean_thru
> info
.last_clean_end
)) {
2662 epoch_t begin
= m
->sb
.mounted
;
2663 epoch_t end
= m
->sb
.clean_thru
;
2665 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
2666 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
2667 << ") -> [" << begin
<< "-" << end
<< ")"
2669 pending_inc
.new_last_clean_interval
[from
] =
2670 pair
<epoch_t
,epoch_t
>(begin
, end
);
2673 osd_xinfo_t xi
= osdmap
.get_xinfo(from
);
2674 if (m
->boot_epoch
== 0) {
2675 xi
.laggy_probability
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2676 xi
.laggy_interval
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2677 dout(10) << " not laggy, new xi " << xi
<< dendl
;
2679 if (xi
.down_stamp
.sec()) {
2680 int interval
= ceph_clock_now().sec() -
2681 xi
.down_stamp
.sec();
2682 if (g_conf
->mon_osd_laggy_max_interval
&&
2683 (interval
> g_conf
->mon_osd_laggy_max_interval
)) {
2684 interval
= g_conf
->mon_osd_laggy_max_interval
;
2687 interval
* g_conf
->mon_osd_laggy_weight
+
2688 xi
.laggy_interval
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2690 xi
.laggy_probability
=
2691 g_conf
->mon_osd_laggy_weight
+
2692 xi
.laggy_probability
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2693 dout(10) << " laggy, now xi " << xi
<< dendl
;
2696 // set features shared by the osd
2697 if (m
->osd_features
)
2698 xi
.features
= m
->osd_features
;
2700 xi
.features
= m
->get_connection()->get_features();
2703 if ((g_conf
->mon_osd_auto_mark_auto_out_in
&&
2704 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
2705 (g_conf
->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
2706 (g_conf
->mon_osd_auto_mark_in
)) {
2707 if (can_mark_in(from
)) {
2708 if (osdmap
.osd_xinfo
[from
].old_weight
> 0) {
2709 pending_inc
.new_weight
[from
] = osdmap
.osd_xinfo
[from
].old_weight
;
2712 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
2715 dout(7) << __func__
<< " NOIN set, will not mark in "
2716 << m
->get_orig_source_addr() << dendl
;
2720 pending_inc
.new_xinfo
[from
] = xi
;
2723 wait_for_finished_proposal(op
, new C_Booted(this, op
));
2728 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
2730 op
->mark_osdmon_event(__func__
);
2731 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2732 dout(7) << "_booted " << m
->get_orig_source_inst()
2733 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
2736 mon
->clog
->info() << m
->get_orig_source_inst() << " boot";
2739 send_latest(op
, m
->sb
.current_epoch
+1);
2746 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
2748 op
->mark_osdmon_event(__func__
);
2749 MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2750 int from
= m
->get_orig_source().num();
2752 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2754 // check permissions, ignore if failed
2755 MonSession
*session
= m
->get_session();
2758 if (!session
->is_capable("osd", MON_CAP_X
)) {
2759 dout(0) << "MOSDFull from entity with insufficient privileges:"
2760 << session
->caps
<< dendl
;
2764 // ignore a full message from the osd instance that already went down
2765 if (!osdmap
.exists(from
)) {
2766 dout(7) << __func__
<< " ignoring full message from nonexistent "
2767 << m
->get_orig_source_inst() << dendl
;
2770 if ((!osdmap
.is_up(from
) &&
2771 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) ||
2772 (osdmap
.is_up(from
) &&
2773 osdmap
.get_inst(from
) != m
->get_orig_source_inst())) {
2774 dout(7) << __func__
<< " ignoring full message from down "
2775 << m
->get_orig_source_inst() << dendl
;
2779 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
2781 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
2782 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
2783 << " " << m
->get_orig_source_inst() << dendl
;
2784 _reply_map(op
, m
->version
);
2788 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
2789 << " " << m
->get_orig_source_inst() << dendl
;
2796 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
2798 op
->mark_osdmon_event(__func__
);
2799 const MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2800 const int from
= m
->get_orig_source().num();
2802 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2803 const unsigned want_state
= m
->state
& mask
; // safety first
2805 unsigned cur_state
= osdmap
.get_state(from
);
2806 auto p
= pending_inc
.new_state
.find(from
);
2807 if (p
!= pending_inc
.new_state
.end()) {
2808 cur_state
^= p
->second
;
2812 set
<string
> want_state_set
, cur_state_set
;
2813 OSDMap::calc_state_set(want_state
, want_state_set
);
2814 OSDMap::calc_state_set(cur_state
, cur_state_set
);
2816 if (cur_state
!= want_state
) {
2817 if (p
!= pending_inc
.new_state
.end()) {
2820 pending_inc
.new_state
[from
] = 0;
2822 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
2823 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2824 << " -> " << want_state_set
<< dendl
;
2826 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2827 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
2830 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2837 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
2839 op
->mark_osdmon_event(__func__
);
2840 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2841 int from
= m
->get_orig_source().num();
2843 // check permissions, ignore if failed
2844 MonSession
*session
= m
->get_session();
2847 if (!session
->is_capable("osd", MON_CAP_X
)) {
2848 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2849 << session
->caps
<< dendl
;
2853 if (!osdmap
.is_up(from
) ||
2854 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2855 dout(7) << "preprocess_alive ignoring alive message from down " << m
->get_orig_source_inst() << dendl
;
2859 if (osdmap
.get_up_thru(from
) >= m
->want
) {
2861 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
2862 _reply_map(op
, m
->version
);
2866 dout(10) << "preprocess_alive want up_thru " << m
->want
2867 << " from " << m
->get_orig_source_inst() << dendl
;
2874 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
2876 op
->mark_osdmon_event(__func__
);
2877 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2878 int from
= m
->get_orig_source().num();
2880 if (0) { // we probably don't care much about these
2881 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
2884 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
2885 << " from " << m
->get_orig_source_inst() << dendl
;
2887 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
2888 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2892 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
2894 op
->mark_osdmon_event(__func__
);
2895 dout(7) << "_reply_map " << e
2896 << " from " << op
->get_req()->get_orig_source_inst()
2902 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
2904 op
->mark_osdmon_event(__func__
);
2905 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2906 dout(10) << __func__
<< " " << *m
<< dendl
;
2907 auto session
= m
->get_session();
2910 dout(10) << __func__
<< ": no monitor session!" << dendl
;
2913 if (!session
->is_capable("osd", MON_CAP_X
)) {
2914 derr
<< __func__
<< " received from entity "
2915 << "with insufficient privileges " << session
->caps
<< dendl
;
2918 // always forward the "created!" to the leader
2922 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
2924 op
->mark_osdmon_event(__func__
);
2925 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2926 dout(10) << __func__
<< " " << *m
<< dendl
;
2927 auto src
= m
->get_orig_source();
2928 auto from
= src
.num();
2929 if (!src
.is_osd() ||
2930 !mon
->osdmon()->osdmap
.is_up(from
) ||
2931 m
->get_orig_source_inst() != mon
->osdmon()->osdmap
.get_inst(from
)) {
2932 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
2935 pending_created_pgs
.push_back(m
->pgid
);
2942 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
2944 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2945 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
2946 mempool::osdmap::vector
<int> empty
;
2947 int from
= m
->get_orig_source().num();
2948 size_t ignore_cnt
= 0;
2951 MonSession
*session
= m
->get_session();
2954 if (!session
->is_capable("osd", MON_CAP_X
)) {
2955 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
2956 << session
->caps
<< dendl
;
2960 if (!osdmap
.is_up(from
) ||
2961 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2962 dout(7) << "ignoring pgtemp message from down " << m
->get_orig_source_inst() << dendl
;
2970 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2971 dout(20) << " " << p
->first
2972 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
2973 << " -> " << p
->second
<< dendl
;
2975 // does the pool exist?
2976 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
2978 * 1. If the osdmap does not have the pool, it means the pool has been
2979 * removed in-between the osd sending this message and us handling it.
2980 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
2981 * not exist in the pending either, as the osds would not send a
2982 * message about a pool they know nothing about (yet).
2983 * 3. However, if the pool does exist in the pending, then it must be a
2984 * new pool, and not relevant to this message (see 1).
2986 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2987 << ": pool has been removed" << dendl
;
2992 int acting_primary
= -1;
2993 osdmap
.pg_to_up_acting_osds(
2994 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
2995 if (acting_primary
!= from
) {
2996 /* If the source isn't the primary based on the current osdmap, we know
2997 * that the interval changed and that we can discard this message.
2998 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
2999 * which of two pg temp mappings on the same pg is more recent.
3001 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3002 << ": primary has changed" << dendl
;
3008 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
3009 osdmap
.primary_temp
->count(p
->first
)))
3012 // NOTE: we assume that this will clear pg_primary, so consider
3013 // an existing pg_primary field to imply a change
3014 if (p
->second
.size() &&
3015 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
3016 !vectors_equal(osdmap
.pg_temp
->get(p
->first
), p
->second
) ||
3017 osdmap
.primary_temp
->count(p
->first
)))
3021 // should we ignore all the pgs?
3022 if (ignore_cnt
== m
->pg_temp
.size())
3025 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
3026 _reply_map(op
, m
->map_epoch
);
3033 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
3035 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
3036 auto ut
= pending_inc
.new_up_thru
.find(from
);
3037 if (ut
!= pending_inc
.new_up_thru
.end()) {
3038 old_up_thru
= ut
->second
;
3040 if (up_thru
> old_up_thru
) {
3041 // set up_thru too, so the osd doesn't have to ask again
3042 pending_inc
.new_up_thru
[from
] = up_thru
;
3046 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
3048 op
->mark_osdmon_event(__func__
);
3049 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
3050 int from
= m
->get_orig_source().num();
3051 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
3052 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3053 uint64_t pool
= p
->first
.pool();
3054 if (pending_inc
.old_pools
.count(pool
)) {
3055 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3056 << ": pool pending removal" << dendl
;
3059 if (!osdmap
.have_pg_pool(pool
)) {
3060 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3061 << ": pool has been removed" << dendl
;
3064 pending_inc
.new_pg_temp
[p
->first
] =
3065 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
3067 // unconditionally clear pg_primary (until this message can encode
3068 // a change for that, too.. at which point we need to also fix
3069 // preprocess_pg_temp)
3070 if (osdmap
.primary_temp
->count(p
->first
) ||
3071 pending_inc
.new_primary_temp
.count(p
->first
))
3072 pending_inc
.new_primary_temp
[p
->first
] = -1;
3075 // set up_thru too, so the osd doesn't have to ask again
3076 update_up_thru(from
, m
->map_epoch
);
3078 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
3085 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
3087 op
->mark_osdmon_event(__func__
);
3088 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
3089 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
3091 // check privilege, ignore if failed
3092 MonSession
*session
= m
->get_session();
3096 if (!session
->caps
.is_capable(
3098 CEPH_ENTITY_TYPE_MON
,
3099 session
->entity_name
,
3100 "osd", "osd pool rmsnap", {}, true, true, false)) {
3101 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
3102 << session
->caps
<< dendl
;
3106 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
3107 q
!= m
->snaps
.end();
3109 if (!osdmap
.have_pg_pool(q
->first
)) {
3110 dout(10) << " ignoring removed_snaps " << q
->second
<< " on non-existent pool " << q
->first
<< dendl
;
3113 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
3114 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
3115 p
!= q
->second
.end();
3117 if (*p
> pi
->get_snap_seq() ||
3118 !pi
->removed_snaps
.contains(*p
))
3127 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
3129 op
->mark_osdmon_event(__func__
);
3130 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
3131 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
3133 for (map
<int, vector
<snapid_t
> >::iterator p
= m
->snaps
.begin();
3134 p
!= m
->snaps
.end();
3137 if (!osdmap
.have_pg_pool(p
->first
)) {
3138 dout(10) << " ignoring removed_snaps " << p
->second
<< " on non-existent pool " << p
->first
<< dendl
;
3142 pg_pool_t
& pi
= osdmap
.pools
[p
->first
];
3143 for (vector
<snapid_t
>::iterator q
= p
->second
.begin();
3144 q
!= p
->second
.end();
3146 if (!pi
.removed_snaps
.contains(*q
) &&
3147 (!pending_inc
.new_pools
.count(p
->first
) ||
3148 !pending_inc
.new_pools
[p
->first
].removed_snaps
.contains(*q
))) {
3149 pg_pool_t
*newpi
= pending_inc
.get_new_pool(p
->first
, &pi
);
3150 newpi
->removed_snaps
.insert(*q
);
3151 dout(10) << " pool " << p
->first
<< " removed_snaps added " << *q
3152 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
3153 if (*q
> newpi
->get_snap_seq()) {
3154 dout(10) << " pool " << p
->first
<< " snap_seq " << newpi
->get_snap_seq() << " -> " << *q
<< dendl
;
3155 newpi
->set_snap_seq(*q
);
3157 newpi
->set_snap_epoch(pending_inc
.epoch
);
3165 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
3167 op
->mark_osdmon_event(__func__
);
3168 auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
3170 auto session
= beacon
->get_session();
3173 dout(10) << __func__
<< " no monitor session!" << dendl
;
3176 if (!session
->is_capable("osd", MON_CAP_X
)) {
3177 derr
<< __func__
<< " received from entity "
3178 << "with insufficient privileges " << session
->caps
<< dendl
;
3181 // Always forward the beacon to the leader, even if they are the same as
3182 // the old one. The leader will mark as down osds that haven't sent
3183 // beacon for a few minutes.
3187 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
3189 op
->mark_osdmon_event(__func__
);
3190 const auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
3191 const auto src
= beacon
->get_orig_source();
3192 dout(10) << __func__
<< " " << *beacon
3193 << " from " << src
<< dendl
;
3194 int from
= src
.num();
3196 if (!src
.is_osd() ||
3197 !osdmap
.is_up(from
) ||
3198 beacon
->get_orig_source_inst() != osdmap
.get_inst(from
)) {
3199 dout(1) << " ignoring beacon from non-active osd." << dendl
;
3203 last_osd_report
[from
] = ceph_clock_now();
3204 osd_epochs
[from
] = beacon
->version
;
3206 for (const auto& pg
: beacon
->pgs
) {
3207 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
3215 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
3217 op
->mark_osdmon_event(__func__
);
3218 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
3219 << " start " << start
<< dendl
;
3223 send_incremental(op
, start
);
3227 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
3229 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
, features
);
3230 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
3231 r
->oldest_map
= get_first_committed();
3232 r
->newest_map
= osdmap
.get_epoch();
3236 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
3238 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features " << std::hex
<< features
<< dendl
;
3239 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
, features
);
3240 m
->oldest_map
= get_first_committed();
3241 m
->newest_map
= osdmap
.get_epoch();
3243 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
3245 int err
= get_version(e
, features
, bl
);
3247 assert(bl
.length());
3248 // if (get_version(e, bl) > 0) {
3249 dout(20) << "build_incremental inc " << e
<< " "
3250 << bl
.length() << " bytes" << dendl
;
3251 m
->incremental_maps
[e
] = bl
;
3253 assert(err
== -ENOENT
);
3254 assert(!bl
.length());
3255 get_version_full(e
, features
, bl
);
3256 if (bl
.length() > 0) {
3257 //else if (get_version("full", e, bl) > 0) {
3258 dout(20) << "build_incremental full " << e
<< " "
3259 << bl
.length() << " bytes" << dendl
;
3262 ceph_abort(); // we should have all maps.
3269 void OSDMonitor::send_full(MonOpRequestRef op
)
3271 op
->mark_osdmon_event(__func__
);
3272 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
3273 mon
->send_reply(op
, build_latest_full(op
->get_session()->con_features
));
3276 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
3278 op
->mark_osdmon_event(__func__
);
3280 MonSession
*s
= op
->get_session();
3284 s
->proxy_con
->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP
)) {
3285 // oh, we can tell the other mon to do it
3286 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
3288 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
3289 r
->send_osdmap_first
= first
;
3290 s
->proxy_con
->send_message(r
);
3291 op
->mark_event("reply: send routed send_osdmap_first reply");
3294 send_incremental(first
, s
, false, op
);
3298 void OSDMonitor::send_incremental(epoch_t first
,
3299 MonSession
*session
,
3301 MonOpRequestRef req
)
3303 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
3304 << " to " << session
->inst
<< dendl
;
3306 // get feature of the peer
3307 // use quorum_con_features, if it's an anonymous connection.
3308 uint64_t features
= session
->con_features
? session
->con_features
:
3309 mon
->get_quorum_con_features();
3311 if (first
<= session
->osd_epoch
) {
3312 dout(10) << __func__
<< " " << session
->inst
<< " should already have epoch "
3313 << session
->osd_epoch
<< dendl
;
3314 first
= session
->osd_epoch
+ 1;
3317 if (first
< get_first_committed()) {
3318 first
= get_first_committed();
3320 int err
= get_version_full(first
, features
, bl
);
3322 assert(bl
.length());
3324 dout(20) << "send_incremental starting with base full "
3325 << first
<< " " << bl
.length() << " bytes" << dendl
;
3327 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
3328 m
->oldest_map
= get_first_committed();
3329 m
->newest_map
= osdmap
.get_epoch();
3330 m
->maps
[first
] = bl
;
3333 mon
->send_reply(req
, m
);
3334 session
->osd_epoch
= first
;
3337 session
->con
->send_message(m
);
3338 session
->osd_epoch
= first
;
3343 while (first
<= osdmap
.get_epoch()) {
3344 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf
->osd_map_message_max
- 1,
3345 osdmap
.get_epoch());
3346 MOSDMap
*m
= build_incremental(first
, last
, features
);
3349 // send some maps. it may not be all of them, but it will get them
3351 mon
->send_reply(req
, m
);
3353 session
->con
->send_message(m
);
3356 session
->osd_epoch
= last
;
3362 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
3364 return get_version(ver
, mon
->get_quorum_con_features(), bl
);
3367 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
3369 OSDMap::Incremental inc
;
3370 bufferlist::iterator q
= bl
.begin();
3372 // always encode with subset of osdmap's canonical features
3373 uint64_t f
= features
& inc
.encode_features
;
3374 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
3377 if (inc
.fullmap
.length()) {
3378 // embedded full map?
3380 m
.decode(inc
.fullmap
);
3381 inc
.fullmap
.clear();
3382 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
3384 if (inc
.crush
.length()) {
3385 // embedded crush map
3387 auto p
= inc
.crush
.begin();
3390 c
.encode(inc
.crush
, f
);
3392 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
3395 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
3398 bufferlist::iterator q
= bl
.begin();
3400 // always encode with subset of osdmap's canonical features
3401 uint64_t f
= features
& m
.get_encoding_features();
3402 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
3405 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
3408 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
3410 uint64_t significant_features
= OSDMap::get_significant_features(features
);
3411 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
3414 int ret
= PaxosService::get_version(ver
, bl
);
3418 // NOTE: this check is imprecise; the OSDMap encoding features may
3419 // be a subset of the latest mon quorum features, but worst case we
3420 // reencode once and then cache the (identical) result under both
3422 if (significant_features
!=
3423 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
3424 reencode_incremental_map(bl
, features
);
3426 inc_osd_cache
.add({ver
, significant_features
}, bl
);
3430 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
3432 return get_version_full(ver
, mon
->get_quorum_con_features(), bl
);
3435 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
3438 uint64_t significant_features
= OSDMap::get_significant_features(features
);
3439 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
3442 int ret
= PaxosService::get_version_full(ver
, bl
);
3446 // NOTE: this check is imprecise; the OSDMap encoding features may
3447 // be a subset of the latest mon quorum features, but worst case we
3448 // reencode once and then cache the (identical) result under both
3450 if (significant_features
!=
3451 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
3452 reencode_full_map(bl
, features
);
3454 full_osd_cache
.add({ver
, significant_features
}, bl
);
3458 epoch_t
OSDMonitor::blacklist(const entity_addr_t
& a
, utime_t until
)
3460 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
3461 pending_inc
.new_blacklist
[a
] = until
;
3462 return pending_inc
.epoch
;
3466 void OSDMonitor::check_osdmap_subs()
3468 dout(10) << __func__
<< dendl
;
3469 if (!osdmap
.get_epoch()) {
3472 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
3473 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
3476 auto p
= osdmap_subs
->second
->begin();
3480 check_osdmap_sub(sub
);
3484 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
3486 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
3487 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
3488 if (sub
->next
<= osdmap
.get_epoch()) {
3490 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
3492 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
3494 mon
->session_map
.remove_sub(sub
);
3496 sub
->next
= osdmap
.get_epoch() + 1;
3500 void OSDMonitor::check_pg_creates_subs()
3502 if (!mon
->monmap
->get_required_features().contains_all(
3503 ceph::features::mon::FEATURE_LUMINOUS
)) {
3504 // PGMonitor takes care of this in pre-luminous era.
3507 if (!osdmap
.get_num_up_osds()) {
3510 assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
3511 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
3512 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
3513 if (pg_creates_subs
== session_map
.subs
.end()) {
3516 for (auto sub
: *pg_creates_subs
->second
) {
3517 check_pg_creates_sub(sub
);
3522 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
3524 dout(20) << __func__
<< " .. " << sub
->session
->inst
<< dendl
;
3525 assert(sub
->type
== "osd_pg_creates");
3526 // only send these if the OSD is up. we will check_subs() when they do
3527 // come up so they will get the creates then.
3528 if (sub
->session
->inst
.name
.is_osd() &&
3529 mon
->osdmon()->osdmap
.is_up(sub
->session
->inst
.name
.num())) {
3530 sub
->next
= send_pg_creates(sub
->session
->inst
.name
.num(),
3531 sub
->session
->con
.get(),
3536 void OSDMonitor::do_application_enable(int64_t pool_id
,
3537 const std::string
&app_name
)
3539 assert(paxos
->is_plugged() && is_writeable());
3541 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
3544 assert(osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
||
3545 pending_inc
.new_require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
3547 auto pp
= osdmap
.get_pg_pool(pool_id
);
3548 assert(pp
!= nullptr);
3551 if (pending_inc
.new_pools
.count(pool_id
)) {
3552 p
= pending_inc
.new_pools
[pool_id
];
3555 p
.application_metadata
.insert({app_name
, {}});
3556 p
.last_change
= pending_inc
.epoch
;
3557 pending_inc
.new_pools
[pool_id
] = p
;
3560 unsigned OSDMonitor::scan_for_creating_pgs(
3561 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
3562 const mempool::osdmap::set
<int64_t>& removed_pools
,
3564 creating_pgs_t
* creating_pgs
) const
3566 unsigned queued
= 0;
3567 for (auto& p
: pools
) {
3568 int64_t poolid
= p
.first
;
3569 const pg_pool_t
& pool
= p
.second
;
3570 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
3571 pool
.get_type(), pool
.get_size());
3572 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
3575 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
3576 const auto created
= pool
.get_last_change();
3577 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
3578 dout(10) << __func__
<< " no change in pool " << poolid
3579 << " " << pool
<< dendl
;
3582 if (removed_pools
.count(poolid
)) {
3583 dout(10) << __func__
<< " pool is being removed: " << poolid
3584 << " " << pool
<< dendl
;
3587 dout(10) << __func__
<< " queueing pool create for " << poolid
3588 << " " << pool
<< dendl
;
3589 if (creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
3590 created
, modified
)) {
3597 void OSDMonitor::update_creating_pgs()
3599 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
3600 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
3601 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
3602 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3603 for (const auto& pg
: creating_pgs
.pgs
) {
3604 int acting_primary
= -1;
3605 auto pgid
= pg
.first
;
3606 if (!osdmap
.pg_exists(pgid
)) {
3607 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
3611 auto mapped
= pg
.second
.first
;
3612 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
3613 mapping
.get(pgid
, nullptr, nullptr, nullptr, &acting_primary
);
3614 // check the previous creating_pgs, look for the target to whom the pg was
3615 // previously mapped
3616 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
3617 const auto last_acting_primary
= pgs_by_epoch
.first
;
3618 for (auto& pgs
: pgs_by_epoch
.second
) {
3619 if (pgs
.second
.count(pgid
)) {
3620 if (last_acting_primary
== acting_primary
) {
3623 dout(20) << __func__
<< " " << pgid
<< " "
3624 << " acting_primary:" << last_acting_primary
3625 << " -> " << acting_primary
<< dendl
;
3626 // note epoch if the target of the create message changed.
3627 mapped
= mapping
.get_epoch();
3632 mapped
= mapping
.get_epoch();
3636 dout(10) << __func__
<< " will instruct osd." << acting_primary
3637 << " to create " << pgid
<< "@" << mapped
<< dendl
;
3638 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(pgid
);
3640 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
3641 creating_pgs_epoch
= mapping
.get_epoch();
3644 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
3646 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
3647 << " " << creating_pgs_by_osd_epoch
<< dendl
;
3648 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3649 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
3650 dout(20) << __func__
3651 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
3652 // the subscribers will be updated when the mapping is completed anyway
3655 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
3656 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
3658 assert(!creating_pgs_by_epoch
->second
.empty());
3660 MOSDPGCreate
*m
= nullptr;
3662 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
3663 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
3664 auto epoch
= epoch_pgs
->first
;
3665 auto& pgs
= epoch_pgs
->second
;
3666 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3667 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
3669 for (auto& pg
: pgs
) {
3671 m
= new MOSDPGCreate(creating_pgs_epoch
);
3672 // Need the create time from the monitor using its clock to set
3673 // last_scrub_stamp upon pg creation.
3674 auto create
= creating_pgs
.pgs
.find(pg
);
3675 assert(create
!= creating_pgs
.pgs
.end());
3676 m
->mkpg
.emplace(pg
, pg_create_t
{create
->second
.first
, pg
, 0});
3677 m
->ctimes
.emplace(pg
, create
->second
.second
);
3678 dout(20) << __func__
<< " will create " << pg
3679 << " at " << create
->second
.first
<< dendl
;
3683 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3684 << " has nothing to send" << dendl
;
3687 con
->send_message(m
);
3688 // sub is current through last + 1
3695 void OSDMonitor::tick()
3697 if (!is_active()) return;
3699 dout(10) << osdmap
<< dendl
;
3701 if (!mon
->is_leader()) return;
3703 bool do_propose
= false;
3704 utime_t now
= ceph_clock_now();
3706 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3707 mon
->monmap
->get_required_features().contains_all(
3708 ceph::features::mon::FEATURE_LUMINOUS
)) {
3709 if (handle_osd_timeouts(now
, last_osd_report
)) {
3713 if (!osdmap
.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS
) &&
3714 osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3715 mon
->mgrstatmon()->is_readable() &&
3716 mon
->mgrstatmon()->definitely_converted_snapsets()) {
3717 dout(1) << __func__
<< " all snapsets converted, setting purged_snapdirs"
3719 add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS
);
3724 if (check_failures(now
))
3727 // mark down osds out?
3729 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
3730 * influence at all. The decision is made based on the ratio of "in" osds,
3731 * and the function returns false if this ratio is lower that the minimum
3732 * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
3734 if (can_mark_out(-1)) {
3735 set
<int> down_cache
; // quick cache of down subtrees
3737 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
3738 while (i
!= down_pending_out
.end()) {
3744 if (osdmap
.is_down(o
) &&
3747 utime_t
orig_grace(g_conf
->mon_osd_down_out_interval
, 0);
3748 utime_t grace
= orig_grace
;
3749 double my_grace
= 0.0;
3751 if (g_conf
->mon_osd_adjust_down_out_interval
) {
3752 // scale grace period the same way we do the heartbeat grace.
3753 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
3754 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
3755 double decay_k
= ::log(.5) / halflife
;
3756 double decay
= exp((double)down
* decay_k
);
3757 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
3758 << " down for " << down
<< " decay " << decay
<< dendl
;
3759 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3763 // is this an entire large subtree down?
3764 if (g_conf
->mon_osd_down_out_subtree_limit
.length()) {
3765 int type
= osdmap
.crush
->get_type_id(g_conf
->mon_osd_down_out_subtree_limit
);
3767 if (osdmap
.containing_subtree_is_down(g_ceph_context
, o
, type
, &down_cache
)) {
3768 dout(10) << "tick entire containing " << g_conf
->mon_osd_down_out_subtree_limit
3769 << " subtree for osd." << o
<< " is down; resetting timer" << dendl
;
3770 // reset timer, too.
3771 down_pending_out
[o
] = now
;
3777 bool down_out
= !osdmap
.is_destroyed(o
) &&
3778 g_conf
->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
3779 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
3780 g_conf
->mon_osd_destroyed_out_interval
> 0 &&
3781 // this is not precise enough as we did not make a note when this osd
3782 // was marked as destroyed, but let's not bother with that
3783 // complexity for now.
3784 down
.sec() >= g_conf
->mon_osd_destroyed_out_interval
;
3785 if (down_out
|| destroyed_out
) {
3786 dout(10) << "tick marking osd." << o
<< " OUT after " << down
3787 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
3788 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
3790 // set the AUTOOUT bit.
3791 if (pending_inc
.new_state
.count(o
) == 0)
3792 pending_inc
.new_state
[o
] = 0;
3793 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
3795 // remember previous weight
3796 if (pending_inc
.new_xinfo
.count(o
) == 0)
3797 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
3798 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
3802 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
3803 << int(down
.sec()) << " seconds)";
3808 down_pending_out
.erase(o
);
3811 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
3814 // expire blacklisted items?
3815 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
3816 p
!= osdmap
.blacklist
.end();
3818 if (p
->second
< now
) {
3819 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
3820 pending_inc
.old_blacklist
.push_back(p
->first
);
3825 // if map full setting has changed, get that info out there!
3826 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
&&
3827 mon
->pgservice
->is_readable()) {
3828 // for pre-luminous compat only!
3829 if (mon
->pgservice
->have_full_osds()) {
3830 dout(5) << "There are full osds, setting full flag" << dendl
;
3831 add_flag(CEPH_OSDMAP_FULL
);
3832 } else if (osdmap
.test_flag(CEPH_OSDMAP_FULL
)){
3833 dout(10) << "No full osds, removing full flag" << dendl
;
3834 remove_flag(CEPH_OSDMAP_FULL
);
3837 if (mon
->pgservice
->have_nearfull_osds()) {
3838 dout(5) << "There are near full osds, setting nearfull flag" << dendl
;
3839 add_flag(CEPH_OSDMAP_NEARFULL
);
3840 } else if (osdmap
.test_flag(CEPH_OSDMAP_NEARFULL
)){
3841 dout(10) << "No near full osds, removing nearfull flag" << dendl
;
3842 remove_flag(CEPH_OSDMAP_NEARFULL
);
3844 if (pending_inc
.new_flags
!= -1 &&
3845 (pending_inc
.new_flags
^ osdmap
.flags
) & (CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
3846 dout(1) << "New setting for" <<
3847 (pending_inc
.new_flags
& CEPH_OSDMAP_FULL
? " CEPH_OSDMAP_FULL" : "") <<
3848 (pending_inc
.new_flags
& CEPH_OSDMAP_NEARFULL
? " CEPH_OSDMAP_NEARFULL" : "")
3849 << " -- doing propose" << dendl
;
3854 if (update_pools_status())
3858 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
3862 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
3863 std::map
<int,utime_t
> &last_osd_report
)
3865 utime_t
timeo(g_conf
->mon_osd_report_timeout
, 0);
3866 if (now
- mon
->get_leader_since() < timeo
) {
3867 // We haven't been the leader for long enough to consider OSD timeouts
3871 int max_osd
= osdmap
.get_max_osd();
3872 bool new_down
= false;
3874 for (int i
=0; i
< max_osd
; ++i
) {
3875 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
3876 if (!osdmap
.exists(i
)) {
3877 last_osd_report
.erase(i
); // if any
3880 if (!osdmap
.is_up(i
))
3882 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
3883 if (t
== last_osd_report
.end()) {
3884 // it wasn't in the map; start the timer.
3885 last_osd_report
[i
] = now
;
3886 } else if (can_mark_down(i
)) {
3887 utime_t diff
= now
- t
->second
;
3889 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
3890 << diff
<< " seconds";
3891 derr
<< "no beacon from osd." << i
<< " since " << t
->second
3892 << ", " << diff
<< " seconds ago. marking down" << dendl
;
3893 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
3901 void OSDMonitor::get_health(list
<pair
<health_status_t
,string
> >& summary
,
3902 list
<pair
<health_status_t
,string
> > *detail
,
3903 CephContext
*cct
) const
3905 int num_osds
= osdmap
.get_num_osds();
3907 if (num_osds
== 0) {
3908 summary
.push_back(make_pair(HEALTH_ERR
, "no osds"));
3910 int num_in_osds
= 0;
3911 int num_down_in_osds
= 0;
3913 set
<int> down_in_osds
;
3914 set
<int> up_in_osds
;
3915 set
<int> subtree_up
;
3916 unordered_map
<int, set
<int> > subtree_type_down
;
3917 unordered_map
<int, int> num_osds_subtree
;
3918 int max_type
= osdmap
.crush
->get_max_type_id();
3920 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3921 if (!osdmap
.exists(i
)) {
3922 if (osdmap
.crush
->item_exists(i
)) {
3927 if (osdmap
.is_out(i
))
3930 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
3932 if (!osdmap
.is_up(i
)) {
3933 down_in_osds
.insert(i
);
3936 for (int type
= 0; type
<= max_type
; type
++) {
3937 if (!osdmap
.crush
->get_type_name(type
))
3939 int r
= osdmap
.crush
->get_immediate_parent_id(current
, &parent_id
);
3942 // break early if this parent is already marked as up
3943 if (subtree_up
.count(parent_id
))
3945 type
= osdmap
.crush
->get_bucket_type(parent_id
);
3946 if (!osdmap
.subtree_type_is_down(
3947 g_ceph_context
, parent_id
, type
,
3948 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
3950 current
= parent_id
;
3955 // calculate the number of down osds in each down subtree and
3956 // store it in num_osds_subtree
3957 for (int type
= 1; type
<= max_type
; type
++) {
3958 if (!osdmap
.crush
->get_type_name(type
))
3960 for (auto j
= subtree_type_down
[type
].begin();
3961 j
!= subtree_type_down
[type
].end();
3965 int num
= osdmap
.crush
->get_children(*j
, &children
);
3966 num_osds_subtree
[*j
] = num
;
3970 int num_children
= osdmap
.crush
->get_children(*j
, &children
);
3971 if (num_children
== 0)
3973 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
3974 if (num_osds_subtree
[*l
] > 0) {
3975 num
= num
+ num_osds_subtree
[*l
];
3978 num_osds_subtree
[*j
] = num
;
3982 num_down_in_osds
= down_in_osds
.size();
3983 assert(num_down_in_osds
<= num_in_osds
);
3984 if (num_down_in_osds
> 0) {
3985 // summary of down subtree types and osds
3986 for (int type
= max_type
; type
> 0; type
--) {
3987 if (!osdmap
.crush
->get_type_name(type
))
3989 if (subtree_type_down
[type
].size() > 0) {
3991 ss
<< subtree_type_down
[type
].size() << " "
3992 << osdmap
.crush
->get_type_name(type
);
3993 if (subtree_type_down
[type
].size() > 1) {
3996 int sum_down_osds
= 0;
3997 for (auto j
= subtree_type_down
[type
].begin();
3998 j
!= subtree_type_down
[type
].end();
4000 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
4002 ss
<< " (" << sum_down_osds
<< " osds) down";
4003 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4007 ss
<< down_in_osds
.size() << " osds down";
4008 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4011 // details of down subtree types
4012 for (int type
= max_type
; type
> 0; type
--) {
4013 if (!osdmap
.crush
->get_type_name(type
))
4015 for (auto j
= subtree_type_down
[type
].rbegin();
4016 j
!= subtree_type_down
[type
].rend();
4019 ss
<< osdmap
.crush
->get_type_name(type
);
4021 ss
<< osdmap
.crush
->get_item_name(*j
);
4022 // at the top level, do not print location
4023 if (type
!= max_type
) {
4025 ss
<< osdmap
.crush
->get_full_location_ordered_string(*j
);
4028 int num
= num_osds_subtree
[*j
];
4029 ss
<< " (" << num
<< " osds)";
4031 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4034 // details of down osds
4035 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
4037 ss
<< "osd." << *it
<< " (";
4038 ss
<< osdmap
.crush
->get_full_location_ordered_string(*it
);
4040 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4045 if (!osds
.empty()) {
4047 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
4048 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4050 ss
<< " (osds: " << osds
<< ")";
4051 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4055 // note: we leave it to ceph-mgr to generate details health warnings
4056 // with actual osd utilizations
4059 uint64_t warn_flags
=
4061 CEPH_OSDMAP_PAUSERD
|
4062 CEPH_OSDMAP_PAUSEWR
|
4063 CEPH_OSDMAP_PAUSEREC
|
4065 CEPH_OSDMAP_NODOWN
|
4068 CEPH_OSDMAP_NOBACKFILL
|
4069 CEPH_OSDMAP_NORECOVER
|
4070 CEPH_OSDMAP_NOSCRUB
|
4071 CEPH_OSDMAP_NODEEP_SCRUB
|
4072 CEPH_OSDMAP_NOTIERAGENT
|
4073 CEPH_OSDMAP_NOREBALANCE
;
4074 if (osdmap
.test_flag(warn_flags
)) {
4076 ss
<< osdmap
.get_flag_string(osdmap
.get_flags() & warn_flags
)
4078 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4080 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4083 // old crush tunables?
4084 if (g_conf
->mon_warn_on_legacy_crush_tunables
) {
4085 string min
= osdmap
.crush
->get_min_required_version();
4086 if (min
< g_conf
->mon_crush_min_required_version
) {
4088 ss
<< "crush map has legacy tunables (require " << min
4089 << ", min is " << g_conf
->mon_crush_min_required_version
<< ")";
4090 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4092 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
4093 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4097 if (g_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
4098 if (osdmap
.crush
->get_straw_calc_version() == 0) {
4100 ss
<< "crush map has straw_calc_version=0";
4101 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4103 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
4104 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4109 // hit_set-less cache_mode?
4110 if (g_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
4111 int problem_cache_pools
= 0;
4112 for (map
<int64_t, pg_pool_t
>::const_iterator p
= osdmap
.pools
.begin();
4113 p
!= osdmap
.pools
.end();
4115 const pg_pool_t
& info
= p
->second
;
4116 if (info
.cache_mode_requires_hit_set() &&
4117 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
4118 ++problem_cache_pools
;
4121 ss
<< "pool '" << osdmap
.get_pool_name(p
->first
)
4122 << "' with cache_mode " << info
.get_cache_mode_name()
4123 << " needs hit_set_type to be set but it is not";
4124 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4128 if (problem_cache_pools
) {
4130 ss
<< problem_cache_pools
<< " cache pools are missing hit_sets";
4131 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4135 // Not using 'sortbitwise' and should be?
4136 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
4137 (osdmap
.get_up_osd_features() &
4138 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
4140 ss
<< "no legacy OSD present but 'sortbitwise' flag is not set";
4141 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4144 // Warn if 'mon_osd_down_out_interval' is set to zero.
4145 // Having this option set to zero on the leader acts much like the
4146 // 'noout' flag. It's hard to figure out what's going wrong with clusters
4147 // without the 'noout' flag set but acting like that just the same, so
4148 // we report a HEALTH_WARN in case this option is set to zero.
4149 // This is an ugly hack to get the warning out, but until we find a way
4150 // to spread global options throughout the mon cluster and have all mons
4151 // using a base set of the same options, we need to work around this sort
4153 // There's also the obvious drawback that if this is set on a single
4154 // monitor on a 3-monitor cluster, this warning will only be shown every
4155 // third monitor connection.
4156 if (g_conf
->mon_warn_on_osd_down_out_interval_zero
&&
4157 g_conf
->mon_osd_down_out_interval
== 0) {
4159 ss
<< "mon." << mon
->name
<< " has mon_osd_down_out_interval set to 0";
4160 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4162 ss
<< "; this has the same effect as the 'noout' flag";
4163 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4167 // warn about upgrade flags that can be set but are not.
4168 if (g_conf
->mon_debug_no_require_luminous
) {
4169 // ignore these checks
4170 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
) &&
4171 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
4172 string msg
= "all OSDs are running luminous or later but"
4173 " require_osd_release < luminous";
4174 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
4176 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
4178 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
) &&
4179 osdmap
.require_osd_release
< CEPH_RELEASE_KRAKEN
) {
4180 string msg
= "all OSDs are running kraken or later but"
4181 " require_osd_release < kraken";
4182 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
4184 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
4186 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
) &&
4187 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
4188 string msg
= "all OSDs are running jewel or later but"
4189 " require_osd_release < jewel";
4190 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
4192 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
4196 for (auto it
: osdmap
.get_pools()) {
4197 const pg_pool_t
&pool
= it
.second
;
4198 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
4199 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
4201 ss
<< "pool '" << pool_name
<< "' is full";
4202 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4204 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4210 void OSDMonitor::dump_info(Formatter
*f
)
4212 f
->open_object_section("osdmap");
4216 f
->open_array_section("osd_metadata");
4217 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4218 if (osdmap
.exists(i
)) {
4219 f
->open_object_section("osd");
4220 f
->dump_unsigned("id", i
);
4221 dump_osd_metadata(i
, f
, NULL
);
4227 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
4228 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
4230 f
->open_object_section("crushmap");
4231 osdmap
.crush
->dump(f
);
// Every per-pool attribute that "ceph osd pool get <pool> <var>" can query.
// preprocess_command() maps the command-line variable names onto these
// enumerators (ALL_CHOICES) and then switches on them to emit the value.
//
// Deliberately a plain (non-class) enum: the command handler uses the bare
// enumerator names in switch cases and stores them in
// std::set<osd_pool_get_choices>, and some entries are grouped into
// tier-only / erasure-only subsets by value.
enum osd_pool_get_choices {
  SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
  PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
  NODELETE, NOPGCHANGE, NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
  HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
  USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
  ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
  HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
  SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
  COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };
// Set difference helper: return the elements of `first` that are not in
// `second` (i.e. first \ second). Used by "osd pool get" to strip the
// tier-only / erasure-only choices from the full choice set when the pool
// does not qualify for them.
//
// Generalized from std::set<osd_pool_get_choices> to any element type T;
// existing call sites deduce the old type unchanged.
//
// @param first   minuend set (unmodified)
// @param second  subtrahend set (unmodified)
// @return        new set containing first \ second
//
// NOTE(review): the original's trailing "return result; }" was lost in
// extraction and is restored here — the std::inserter call makes the
// intended result unambiguous.
template<typename T>
std::set<T> subtract_second_from_first(const std::set<T>& first,
				       const std::set<T>& second)
{
  std::set<T> result;
  // both inputs are ordered, so a single linear merge pass suffices
  std::set_difference(first.begin(), first.end(),
		      second.begin(), second.end(),
		      std::inserter(result, result.end()));
  return result;
}
4268 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
4270 op
->mark_osdmon_event(__func__
);
4271 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
4274 stringstream ss
, ds
;
4276 map
<string
, cmd_vartype
> cmdmap
;
4277 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
4278 string rs
= ss
.str();
4279 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
4283 MonSession
*session
= m
->get_session();
4285 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
4290 cmd_getval_throws(g_ceph_context
, cmdmap
, "prefix", prefix
);
4293 cmd_getval_throws(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
4294 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
4296 if (prefix
== "osd stat") {
4297 osdmap
.print_summary(f
.get(), ds
, "");
4303 else if (prefix
== "osd perf" ||
4304 prefix
== "osd blocked-by") {
4305 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
4306 osdmap
, f
.get(), &ss
, &rdata
);
4308 else if (prefix
== "osd dump" ||
4309 prefix
== "osd tree" ||
4310 prefix
== "osd ls" ||
4311 prefix
== "osd getmap" ||
4312 prefix
== "osd getcrushmap" ||
4313 prefix
== "osd ls-tree") {
4318 cmd_getval_throws(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
4321 bufferlist osdmap_bl
;
4322 int err
= get_version_full(epoch
, osdmap_bl
);
4323 if (err
== -ENOENT
) {
4325 ss
<< "there is no map for epoch " << epoch
;
4329 assert(osdmap_bl
.length());
4332 if (epoch
== osdmap
.get_epoch()) {
4336 p
->decode(osdmap_bl
);
4339 auto sg
= make_scope_guard([&] {
4345 if (prefix
== "osd dump") {
4348 f
->open_object_section("osdmap");
4358 } else if (prefix
== "osd ls") {
4360 f
->open_array_section("osds");
4361 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4362 if (osdmap
.exists(i
)) {
4363 f
->dump_int("osd", i
);
4370 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4371 if (osdmap
.exists(i
)) {
4380 } else if (prefix
== "osd tree") {
4381 vector
<string
> states
;
4382 cmd_getval_throws(g_ceph_context
, cmdmap
, "states", states
);
4383 unsigned filter
= 0;
4384 for (auto& s
: states
) {
4386 filter
|= OSDMap::DUMP_UP
;
4387 } else if (s
== "down") {
4388 filter
|= OSDMap::DUMP_DOWN
;
4389 } else if (s
== "in") {
4390 filter
|= OSDMap::DUMP_IN
;
4391 } else if (s
== "out") {
4392 filter
|= OSDMap::DUMP_OUT
;
4393 } else if (s
== "destroyed") {
4394 filter
|= OSDMap::DUMP_DESTROYED
;
4396 ss
<< "unrecognized state '" << s
<< "'";
4401 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
4402 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
4403 ss
<< "cannot specify both 'in' and 'out'";
4407 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
4408 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
4409 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
4410 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
4411 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
4412 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
4413 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
4418 f
->open_object_section("tree");
4419 p
->print_tree(f
.get(), NULL
, filter
);
4423 p
->print_tree(NULL
, &ds
, filter
);
4426 } else if (prefix
== "osd getmap") {
4427 rdata
.append(osdmap_bl
);
4428 ss
<< "got osdmap epoch " << p
->get_epoch();
4429 } else if (prefix
== "osd getcrushmap") {
4430 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
4431 ss
<< p
->get_crush_version();
4432 } else if (prefix
== "osd ls-tree") {
4434 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", bucket_name
);
4436 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
4438 ss
<< "\"" << bucket_name
<< "\" does not exist";
4441 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
4446 f
->open_array_section("osds");
4447 for (auto &i
: osds
) {
4448 if (osdmap
.exists(i
)) {
4449 f
->dump_int("osd", i
);
4456 for (auto &i
: osds
) {
4457 if (osdmap
.exists(i
)) {
4468 } else if (prefix
== "osd df") {
4470 cmd_getval(g_ceph_context
, cmdmap
, "output_method", method
);
4471 print_osd_utilization(osdmap
, mon
->pgservice
, ds
,
4472 f
.get(), method
== "tree");
4474 } else if (prefix
== "osd getmaxosd") {
4476 f
->open_object_section("getmaxosd");
4477 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4478 f
->dump_int("max_osd", osdmap
.get_max_osd());
4482 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
4485 } else if (prefix
== "osd utilization") {
4487 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
4494 } else if (prefix
== "osd find") {
4496 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", osd
)) {
4497 ss
<< "unable to parse osd id value '"
4498 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4502 if (!osdmap
.exists(osd
)) {
4503 ss
<< "osd." << osd
<< " does not exist";
4508 cmd_getval_throws(g_ceph_context
, cmdmap
, "format", format
);
4509 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4510 f
->open_object_section("osd_location");
4511 f
->dump_int("osd", osd
);
4512 f
->dump_stream("ip") << osdmap
.get_addr(osd
);
4513 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
4514 f
->open_object_section("crush_location");
4515 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
4516 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
4517 f
->dump_string(p
->first
.c_str(), p
->second
);
4521 } else if (prefix
== "osd metadata") {
4523 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
4524 !cmd_getval_throws(g_ceph_context
, cmdmap
, "id", osd
)) {
4525 ss
<< "unable to parse osd id value '"
4526 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4530 if (osd
>= 0 && !osdmap
.exists(osd
)) {
4531 ss
<< "osd." << osd
<< " does not exist";
4536 cmd_getval_throws(g_ceph_context
, cmdmap
, "format", format
);
4537 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4539 f
->open_object_section("osd_metadata");
4540 f
->dump_unsigned("id", osd
);
4541 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
4547 f
->open_array_section("osd_metadata");
4548 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4549 if (osdmap
.exists(i
)) {
4550 f
->open_object_section("osd");
4551 f
->dump_unsigned("id", i
);
4552 r
= dump_osd_metadata(i
, f
.get(), NULL
);
4553 if (r
== -EINVAL
|| r
== -ENOENT
) {
4554 // Drop error, continue to get other daemons' metadata
4555 dout(4) << "No metadata for osd." << i
<< dendl
;
4567 } else if (prefix
== "osd versions") {
4569 f
.reset(Formatter::create("json-pretty"));
4570 count_metadata("ceph_version", f
.get());
4573 } else if (prefix
== "osd count-metadata") {
4575 f
.reset(Formatter::create("json-pretty"));
4577 cmd_getval_throws(g_ceph_context
, cmdmap
, "property", field
);
4578 count_metadata(field
, f
.get());
4581 } else if (prefix
== "osd map") {
4582 string poolstr
, objstr
, namespacestr
;
4583 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
4584 cmd_getval_throws(g_ceph_context
, cmdmap
, "object", objstr
);
4585 cmd_getval_throws(g_ceph_context
, cmdmap
, "nspace", namespacestr
);
4587 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4589 ss
<< "pool " << poolstr
<< " does not exist";
4593 object_locator_t
oloc(pool
, namespacestr
);
4594 object_t
oid(objstr
);
4595 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
4596 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4597 vector
<int> up
, acting
;
4599 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
4602 if (!namespacestr
.empty())
4603 fullobjname
= namespacestr
+ string("/") + oid
.name
;
4605 fullobjname
= oid
.name
;
4607 f
->open_object_section("osd_map");
4608 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4609 f
->dump_string("pool", poolstr
);
4610 f
->dump_int("pool_id", pool
);
4611 f
->dump_stream("objname") << fullobjname
;
4612 f
->dump_stream("raw_pgid") << pgid
;
4613 f
->dump_stream("pgid") << mpgid
;
4614 f
->open_array_section("up");
4615 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
4616 f
->dump_int("osd", *p
);
4618 f
->dump_int("up_primary", up_p
);
4619 f
->open_array_section("acting");
4620 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
4621 f
->dump_int("osd", *p
);
4623 f
->dump_int("acting_primary", acting_p
);
4624 f
->close_section(); // osd_map
4627 ds
<< "osdmap e" << osdmap
.get_epoch()
4628 << " pool '" << poolstr
<< "' (" << pool
<< ")"
4629 << " object '" << fullobjname
<< "' ->"
4630 << " pg " << pgid
<< " (" << mpgid
<< ")"
4631 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
4632 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
4636 } else if (prefix
== "pg map") {
4639 cmd_getval_throws(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
4640 if (!pgid
.parse(pgidstr
.c_str())) {
4641 ss
<< "invalid pgid '" << pgidstr
<< "'";
4645 vector
<int> up
, acting
;
4646 if (!osdmap
.have_pg_pool(pgid
.pool())) {
4647 ss
<< "pg '" << pgidstr
<< "' does not exist";
4651 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4652 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
4654 f
->open_object_section("pg_map");
4655 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4656 f
->dump_stream("raw_pgid") << pgid
;
4657 f
->dump_stream("pgid") << mpgid
;
4658 f
->open_array_section("up");
4659 for (auto osd
: up
) {
4660 f
->dump_int("up_osd", osd
);
4663 f
->open_array_section("acting");
4664 for (auto osd
: acting
) {
4665 f
->dump_int("acting_osd", osd
);
4671 ds
<< "osdmap e" << osdmap
.get_epoch()
4672 << " pg " << pgid
<< " (" << mpgid
<< ")"
4673 << " -> up " << up
<< " acting " << acting
;
4678 } else if (prefix
== "osd scrub" ||
4679 prefix
== "osd deep-scrub" ||
4680 prefix
== "osd repair") {
4682 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
4683 vector
<string
> pvec
;
4684 get_str_vec(prefix
, pvec
);
4686 if (whostr
== "*" || whostr
== "all" || whostr
== "any") {
4689 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++)
4690 if (osdmap
.is_up(i
)) {
4691 ss
<< (c
++ ? "," : "") << i
;
4692 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4693 pvec
.back() == "repair",
4694 pvec
.back() == "deep-scrub"),
4695 osdmap
.get_inst(i
));
4698 ss
<< " instructed to " << pvec
.back();
4700 long osd
= parse_osd_id(whostr
.c_str(), &ss
);
4703 } else if (osdmap
.is_up(osd
)) {
4704 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4705 pvec
.back() == "repair",
4706 pvec
.back() == "deep-scrub"),
4707 osdmap
.get_inst(osd
));
4708 ss
<< "osd." << osd
<< " instructed to " << pvec
.back();
4710 ss
<< "osd." << osd
<< " is not up";
4714 } else if (prefix
== "osd lspools") {
4716 cmd_getval_throws(g_ceph_context
, cmdmap
, "auid", auid
, int64_t(0));
4718 f
->open_array_section("pools");
4719 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
4720 p
!= osdmap
.pools
.end();
4722 if (!auid
|| p
->second
.auid
== (uint64_t)auid
) {
4724 f
->open_object_section("pool");
4725 f
->dump_int("poolnum", p
->first
);
4726 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
4729 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
] << ',';
4738 } else if (prefix
== "osd blacklist ls") {
4740 f
->open_array_section("blacklist");
4742 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
4743 p
!= osdmap
.blacklist
.end();
4746 f
->open_object_section("entry");
4747 f
->dump_stream("addr") << p
->first
;
4748 f
->dump_stream("until") << p
->second
;
4753 ss
<< p
->first
<< " " << p
->second
;
4763 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
4765 } else if (prefix
== "osd pool ls") {
4767 cmd_getval_throws(g_ceph_context
, cmdmap
, "detail", detail
);
4768 if (!f
&& detail
== "detail") {
4770 osdmap
.print_pools(ss
);
4771 rdata
.append(ss
.str());
4774 f
->open_array_section("pools");
4775 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
4776 it
!= osdmap
.get_pools().end();
4779 if (detail
== "detail") {
4780 f
->open_object_section("pool");
4781 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4782 it
->second
.dump(f
.get());
4785 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4788 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
4797 } else if (prefix
== "osd crush get-tunable") {
4799 cmd_getval_throws(g_ceph_context
, cmdmap
, "tunable", tunable
);
4802 f
->open_object_section("tunable");
4803 if (tunable
== "straw_calc_version") {
4805 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
4807 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
4816 rdata
.append(rss
.str());
4820 } else if (prefix
== "osd pool get") {
4822 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
4823 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4825 ss
<< "unrecognized pool '" << poolstr
<< "'";
4830 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
4832 cmd_getval_throws(g_ceph_context
, cmdmap
, "var", var
);
4834 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
4835 const choices_map_t ALL_CHOICES
= {
4837 {"min_size", MIN_SIZE
},
4838 {"crash_replay_interval", CRASH_REPLAY_INTERVAL
},
4839 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
4840 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
4841 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
4842 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
4843 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
4844 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
4845 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
4846 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
4847 {"use_gmt_hitset", USE_GMT_HITSET
},
4848 {"auid", AUID
}, {"target_max_objects", TARGET_MAX_OBJECTS
},
4849 {"target_max_bytes", TARGET_MAX_BYTES
},
4850 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
4851 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
4852 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
4853 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
4854 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
4855 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
4856 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
4857 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
4858 {"fast_read", FAST_READ
},
4859 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
4860 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
4861 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
4862 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
4863 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
4864 {"recovery_priority", RECOVERY_PRIORITY
},
4865 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
4866 {"scrub_priority", SCRUB_PRIORITY
},
4867 {"compression_mode", COMPRESSION_MODE
},
4868 {"compression_algorithm", COMPRESSION_ALGORITHM
},
4869 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
4870 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
4871 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
4872 {"csum_type", CSUM_TYPE
},
4873 {"csum_max_block", CSUM_MAX_BLOCK
},
4874 {"csum_min_block", CSUM_MIN_BLOCK
},
4877 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
4879 const choices_set_t ONLY_TIER_CHOICES
= {
4880 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
4881 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
4882 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
4883 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
4884 MIN_READ_RECENCY_FOR_PROMOTE
,
4885 MIN_WRITE_RECENCY_FOR_PROMOTE
,
4886 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
4888 const choices_set_t ONLY_ERASURE_CHOICES
= {
4889 EC_OVERWRITES
, ERASURE_CODE_PROFILE
4892 choices_set_t selected_choices
;
4894 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
4895 it
!= ALL_CHOICES
.end(); ++it
) {
4896 selected_choices
.insert(it
->second
);
4900 selected_choices
= subtract_second_from_first(selected_choices
,
4904 if(!p
->is_erasure()) {
4905 selected_choices
= subtract_second_from_first(selected_choices
,
4906 ONLY_ERASURE_CHOICES
);
4908 } else /* var != "all" */ {
4909 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
4910 osd_pool_get_choices selected
= found
->second
;
4912 if (!p
->is_tier() &&
4913 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
4914 ss
<< "pool '" << poolstr
4915 << "' is not a tier pool: variable not applicable";
4920 if (!p
->is_erasure() &&
4921 ONLY_ERASURE_CHOICES
.find(selected
)
4922 != ONLY_ERASURE_CHOICES
.end()) {
4923 ss
<< "pool '" << poolstr
4924 << "' is not a erasure pool: variable not applicable";
4929 if (pool_opts_t::is_opt_name(var
) &&
4930 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
4931 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
4936 selected_choices
.insert(selected
);
4940 f
->open_object_section("pool");
4941 f
->dump_string("pool", poolstr
);
4942 f
->dump_int("pool_id", pool
);
4943 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4944 it
!= selected_choices
.end(); ++it
) {
4945 choices_map_t::const_iterator i
;
4946 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4947 if (i
->second
== *it
) {
4951 assert(i
!= ALL_CHOICES
.end());
4954 f
->dump_int("pg_num", p
->get_pg_num());
4957 f
->dump_int("pgp_num", p
->get_pgp_num());
4960 f
->dump_int("auid", p
->get_auid());
4963 f
->dump_int("size", p
->get_size());
4966 f
->dump_int("min_size", p
->get_min_size());
4968 case CRASH_REPLAY_INTERVAL
:
4969 f
->dump_int("crash_replay_interval",
4970 p
->get_crash_replay_interval());
4973 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
4974 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
4975 p
->get_crush_rule()));
4977 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
4981 f
->dump_bool("allow_ec_overwrites",
4982 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
4988 case WRITE_FADVISE_DONTNEED
:
4991 f
->dump_bool(i
->first
.c_str(),
4992 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
4994 case HIT_SET_PERIOD
:
4995 f
->dump_int("hit_set_period", p
->hit_set_period
);
4998 f
->dump_int("hit_set_count", p
->hit_set_count
);
5001 f
->dump_string("hit_set_type",
5002 HitSet::get_type_name(p
->hit_set_params
.get_type()));
5006 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
5007 BloomHitSet::Params
*bloomp
=
5008 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
5009 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
5010 } else if(var
!= "all") {
5012 ss
<< "hit set is not of type Bloom; " <<
5013 "invalid to get a false positive rate!";
5019 case USE_GMT_HITSET
:
5020 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
5022 case TARGET_MAX_OBJECTS
:
5023 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
5025 case TARGET_MAX_BYTES
:
5026 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
5028 case CACHE_TARGET_DIRTY_RATIO
:
5029 f
->dump_unsigned("cache_target_dirty_ratio_micro",
5030 p
->cache_target_dirty_ratio_micro
);
5031 f
->dump_float("cache_target_dirty_ratio",
5032 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
5034 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
5035 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
5036 p
->cache_target_dirty_high_ratio_micro
);
5037 f
->dump_float("cache_target_dirty_high_ratio",
5038 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
5040 case CACHE_TARGET_FULL_RATIO
:
5041 f
->dump_unsigned("cache_target_full_ratio_micro",
5042 p
->cache_target_full_ratio_micro
);
5043 f
->dump_float("cache_target_full_ratio",
5044 ((float)p
->cache_target_full_ratio_micro
/1000000));
5046 case CACHE_MIN_FLUSH_AGE
:
5047 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
5049 case CACHE_MIN_EVICT_AGE
:
5050 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
5052 case ERASURE_CODE_PROFILE
:
5053 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
5055 case MIN_READ_RECENCY_FOR_PROMOTE
:
5056 f
->dump_int("min_read_recency_for_promote",
5057 p
->min_read_recency_for_promote
);
5059 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
5060 f
->dump_int("min_write_recency_for_promote",
5061 p
->min_write_recency_for_promote
);
5064 f
->dump_int("fast_read", p
->fast_read
);
5066 case HIT_SET_GRADE_DECAY_RATE
:
5067 f
->dump_int("hit_set_grade_decay_rate",
5068 p
->hit_set_grade_decay_rate
);
5070 case HIT_SET_SEARCH_LAST_N
:
5071 f
->dump_int("hit_set_search_last_n",
5072 p
->hit_set_search_last_n
);
5074 case SCRUB_MIN_INTERVAL
:
5075 case SCRUB_MAX_INTERVAL
:
5076 case DEEP_SCRUB_INTERVAL
:
5077 case RECOVERY_PRIORITY
:
5078 case RECOVERY_OP_PRIORITY
:
5079 case SCRUB_PRIORITY
:
5080 case COMPRESSION_MODE
:
5081 case COMPRESSION_ALGORITHM
:
5082 case COMPRESSION_REQUIRED_RATIO
:
5083 case COMPRESSION_MAX_BLOB_SIZE
:
5084 case COMPRESSION_MIN_BLOB_SIZE
:
5086 case CSUM_MAX_BLOCK
:
5087 case CSUM_MIN_BLOCK
:
5088 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
5089 if (p
->opts
.is_set(key
)) {
5090 if(*it
== CSUM_TYPE
) {
5092 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
5093 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
5095 p
->opts
.dump(i
->first
, f
.get());
5104 for(choices_set_t::const_iterator it
= selected_choices
.begin();
5105 it
!= selected_choices
.end(); ++it
) {
5106 choices_map_t::const_iterator i
;
5109 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
5112 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
5115 ss
<< "auid: " << p
->get_auid() << "\n";
5118 ss
<< "size: " << p
->get_size() << "\n";
5121 ss
<< "min_size: " << p
->get_min_size() << "\n";
5123 case CRASH_REPLAY_INTERVAL
:
5124 ss
<< "crash_replay_interval: " <<
5125 p
->get_crash_replay_interval() << "\n";
5128 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
5129 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
5130 p
->get_crush_rule()) << "\n";
5132 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
5135 case HIT_SET_PERIOD
:
5136 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
5139 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
5142 ss
<< "hit_set_type: " <<
5143 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
5147 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
5148 BloomHitSet::Params
*bloomp
=
5149 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
5150 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
5151 } else if(var
!= "all") {
5152 ss
<< "hit set is not of type Bloom; " <<
5153 "invalid to get a false positive rate!";
5159 case USE_GMT_HITSET
:
5160 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
5162 case TARGET_MAX_OBJECTS
:
5163 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
5165 case TARGET_MAX_BYTES
:
5166 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
5168 case CACHE_TARGET_DIRTY_RATIO
:
5169 ss
<< "cache_target_dirty_ratio: "
5170 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
5172 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
5173 ss
<< "cache_target_dirty_high_ratio: "
5174 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
5176 case CACHE_TARGET_FULL_RATIO
:
5177 ss
<< "cache_target_full_ratio: "
5178 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
5180 case CACHE_MIN_FLUSH_AGE
:
5181 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
5183 case CACHE_MIN_EVICT_AGE
:
5184 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
5186 case ERASURE_CODE_PROFILE
:
5187 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
5189 case MIN_READ_RECENCY_FOR_PROMOTE
:
5190 ss
<< "min_read_recency_for_promote: " <<
5191 p
->min_read_recency_for_promote
<< "\n";
5193 case HIT_SET_GRADE_DECAY_RATE
:
5194 ss
<< "hit_set_grade_decay_rate: " <<
5195 p
->hit_set_grade_decay_rate
<< "\n";
5197 case HIT_SET_SEARCH_LAST_N
:
5198 ss
<< "hit_set_search_last_n: " <<
5199 p
->hit_set_search_last_n
<< "\n";
5202 ss
<< "allow_ec_overwrites: " <<
5203 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
5210 case WRITE_FADVISE_DONTNEED
:
5213 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5214 if (i
->second
== *it
)
5217 assert(i
!= ALL_CHOICES
.end());
5218 ss
<< i
->first
<< ": " <<
5219 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
5220 "true" : "false") << "\n";
5222 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
5223 ss
<< "min_write_recency_for_promote: " <<
5224 p
->min_write_recency_for_promote
<< "\n";
5227 ss
<< "fast_read: " << p
->fast_read
<< "\n";
5229 case SCRUB_MIN_INTERVAL
:
5230 case SCRUB_MAX_INTERVAL
:
5231 case DEEP_SCRUB_INTERVAL
:
5232 case RECOVERY_PRIORITY
:
5233 case RECOVERY_OP_PRIORITY
:
5234 case SCRUB_PRIORITY
:
5235 case COMPRESSION_MODE
:
5236 case COMPRESSION_ALGORITHM
:
5237 case COMPRESSION_REQUIRED_RATIO
:
5238 case COMPRESSION_MAX_BLOB_SIZE
:
5239 case COMPRESSION_MIN_BLOB_SIZE
:
5241 case CSUM_MAX_BLOCK
:
5242 case CSUM_MIN_BLOCK
:
5243 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5244 if (i
->second
== *it
)
5247 assert(i
!= ALL_CHOICES
.end());
5249 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
5250 if (p
->opts
.is_set(key
)) {
5251 if(key
== pool_opts_t::CSUM_TYPE
) {
5253 p
->opts
.get(key
, &val
);
5254 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
5256 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
5262 rdata
.append(ss
.str());
5267 } else if (prefix
== "osd pool stats") {
5268 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
5269 osdmap
, f
.get(), &ss
, &rdata
);
5270 } else if (prefix
== "osd pool get-quota") {
5272 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", pool_name
);
5274 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
5276 assert(poolid
== -ENOENT
);
5277 ss
<< "unrecognized pool '" << pool_name
<< "'";
5281 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
5284 f
->open_object_section("pool_quotas");
5285 f
->dump_string("pool_name", pool_name
);
5286 f
->dump_unsigned("pool_id", poolid
);
5287 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
5288 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
5293 rs
<< "quotas for pool '" << pool_name
<< "':\n"
5294 << " max objects: ";
5295 if (p
->quota_max_objects
== 0)
5298 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
5301 if (p
->quota_max_bytes
== 0)
5304 rs
<< byte_u_t(p
->quota_max_bytes
);
5305 rdata
.append(rs
.str());
5309 } else if (prefix
== "osd crush rule list" ||
5310 prefix
== "osd crush rule ls") {
5312 f
->open_array_section("rules");
5313 osdmap
.crush
->list_rules(f
.get());
5318 osdmap
.crush
->list_rules(&ss
);
5319 rdata
.append(ss
.str());
5321 } else if (prefix
== "osd crush rule ls-by-class") {
5323 cmd_getval_throws(g_ceph_context
, cmdmap
, "class", class_name
);
5324 if (class_name
.empty()) {
5325 ss
<< "no class specified";
5330 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
5332 ss
<< "failed to get rules by class '" << class_name
<< "'";
5336 f
->open_array_section("rules");
5337 for (auto &rule
: rules
) {
5338 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
5344 for (auto &rule
: rules
) {
5345 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
5347 rdata
.append(rs
.str());
5349 } else if (prefix
== "osd crush rule dump") {
5351 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
5353 cmd_getval_throws(g_ceph_context
, cmdmap
, "format", format
);
5354 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5356 f
->open_array_section("rules");
5357 osdmap
.crush
->dump_rules(f
.get());
5360 int ruleno
= osdmap
.crush
->get_rule_id(name
);
5362 ss
<< "unknown crush rule '" << name
<< "'";
5366 osdmap
.crush
->dump_rule(ruleno
, f
.get());
5371 rdata
.append(rs
.str());
5372 } else if (prefix
== "osd crush dump") {
5374 cmd_getval_throws(g_ceph_context
, cmdmap
, "format", format
);
5375 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5376 f
->open_object_section("crush_map");
5377 osdmap
.crush
->dump(f
.get());
5382 rdata
.append(rs
.str());
5383 } else if (prefix
== "osd crush show-tunables") {
5385 cmd_getval_throws(g_ceph_context
, cmdmap
, "format", format
);
5386 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5387 f
->open_object_section("crush_map_tunables");
5388 osdmap
.crush
->dump_tunables(f
.get());
5393 rdata
.append(rs
.str());
5394 } else if (prefix
== "osd crush tree") {
5396 cmd_getval_throws(g_ceph_context
, cmdmap
, "shadow", shadow
);
5397 bool show_shadow
= shadow
== "--show-shadow";
5398 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5400 f
->open_object_section("crush_tree");
5401 osdmap
.crush
->dump_tree(nullptr,
5403 osdmap
.get_pool_names(),
5409 osdmap
.crush
->dump_tree(&ss
,
5411 osdmap
.get_pool_names(),
5413 rdata
.append(ss
.str());
5415 } else if (prefix
== "osd crush ls") {
5417 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "node", name
)) {
5418 ss
<< "no node specified";
5422 if (!osdmap
.crush
->name_exists(name
)) {
5423 ss
<< "node '" << name
<< "' does not exist";
5427 int id
= osdmap
.crush
->get_item_id(name
);
5430 result
.push_back(id
);
5432 int num
= osdmap
.crush
->get_bucket_size(id
);
5433 for (int i
= 0; i
< num
; ++i
) {
5434 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
5438 f
->open_array_section("items");
5439 for (auto i
: result
) {
5440 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
5446 for (auto i
: result
) {
5447 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
5449 rdata
.append(ss
.str());
5452 } else if (prefix
== "osd crush class ls") {
5453 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5454 f
->open_array_section("crush_classes");
5455 for (auto i
: osdmap
.crush
->class_name
)
5456 f
->dump_string("class", i
.second
);
5459 } else if (prefix
== "osd crush class ls-osd") {
5461 cmd_getval_throws(g_ceph_context
, cmdmap
, "class", name
);
5463 osdmap
.crush
->get_devices_by_class(name
, &osds
);
5465 f
->open_array_section("osds");
5466 for (auto &osd
: osds
)
5467 f
->dump_int("osd", osd
);
5472 for (auto &osd
: osds
) {
5480 } else if (prefix
== "osd erasure-code-profile ls") {
5481 const auto &profiles
= osdmap
.get_erasure_code_profiles();
5483 f
->open_array_section("erasure-code-profiles");
5484 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
5486 f
->dump_string("profile", i
->first
.c_str());
5488 rdata
.append(i
->first
+ "\n");
5495 rdata
.append(rs
.str());
5497 } else if (prefix
== "osd crush weight-set ls") {
5498 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5500 f
->open_array_section("weight_sets");
5501 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5502 f
->dump_string("pool", "(compat)");
5504 for (auto& i
: osdmap
.crush
->choose_args
) {
5506 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
5513 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5516 for (auto& i
: osdmap
.crush
->choose_args
) {
5518 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
5521 rdata
.append(rs
.str());
5523 } else if (prefix
== "osd crush weight-set dump") {
5524 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5526 osdmap
.crush
->dump_choose_args(f
.get());
5528 } else if (prefix
== "osd erasure-code-profile get") {
5530 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
5531 if (!osdmap
.has_erasure_code_profile(name
)) {
5532 ss
<< "unknown erasure code profile '" << name
<< "'";
5536 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
5538 f
->open_object_section("profile");
5539 for (map
<string
,string
>::const_iterator i
= profile
.begin();
5543 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
5545 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
5552 rdata
.append(rs
.str());
5554 } else if (prefix
== "osd pool application get") {
5555 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5558 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", pool_name
);
5560 cmd_getval_throws(g_ceph_context
, cmdmap
, "app", app
);
5562 cmd_getval_throws(g_ceph_context
, cmdmap
, "key", key
);
5564 if (pool_name
.empty()) {
5566 f
->open_object_section("pools");
5567 for (const auto &pool
: osdmap
.pools
) {
5568 std::string
name("<unknown>");
5569 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
5570 if (pni
!= osdmap
.pool_name
.end())
5572 f
->open_object_section(name
.c_str());
5573 for (auto &app_pair
: pool
.second
.application_metadata
) {
5574 f
->open_object_section(app_pair
.first
.c_str());
5575 for (auto &kv_pair
: app_pair
.second
) {
5576 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5580 f
->close_section(); // name
5582 f
->close_section(); // pools
5585 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
5587 ss
<< "unrecognized pool '" << pool_name
<< "'";
5591 auto p
= osdmap
.get_pg_pool(pool
);
5594 f
->open_object_section(pool_name
.c_str());
5595 for (auto &app_pair
: p
->application_metadata
) {
5596 f
->open_object_section(app_pair
.first
.c_str());
5597 for (auto &kv_pair
: app_pair
.second
) {
5598 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5600 f
->close_section(); // application
5602 f
->close_section(); // pool_name
5607 auto app_it
= p
->application_metadata
.find(app
);
5608 if (app_it
== p
->application_metadata
.end()) {
5609 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
5613 // filter by pool + app
5615 f
->open_object_section(app_it
->first
.c_str());
5616 for (auto &kv_pair
: app_it
->second
) {
5617 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5619 f
->close_section(); // application
5623 // filter by pool + app + key
5624 auto key_it
= app_it
->second
.find(key
);
5625 if (key_it
== app_it
->second
.end()) {
5626 ss
<< "application '" << app
<< "' on pool '" << pool_name
5627 << "' does not have key '" << key
<< "'";
5631 ss
<< key_it
->second
<< "\n";
5632 rdata
.append(ss
.str());
5636 // try prepare update
5643 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
5647 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
5649 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
5650 osdmap
.get_pg_pool(pool_id
));
5652 pool
->set_flag(flags
);
5655 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
5657 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
5658 osdmap
.get_pg_pool(pool_id
));
5660 pool
->unset_flag(flags
);
5663 bool OSDMonitor::update_pools_status()
5665 if (!mon
->pgservice
->is_readable())
5670 auto& pools
= osdmap
.get_pools();
5671 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
5672 const pool_stat_t
*pstat
= mon
->pgservice
->get_pool_stat(it
->first
);
5675 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
5676 const pg_pool_t
&pool
= it
->second
;
5677 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
5680 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
5681 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
5683 if (pool
.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
5687 mon
->clog
->info() << "pool '" << pool_name
5688 << "' no longer out of quota; removing NO_QUOTA flag";
5689 // below we cancel FLAG_FULL too, we'll set it again in
5690 // OSDMonitor::encode_pending if it still fails the osd-full checking.
5691 clear_pool_flags(it
->first
,
5692 pg_pool_t::FLAG_FULL_NO_QUOTA
| pg_pool_t::FLAG_FULL
);
5698 if (pool
.quota_max_bytes
> 0 &&
5699 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
5700 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5701 << " (reached quota's max_bytes: "
5702 << byte_u_t(pool
.quota_max_bytes
) << ")";
5704 if (pool
.quota_max_objects
> 0 &&
5705 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
5706 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5707 << " (reached quota's max_objects: "
5708 << pool
.quota_max_objects
<< ")";
5710 // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
5711 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
5712 // since FLAG_FULL should always take precedence
5713 set_pool_flags(it
->first
,
5714 pg_pool_t::FLAG_FULL_NO_QUOTA
| pg_pool_t::FLAG_FULL
);
5715 clear_pool_flags(it
->first
,
5716 pg_pool_t::FLAG_NEARFULL
|
5717 pg_pool_t::FLAG_BACKFILLFULL
);
5724 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
5726 op
->mark_osdmon_event(__func__
);
5727 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
5728 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
5729 MonSession
*session
= m
->get_session();
5732 string erasure_code_profile
;
5737 ret
= prepare_new_pool(m
->name
, m
->auid
, m
->crush_rule
, rule_name
,
5739 erasure_code_profile
,
5740 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5742 ret
= prepare_new_pool(m
->name
, session
->auid
, m
->crush_rule
, rule_name
,
5744 erasure_code_profile
,
5745 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5748 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
5753 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
5754 const string
& dstname
,
5759 // Avoid creating a pending crush if it does not already exists and
5760 // the rename would fail.
5762 if (!_have_pending_crush()) {
5763 ret
= _get_stable_crush().can_rename_bucket(srcname
,
5770 CrushWrapper newcrush
;
5771 _get_pending_crush(newcrush
);
5773 ret
= newcrush
.rename_bucket(srcname
,
5779 pending_inc
.crush
.clear();
5780 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5781 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
5785 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
5787 string replacement
= "";
5789 if (plugin
== "jerasure_generic" ||
5790 plugin
== "jerasure_sse3" ||
5791 plugin
== "jerasure_sse4" ||
5792 plugin
== "jerasure_neon") {
5793 replacement
= "jerasure";
5794 } else if (plugin
== "shec_generic" ||
5795 plugin
== "shec_sse3" ||
5796 plugin
== "shec_sse4" ||
5797 plugin
== "shec_neon") {
5798 replacement
= "shec";
5801 if (replacement
!= "") {
5802 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
5803 << plugin
<< " that has been deprecated. Please use "
5804 << replacement
<< " instead." << dendl
;
5808 int OSDMonitor::normalize_profile(const string
& profilename
,
5809 ErasureCodeProfile
&profile
,
5813 ErasureCodeInterfaceRef erasure_code
;
5814 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5815 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
5816 check_legacy_ec_plugin(plugin
->second
, profilename
);
5817 int err
= instance
.factory(plugin
->second
,
5818 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5819 profile
, &erasure_code
, ss
);
5824 err
= erasure_code
->init(profile
, ss
);
5829 auto it
= profile
.find("stripe_unit");
5830 if (it
!= profile
.end()) {
5832 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
5833 if (!err_str
.empty()) {
5834 *ss
<< "could not parse stripe_unit '" << it
->second
5835 << "': " << err_str
<< std::endl
;
5838 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5839 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5840 if (chunk_size
!= stripe_unit
) {
5841 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
5842 << "alignment. Would be padded to " << chunk_size
5846 if ((stripe_unit
% 4096) != 0 && !force
) {
5847 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
5848 << "use --force to override this check" << std::endl
;
5855 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
5856 const string
&profile
,
5860 int ruleid
= osdmap
.crush
->get_rule_id(name
);
5861 if (ruleid
!= -ENOENT
) {
5862 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
5866 CrushWrapper newcrush
;
5867 _get_pending_crush(newcrush
);
5869 ruleid
= newcrush
.get_rule_id(name
);
5870 if (ruleid
!= -ENOENT
) {
5871 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
5874 ErasureCodeInterfaceRef erasure_code
;
5875 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
5877 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
5881 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
5882 erasure_code
.reset();
5886 pending_inc
.crush
.clear();
5887 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5892 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
5893 ErasureCodeInterfaceRef
*erasure_code
,
5896 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
5898 ErasureCodeProfile profile
=
5899 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5900 ErasureCodeProfile::const_iterator plugin
=
5901 profile
.find("plugin");
5902 if (plugin
== profile
.end()) {
5903 *ss
<< "cannot determine the erasure code plugin"
5904 << " because there is no 'plugin' entry in the erasure_code_profile "
5905 << profile
<< std::endl
;
5908 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
5909 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5910 return instance
.factory(plugin
->second
,
5911 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5912 profile
, erasure_code
, ss
);
5915 int OSDMonitor::check_cluster_features(uint64_t features
,
5918 stringstream unsupported_ss
;
5919 int unsupported_count
= 0;
5920 if ((mon
->get_quorum_con_features() & features
) != features
) {
5921 unsupported_ss
<< "the monitor cluster";
5922 ++unsupported_count
;
5925 set
<int32_t> up_osds
;
5926 osdmap
.get_up_osds(up_osds
);
5927 for (set
<int32_t>::iterator it
= up_osds
.begin();
5928 it
!= up_osds
.end(); ++it
) {
5929 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
5930 if ((xi
.features
& features
) != features
) {
5931 if (unsupported_count
> 0)
5932 unsupported_ss
<< ", ";
5933 unsupported_ss
<< "osd." << *it
;
5934 unsupported_count
++;
5938 if (unsupported_count
> 0) {
5939 ss
<< "features " << features
<< " unsupported by: "
5940 << unsupported_ss
.str();
5944 // check pending osd state, too!
5945 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
5946 pending_inc
.new_xinfo
.begin();
5947 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
5948 const osd_xinfo_t
&xi
= p
->second
;
5949 if ((xi
.features
& features
) != features
) {
5950 dout(10) << __func__
<< " pending osd." << p
->first
5951 << " features are insufficient; retry" << dendl
;
5959 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
5962 OSDMap::Incremental new_pending
= pending_inc
;
5963 ::encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
5965 newmap
.deepish_copy_from(osdmap
);
5966 newmap
.apply_incremental(new_pending
);
5969 if (newmap
.require_min_compat_client
> 0) {
5970 auto mv
= newmap
.get_min_compat_client();
5971 if (mv
> newmap
.require_min_compat_client
) {
5972 ss
<< "new crush map requires client version " << ceph_release_name(mv
)
5973 << " but require_min_compat_client is "
5974 << ceph_release_name(newmap
.require_min_compat_client
);
5981 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
5982 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
5983 stringstream features_ss
;
5984 int r
= check_cluster_features(features
, features_ss
);
5986 ss
<< "Could not change CRUSH: " << features_ss
.str();
5993 bool OSDMonitor::erasure_code_profile_in_use(
5994 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
5995 const string
&profile
,
5999 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
6002 if (p
->second
.erasure_code_profile
== profile
) {
6003 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
6008 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
6013 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
6014 map
<string
,string
> *erasure_code_profile_map
,
6017 int r
= get_json_str_map(g_conf
->osd_pool_default_erasure_code_profile
,
6019 erasure_code_profile_map
);
6022 assert((*erasure_code_profile_map
).count("plugin"));
6023 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
6024 map
<string
,string
> user_map
;
6025 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
6026 i
!= erasure_code_profile
.end();
6028 size_t equal
= i
->find('=');
6029 if (equal
== string::npos
) {
6030 user_map
[*i
] = string();
6031 (*erasure_code_profile_map
)[*i
] = string();
6033 string key
= i
->substr(0, equal
);
6035 const string value
= i
->substr(equal
);
6036 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
6037 key
.find("ruleset-") == 0) {
6038 if (g_conf
->get_val
<bool>("mon_fixup_legacy_erasure_code_profiles")) {
6039 mon
->clog
->warn() << "erasure code profile property '" << key
6040 << "' is no longer supported; try "
6041 << "'crush-" << key
.substr(8) << "' instead";
6042 key
= string("crush-") + key
.substr(8);
6044 *ss
<< "property '" << key
<< "' is no longer supported; try "
6045 << "'crush-" << key
.substr(8) << "' instead";
6049 user_map
[key
] = value
;
6050 (*erasure_code_profile_map
)[key
] = value
;
6054 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
6055 (*erasure_code_profile_map
) = user_map
;
6060 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
6061 const string
&erasure_code_profile
,
6062 unsigned *size
, unsigned *min_size
,
6066 switch (pool_type
) {
6067 case pg_pool_t::TYPE_REPLICATED
:
6068 *size
= g_conf
->osd_pool_default_size
;
6069 *min_size
= g_conf
->get_osd_pool_default_min_size();
6071 case pg_pool_t::TYPE_ERASURE
:
6073 ErasureCodeInterfaceRef erasure_code
;
6074 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
6076 *size
= erasure_code
->get_chunk_count();
6077 *min_size
= MIN(erasure_code
->get_data_chunk_count() + 1, *size
);
6082 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
6089 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
6090 const string
&erasure_code_profile
,
6091 uint32_t *stripe_width
,
6095 switch (pool_type
) {
6096 case pg_pool_t::TYPE_REPLICATED
:
6099 case pg_pool_t::TYPE_ERASURE
:
6101 ErasureCodeProfile profile
=
6102 osdmap
.get_erasure_code_profile(erasure_code_profile
);
6103 ErasureCodeInterfaceRef erasure_code
;
6104 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
6107 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
6108 uint32_t stripe_unit
= g_conf
->osd_pool_erasure_code_stripe_unit
;
6109 auto it
= profile
.find("stripe_unit");
6110 if (it
!= profile
.end()) {
6112 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
6113 assert(err_str
.empty());
6115 *stripe_width
= data_chunks
*
6116 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
6120 *ss
<< "prepare_pool_stripe_width: "
6121 << pool_type
<< " is not a known pool type";
6128 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
6129 const string
&erasure_code_profile
,
6130 const string
&rule_name
,
6135 if (*crush_rule
< 0) {
6136 switch (pool_type
) {
6137 case pg_pool_t::TYPE_REPLICATED
:
6139 if (rule_name
== "") {
6141 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context
);
6142 if (*crush_rule
< 0) {
6143 // Errors may happen e.g. if no valid rule is available
6144 *ss
<< "No suitable CRUSH rule exists, check "
6145 << "'osd pool default crush *' config options";
6149 return get_crush_rule(rule_name
, crush_rule
, ss
);
6153 case pg_pool_t::TYPE_ERASURE
:
6155 int err
= crush_rule_create_erasure(rule_name
,
6156 erasure_code_profile
,
6160 dout(20) << "prepare_pool_crush_rule: rule "
6161 << rule_name
<< " try again" << dendl
;
6164 // need to wait for the crush rule to be proposed before proceeding
6175 *ss
<< "prepare_pool_crush_rule: " << pool_type
6176 << " is not a known pool type";
6181 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
6182 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
6190 int OSDMonitor::get_crush_rule(const string
&rule_name
,
6195 ret
= osdmap
.crush
->get_rule_id(rule_name
);
6196 if (ret
!= -ENOENT
) {
6200 CrushWrapper newcrush
;
6201 _get_pending_crush(newcrush
);
6203 ret
= newcrush
.get_rule_id(rule_name
);
6204 if (ret
!= -ENOENT
) {
6205 // found it, wait for it to be proposed
6206 dout(20) << __func__
<< ": rule " << rule_name
6207 << " try again" << dendl
;
6210 // Cannot find it , return error
6211 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
6218 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
6220 auto max_pgs_per_osd
= g_conf
->get_val
<uint64_t>("mon_max_pg_per_osd");
6221 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
6222 auto max_pgs
= max_pgs_per_osd
* num_osds
;
6223 uint64_t projected
= 0;
6225 projected
+= pg_num
* size
;
6227 for (const auto& i
: osdmap
.get_pools()) {
6228 if (i
.first
== pool
) {
6229 projected
+= pg_num
* size
;
6231 projected
+= i
.second
.get_pg_num() * i
.second
.get_size();
6234 if (projected
> max_pgs
) {
6236 *ss
<< "pool id " << pool
;
6238 *ss
<< " pg_num " << pg_num
<< " size " << size
6239 << " would mean " << projected
6240 << " total pgs, which exceeds max " << max_pgs
6241 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6242 << " * num_in_osds " << num_osds
<< ")";
6249 * @param name The name of the new pool
6250 * @param auid The auid of the pool owner. Can be -1
6251 * @param crush_rule The crush rule to use. If <0, will use the system default
6252 * @param crush_rule_name The crush rule to use, if crush_rulset <0
6253 * @param pg_num The pg_num to use. If set to 0, will use the system default
6254 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6255 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6256 * @param pool_type TYPE_ERASURE, or TYPE_REP
6257 * @param expected_num_objects expected number of objects on the pool
6258 * @param fast_read fast read type.
6259 * @param ss human readable error message, if any.
6261 * @return 0 on success, negative errno on failure.
6263 int OSDMonitor::prepare_new_pool(string
& name
, uint64_t auid
,
6265 const string
&crush_rule_name
,
6266 unsigned pg_num
, unsigned pgp_num
,
6267 const string
&erasure_code_profile
,
6268 const unsigned pool_type
,
6269 const uint64_t expected_num_objects
,
6270 FastReadType fast_read
,
6273 if (name
.length() == 0)
6276 pg_num
= g_conf
->osd_pool_default_pg_num
;
6278 pgp_num
= g_conf
->osd_pool_default_pgp_num
;
6279 if (pg_num
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
6280 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
6281 << g_conf
->mon_max_pool_pg_num
6282 << " (you may adjust 'mon max pool pg num' for higher values)";
6285 if (pgp_num
> pg_num
) {
6286 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6287 << ", which in this case is " << pg_num
;
6290 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
6291 *ss
<< "'fast_read' can only apply to erasure coding pool";
6295 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
6296 crush_rule_name
, &crush_rule
, ss
);
6298 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
6301 if (g_conf
->mon_osd_crush_smoke_test
) {
6302 CrushWrapper newcrush
;
6303 _get_pending_crush(newcrush
);
6305 CrushTester
tester(newcrush
, err
);
6306 tester
.set_min_x(0);
6307 tester
.set_max_x(50);
6308 tester
.set_rule(crush_rule
);
6309 auto start
= ceph::coarse_mono_clock::now();
6310 r
= tester
.test_with_fork(g_conf
->mon_lease
);
6311 auto duration
= ceph::coarse_mono_clock::now() - start
;
6313 dout(10) << "tester.test_with_fork returns " << r
6314 << ": " << err
.str() << dendl
;
6315 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
6318 dout(10) << __func__
<< " crush smoke test duration: "
6319 << duration
<< dendl
;
6321 unsigned size
, min_size
;
6322 r
= prepare_pool_size(pool_type
, erasure_code_profile
, &size
, &min_size
, ss
);
6324 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
6327 r
= check_pg_num(-1, pg_num
, size
, ss
);
6329 dout(10) << "check_pg_num returns " << r
<< dendl
;
6333 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
6337 uint32_t stripe_width
= 0;
6338 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
6340 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
6345 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
6346 switch (fast_read
) {
6353 case FAST_READ_DEFAULT
:
6354 fread
= g_conf
->mon_osd_pool_ec_fast_read
;
6357 *ss
<< "invalid fast_read setting: " << fast_read
;
6362 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
6363 p
!= pending_inc
.new_pool_names
.end();
6365 if (p
->second
== name
)
6369 if (-1 == pending_inc
.new_pool_max
)
6370 pending_inc
.new_pool_max
= osdmap
.pool_max
;
6371 int64_t pool
= ++pending_inc
.new_pool_max
;
6373 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
6374 pi
->type
= pool_type
;
6375 pi
->fast_read
= fread
;
6376 pi
->flags
= g_conf
->osd_pool_default_flags
;
6377 if (g_conf
->osd_pool_default_flag_hashpspool
)
6378 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
6379 if (g_conf
->osd_pool_default_flag_nodelete
)
6380 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
6381 if (g_conf
->osd_pool_default_flag_nopgchange
)
6382 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
6383 if (g_conf
->osd_pool_default_flag_nosizechange
)
6384 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
6385 if (g_conf
->osd_pool_use_gmt_hitset
&&
6386 (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
))
6387 pi
->use_gmt_hitset
= true;
6389 pi
->use_gmt_hitset
= false;
6392 pi
->min_size
= min_size
;
6393 pi
->crush_rule
= crush_rule
;
6394 pi
->expected_num_objects
= expected_num_objects
;
6395 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
6396 pi
->set_pg_num(pg_num
);
6397 pi
->set_pgp_num(pgp_num
);
6398 pi
->last_change
= pending_inc
.epoch
;
6400 pi
->erasure_code_profile
= erasure_code_profile
;
6401 pi
->stripe_width
= stripe_width
;
6402 pi
->cache_target_dirty_ratio_micro
=
6403 g_conf
->osd_pool_default_cache_target_dirty_ratio
* 1000000;
6404 pi
->cache_target_dirty_high_ratio_micro
=
6405 g_conf
->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
6406 pi
->cache_target_full_ratio_micro
=
6407 g_conf
->osd_pool_default_cache_target_full_ratio
* 1000000;
6408 pi
->cache_min_flush_age
= g_conf
->osd_pool_default_cache_min_flush_age
;
6409 pi
->cache_min_evict_age
= g_conf
->osd_pool_default_cache_min_evict_age
;
6410 pending_inc
.new_pool_names
[pool
] = name
;
6414 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
6416 op
->mark_osdmon_event(__func__
);
6418 if (pending_inc
.new_flags
< 0)
6419 pending_inc
.new_flags
= osdmap
.get_flags();
6420 pending_inc
.new_flags
|= flag
;
6421 ss
<< OSDMap::get_flag_string(flag
) << " is set";
6422 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
6423 get_last_committed() + 1));
6427 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
6429 op
->mark_osdmon_event(__func__
);
6431 if (pending_inc
.new_flags
< 0)
6432 pending_inc
.new_flags
= osdmap
.get_flags();
6433 pending_inc
.new_flags
&= ~flag
;
6434 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
6435 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
6436 get_last_committed() + 1));
6440 int OSDMonitor::prepare_command_pool_set(map
<string
,cmd_vartype
> &cmdmap
,
6444 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
6445 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6447 ss
<< "unrecognized pool '" << poolstr
<< "'";
6451 cmd_getval_throws(g_ceph_context
, cmdmap
, "var", var
);
6453 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
6454 if (pending_inc
.new_pools
.count(pool
))
6455 p
= pending_inc
.new_pools
[pool
];
6457 // accept val as a json string in the normal case (current
6458 // generation monitor). parse out int or float values from the
6459 // string as needed. however, if it is not a string, try to pull
6460 // out an int, in case an older monitor with an older json schema is
6461 // forwarding a request.
6463 string interr
, floaterr
;
6466 int64_t uf
= 0; // micro-f
6467 cmd_getval(g_ceph_context
, cmdmap
, "val", val
);
6469 // parse string as both int and float; different fields use different types.
6470 n
= strict_strtoll(val
.c_str(), 10, &interr
);
6471 f
= strict_strtod(val
.c_str(), &floaterr
);
6472 uf
= llrintl(f
* (double)1000000.0);
6475 (var
== "hit_set_type" || var
== "hit_set_period" ||
6476 var
== "hit_set_count" || var
== "hit_set_fpp" ||
6477 var
== "target_max_objects" || var
== "target_max_bytes" ||
6478 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
6479 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
6480 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
6481 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
6482 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
6486 if (var
== "size") {
6487 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
6488 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
6491 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
6492 ss
<< "can not change the size of an erasure-coded pool";
6495 if (interr
.length()) {
6496 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6499 if (n
<= 0 || n
> 10) {
6500 ss
<< "pool size must be between 1 and 10";
6503 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
6510 } else if (var
== "min_size") {
6511 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
6512 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
6515 if (interr
.length()) {
6516 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6520 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
6521 if (n
< 1 || n
> p
.size
) {
6522 ss
<< "pool min_size must be between 1 and " << (int)p
.size
;
6526 ErasureCodeInterfaceRef erasure_code
;
6529 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
6531 k
= erasure_code
->get_data_chunk_count();
6533 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
6537 if (n
< k
|| n
> p
.size
) {
6538 ss
<< "pool min_size must be between " << k
<< " and " << (int)p
.size
;
6543 } else if (var
== "auid") {
6544 if (interr
.length()) {
6545 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6549 } else if (var
== "crash_replay_interval") {
6550 if (interr
.length()) {
6551 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6554 p
.crash_replay_interval
= n
;
6555 } else if (var
== "pg_num") {
6556 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
6557 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
6560 if (interr
.length()) {
6561 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6564 if (n
<= (int)p
.get_pg_num()) {
6565 ss
<< "specified pg_num " << n
<< " <= current " << p
.get_pg_num();
6566 if (n
< (int)p
.get_pg_num())
6570 if (n
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
6571 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
6572 << g_conf
->mon_max_pool_pg_num
6573 << " (you may adjust 'mon max pool pg num' for higher values)";
6576 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
6581 cmd_getval_throws(g_ceph_context
,cmdmap
, "force", force
);
6582 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&&
6583 force
!= "--yes-i-really-mean-it") {
6584 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
6587 int expected_osds
= MIN(p
.get_pg_num(), osdmap
.get_num_osds());
6588 int64_t new_pgs
= n
- p
.get_pg_num();
6589 if (new_pgs
> g_conf
->mon_osd_max_split_count
* expected_osds
) {
6590 ss
<< "specified pg_num " << n
<< " is too large (creating "
6591 << new_pgs
<< " new PGs on ~" << expected_osds
6592 << " OSDs exceeds per-OSD max of " << g_conf
->mon_osd_max_split_count
6597 // force pre-luminous clients to resend their ops, since they
6598 // don't understand that split PGs now form a new interval.
6599 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
6600 } else if (var
== "pgp_num") {
6601 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
6602 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
6605 if (interr
.length()) {
6606 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6610 ss
<< "specified pgp_num must > 0, but you set to " << n
;
6613 if (n
> (int)p
.get_pg_num()) {
6614 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
6618 } else if (var
== "crush_rule") {
6619 int id
= osdmap
.crush
->get_rule_id(val
);
6620 if (id
== -ENOENT
) {
6621 ss
<< "crush rule " << val
<< " does not exist";
6625 ss
<< cpp_strerror(id
);
6628 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
6632 } else if (var
== "nodelete" || var
== "nopgchange" ||
6633 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
6634 var
== "noscrub" || var
== "nodeep-scrub") {
6635 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
6636 // make sure we only compare against 'n' if we didn't receive a string
6637 if (val
== "true" || (interr
.empty() && n
== 1)) {
6639 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6642 ss
<< "expecting value 'true', 'false', '0', or '1'";
6645 } else if (var
== "hashpspool") {
6646 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
6648 cmd_getval_throws(g_ceph_context
, cmdmap
, "force", force
);
6649 if (force
!= "--yes-i-really-mean-it") {
6650 ss
<< "are you SURE? this will remap all placement groups in this pool,"
6651 " this triggers large data movement,"
6652 " pass --yes-i-really-mean-it if you really do.";
6655 // make sure we only compare against 'n' if we didn't receive a string
6656 if (val
== "true" || (interr
.empty() && n
== 1)) {
6658 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6661 ss
<< "expecting value 'true', 'false', '0', or '1'";
6664 } else if (var
== "hit_set_type") {
6666 p
.hit_set_params
= HitSet::Params();
6668 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
6671 if (val
== "bloom") {
6672 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
6673 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
6674 p
.hit_set_params
= HitSet::Params(bsp
);
6675 } else if (val
== "explicit_hash")
6676 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
6677 else if (val
== "explicit_object")
6678 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
6680 ss
<< "unrecognized hit_set type '" << val
<< "'";
6684 } else if (var
== "hit_set_period") {
6685 if (interr
.length()) {
6686 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6689 p
.hit_set_period
= n
;
6690 } else if (var
== "hit_set_count") {
6691 if (interr
.length()) {
6692 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6695 p
.hit_set_count
= n
;
6696 } else if (var
== "hit_set_fpp") {
6697 if (floaterr
.length()) {
6698 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6701 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
6702 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
6705 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
6707 } else if (var
== "use_gmt_hitset") {
6708 if (val
== "true" || (interr
.empty() && n
== 1)) {
6710 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6711 if (!osdmap
.get_num_up_osds() && force
!= "--yes-i-really-mean-it") {
6712 ss
<< "Not advisable to continue since no OSDs are up. Pass "
6713 << "--yes-i-really-mean-it if you really wish to continue.";
6716 if (!(osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
)
6717 && force
!= "--yes-i-really-mean-it") {
6718 ss
<< "not all OSDs support GMT hit set.";
6721 p
.use_gmt_hitset
= true;
6723 ss
<< "expecting value 'true' or '1'";
6726 } else if (var
== "allow_ec_overwrites") {
6727 if (!p
.is_erasure()) {
6728 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
6732 if (!g_conf
->mon_debug_no_require_bluestore_for_ec_overwrites
&&
6733 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
6734 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
6737 if (val
== "true" || (interr
.empty() && n
== 1)) {
6738 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
6739 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6740 ss
<< "ec overwrites cannot be disabled once enabled";
6743 ss
<< "expecting value 'true', 'false', '0', or '1'";
6746 } else if (var
== "target_max_objects") {
6747 if (interr
.length()) {
6748 ss
<< "error parsing int '" << val
<< "': " << interr
;
6751 p
.target_max_objects
= n
;
6752 } else if (var
== "target_max_bytes") {
6753 if (interr
.length()) {
6754 ss
<< "error parsing int '" << val
<< "': " << interr
;
6757 p
.target_max_bytes
= n
;
6758 } else if (var
== "cache_target_dirty_ratio") {
6759 if (floaterr
.length()) {
6760 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6763 if (f
< 0 || f
> 1.0) {
6764 ss
<< "value must be in the range 0..1";
6767 p
.cache_target_dirty_ratio_micro
= uf
;
6768 } else if (var
== "cache_target_dirty_high_ratio") {
6769 if (floaterr
.length()) {
6770 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6773 if (f
< 0 || f
> 1.0) {
6774 ss
<< "value must be in the range 0..1";
6777 p
.cache_target_dirty_high_ratio_micro
= uf
;
6778 } else if (var
== "cache_target_full_ratio") {
6779 if (floaterr
.length()) {
6780 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6783 if (f
< 0 || f
> 1.0) {
6784 ss
<< "value must be in the range 0..1";
6787 p
.cache_target_full_ratio_micro
= uf
;
6788 } else if (var
== "cache_min_flush_age") {
6789 if (interr
.length()) {
6790 ss
<< "error parsing int '" << val
<< "': " << interr
;
6793 p
.cache_min_flush_age
= n
;
6794 } else if (var
== "cache_min_evict_age") {
6795 if (interr
.length()) {
6796 ss
<< "error parsing int '" << val
<< "': " << interr
;
6799 p
.cache_min_evict_age
= n
;
6800 } else if (var
== "min_read_recency_for_promote") {
6801 if (interr
.length()) {
6802 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6805 p
.min_read_recency_for_promote
= n
;
6806 } else if (var
== "hit_set_grade_decay_rate") {
6807 if (interr
.length()) {
6808 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6811 if (n
> 100 || n
< 0) {
6812 ss
<< "value out of range,valid range is 0 - 100";
6815 p
.hit_set_grade_decay_rate
= n
;
6816 } else if (var
== "hit_set_search_last_n") {
6817 if (interr
.length()) {
6818 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6821 if (n
> p
.hit_set_count
|| n
< 0) {
6822 ss
<< "value out of range,valid range is 0 - hit_set_count";
6825 p
.hit_set_search_last_n
= n
;
6826 } else if (var
== "min_write_recency_for_promote") {
6827 if (interr
.length()) {
6828 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6831 p
.min_write_recency_for_promote
= n
;
6832 } else if (var
== "fast_read") {
6833 if (p
.is_replicated()) {
6834 ss
<< "fast read is not supported in replication pool";
6837 if (val
== "true" || (interr
.empty() && n
== 1)) {
6839 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6840 p
.fast_read
= false;
6842 ss
<< "expecting value 'true', 'false', '0', or '1'";
6845 } else if (pool_opts_t::is_opt_name(var
)) {
6846 bool unset
= val
== "unset";
6847 if (var
== "compression_mode") {
6849 auto cmode
= Compressor::get_comp_mode_type(val
);
6851 ss
<< "unrecognized compression mode '" << val
<< "'";
6855 } else if (var
== "compression_algorithm") {
6857 auto alg
= Compressor::get_comp_alg_type(val
);
6859 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
6863 } else if (var
== "compression_required_ratio") {
6864 if (floaterr
.length()) {
6865 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
6868 if (f
< 0 || f
> 1) {
6869 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
6872 } else if (var
== "csum_type") {
6873 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
6875 ss
<< "unrecognized csum_type '" << val
<< "'";
6878 //preserve csum_type numeric value
6881 } else if (var
== "compression_max_blob_size" ||
6882 var
== "compression_min_blob_size" ||
6883 var
== "csum_max_block" ||
6884 var
== "csum_min_block") {
6885 if (interr
.length()) {
6886 ss
<< "error parsing int value '" << val
<< "': " << interr
;
6891 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
6892 switch (desc
.type
) {
6893 case pool_opts_t::STR
:
6895 p
.opts
.unset(desc
.key
);
6897 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
6900 case pool_opts_t::INT
:
6901 if (interr
.length()) {
6902 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6906 p
.opts
.unset(desc
.key
);
6908 p
.opts
.set(desc
.key
, static_cast<int>(n
));
6911 case pool_opts_t::DOUBLE
:
6912 if (floaterr
.length()) {
6913 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6917 p
.opts
.unset(desc
.key
);
6919 p
.opts
.set(desc
.key
, static_cast<double>(f
));
6923 assert(!"unknown type");
6926 ss
<< "unrecognized variable '" << var
<< "'";
6929 if (val
!= "unset") {
6930 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
6932 ss
<< "unset pool " << pool
<< " " << var
;
6934 p
.last_change
= pending_inc
.epoch
;
6935 pending_inc
.new_pools
[pool
] = p
;
6939 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
6940 map
<string
,cmd_vartype
> &cmdmap
,
6944 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", pool_name
);
6945 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6947 ss
<< "unrecognized pool '" << pool_name
<< "'";
6951 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
6952 if (pending_inc
.new_pools
.count(pool
)) {
6953 p
= pending_inc
.new_pools
[pool
];
6957 cmd_getval_throws(g_ceph_context
, cmdmap
, "app", app
);
6958 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
6960 if (boost::algorithm::ends_with(prefix
, "enable")) {
6962 ss
<< "application name must be provided";
6967 ss
<< "application must be enabled on base tier";
6972 cmd_getval_throws(g_ceph_context
, cmdmap
, "force", force
);
6974 if (!app_exists
&& !p
.application_metadata
.empty() &&
6975 force
!= "--yes-i-really-mean-it") {
6976 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
6977 << "application; pass --yes-i-really-mean-it to proceed anyway";
6981 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
6982 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
6983 << "max " << MAX_POOL_APPLICATIONS
;
6987 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
6988 ss
<< "application name '" << app
<< "' too long; max length "
6989 << MAX_POOL_APPLICATION_LENGTH
;
6994 p
.application_metadata
[app
] = {};
6996 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
6998 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
7000 cmd_getval_throws(g_ceph_context
, cmdmap
, "force", force
);
7002 if (force
!= "--yes-i-really-mean-it") {
7003 ss
<< "Are you SURE? Disabling an application within a pool might result "
7004 << "in loss of application functionality; pass "
7005 << "--yes-i-really-mean-it to proceed anyway";
7010 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7012 return 0; // idempotent
7015 p
.application_metadata
.erase(app
);
7016 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
7018 } else if (boost::algorithm::ends_with(prefix
, "set")) {
7020 ss
<< "application metadata must be set on base tier";
7025 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7031 cmd_getval_throws(g_ceph_context
, cmdmap
, "key", key
);
7034 ss
<< "key must be provided";
7038 auto &app_keys
= p
.application_metadata
[app
];
7039 if (app_keys
.count(key
) == 0 &&
7040 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
7041 ss
<< "too many keys set for application '" << app
<< "' on pool '"
7042 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
7046 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7047 ss
<< "key '" << app
<< "' too long; max length "
7048 << MAX_POOL_APPLICATION_LENGTH
;
7053 cmd_getval_throws(g_ceph_context
, cmdmap
, "value", value
);
7054 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
7055 ss
<< "value '" << value
<< "' too long; max length "
7056 << MAX_POOL_APPLICATION_LENGTH
;
7060 p
.application_metadata
[app
][key
] = value
;
7061 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
7062 << value
<< "' on pool '" << pool_name
<< "'";
7063 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
7065 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
7071 cmd_getval_throws(g_ceph_context
, cmdmap
, "key", key
);
7072 auto it
= p
.application_metadata
[app
].find(key
);
7073 if (it
== p
.application_metadata
[app
].end()) {
7074 ss
<< "application '" << app
<< "' on pool '" << pool_name
7075 << "' does not have key '" << key
<< "'";
7076 return 0; // idempotent
7079 p
.application_metadata
[app
].erase(it
);
7080 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
7081 << pool_name
<< "'";
7086 p
.last_change
= pending_inc
.epoch
;
7087 pending_inc
.new_pools
[pool
] = p
;
7091 int OSDMonitor::_prepare_command_osd_crush_remove(
7092 CrushWrapper
&newcrush
,
7101 err
= newcrush
.remove_item_under(g_ceph_context
, id
, ancestor
,
7104 err
= newcrush
.remove_item(g_ceph_context
, id
, unlink_only
);
7109 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
7111 pending_inc
.crush
.clear();
7112 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7115 int OSDMonitor::prepare_command_osd_crush_remove(
7116 CrushWrapper
&newcrush
,
7122 int err
= _prepare_command_osd_crush_remove(
7123 newcrush
, id
, ancestor
,
7124 has_ancestor
, unlink_only
);
7130 do_osd_crush_remove(newcrush
);
7135 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
7137 if (osdmap
.is_up(id
)) {
7141 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
7142 pending_inc
.new_uuid
[id
] = uuid_d();
7143 pending_metadata_rm
.insert(id
);
7144 pending_metadata
.erase(id
);
7149 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
7151 assert(existing_id
);
7154 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
7155 if (!osdmap
.exists(i
) &&
7156 pending_inc
.new_up_client
.count(i
) == 0 &&
7157 (pending_inc
.new_state
.count(i
) == 0 ||
7158 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
7164 if (pending_inc
.new_max_osd
< 0) {
7165 return osdmap
.get_max_osd();
7167 return pending_inc
.new_max_osd
;
7170 void OSDMonitor::do_osd_create(
7173 const string
& device_class
,
7176 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
7179 // We presume validation has been performed prior to calling this
7180 // function. We assert with prejudice.
7182 int32_t allocated_id
= -1; // declare here so we can jump
7183 int32_t existing_id
= -1;
7184 if (!uuid
.is_zero()) {
7185 existing_id
= osdmap
.identify_osd(uuid
);
7186 if (existing_id
>= 0) {
7187 assert(id
< 0 || id
== existing_id
);
7188 *new_id
= existing_id
;
7190 } else if (id
>= 0) {
7191 // uuid does not exist, and id has been provided, so just create
7198 // allocate a new id
7199 allocated_id
= _allocate_osd_id(&existing_id
);
7200 dout(10) << __func__
<< " allocated id " << allocated_id
7201 << " existing id " << existing_id
<< dendl
;
7202 if (existing_id
>= 0) {
7203 assert(existing_id
< osdmap
.get_max_osd());
7204 assert(allocated_id
< 0);
7205 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
7206 *new_id
= existing_id
;
7207 } else if (allocated_id
>= 0) {
7208 assert(existing_id
< 0);
7210 if (pending_inc
.new_max_osd
< 0) {
7211 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
7213 ++pending_inc
.new_max_osd
;
7215 *new_id
= pending_inc
.new_max_osd
- 1;
7216 assert(*new_id
== allocated_id
);
7218 assert(0 == "unexpected condition");
7222 if (device_class
.size()) {
7223 CrushWrapper newcrush
;
7224 _get_pending_crush(newcrush
);
7225 if (newcrush
.get_max_devices() < *new_id
+ 1) {
7226 newcrush
.set_max_devices(*new_id
+ 1);
7228 string name
= string("osd.") + stringify(*new_id
);
7229 if (!newcrush
.item_exists(*new_id
)) {
7230 newcrush
.set_item_name(*new_id
, name
);
7233 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
7235 derr
<< __func__
<< " failed to set " << name
<< " device_class "
7236 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
7238 // non-fatal... this might be a replay and we want to be idempotent.
7240 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
7242 pending_inc
.crush
.clear();
7243 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7246 dout(20) << __func__
<< " no device_class" << dendl
;
7249 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
7250 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
7251 pending_inc
.new_max_osd
= *new_id
+ 1;
7254 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
7255 if (!uuid
.is_zero())
7256 pending_inc
.new_uuid
[*new_id
] = uuid
;
7259 int OSDMonitor::validate_osd_create(
7262 const bool check_osd_exists
,
7263 int32_t* existing_id
,
7267 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
7268 << " check_osd_exists " << check_osd_exists
<< dendl
;
7270 assert(existing_id
);
7272 if (id
< 0 && uuid
.is_zero()) {
7273 // we have nothing to validate
7276 } else if (uuid
.is_zero()) {
7277 // we have an id but we will ignore it - because that's what
7278 // `osd create` does.
7283 * This function will be used to validate whether we are able to
7284 * create a new osd when the `uuid` is specified.
7286 * It will be used by both `osd create` and `osd new`, as the checks
7287 * are basically the same when it pertains to osd id and uuid validation.
7288 * However, `osd create` presumes an `uuid` is optional, for legacy
7289 * reasons, while `osd new` requires the `uuid` to be provided. This
7290 * means that `osd create` will not be idempotent if an `uuid` is not
7291 * provided, but we will always guarantee the idempotency of `osd new`.
7294 assert(!uuid
.is_zero());
7295 if (pending_inc
.identify_osd(uuid
) >= 0) {
7296 // osd is about to exist
7300 int32_t i
= osdmap
.identify_osd(uuid
);
7302 // osd already exists
7303 if (id
>= 0 && i
!= id
) {
7304 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
7307 // return a positive errno to distinguish between a blocking error
7308 // and an error we consider to not be a problem (i.e., this would be
7309 // an idempotent operation).
7315 if (pending_inc
.new_state
.count(id
)) {
7316 // osd is about to exist
7319 // we may not care if an osd exists if we are recreating a previously
7321 if (check_osd_exists
&& osdmap
.exists(id
)) {
7322 ss
<< "id " << id
<< " already in use and does not match uuid "
7330 int OSDMonitor::prepare_command_osd_create(
7333 int32_t* existing_id
,
7336 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
7337 assert(existing_id
);
7338 if (osdmap
.is_destroyed(id
)) {
7339 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
7344 if (uuid
.is_zero()) {
7345 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
7348 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
7351 int OSDMonitor::prepare_command_osd_new(
7353 const map
<string
,cmd_vartype
>& cmdmap
,
7354 const map
<string
,string
>& params
,
7362 assert(paxos
->is_plugged());
7364 dout(10) << __func__
<< " " << op
<< dendl
;
7366 /* validate command. abort now if something's wrong. */
7368 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
7370 * If `id` is not specified, we will identify any existing osd based
7371 * on `uuid`. Operation will be idempotent iff secrets match.
7373 * If `id` is specified, we will identify any existing osd based on
7374 * `uuid` and match against `id`. If they match, operation will be
7375 * idempotent iff secrets match.
7377 * `-i secrets.json` will be optional. If supplied, will be used
7378 * to check for idempotency when `id` and `uuid` match.
7380 * If `id` is not specified, and `uuid` does not exist, an id will
7381 * be found or allocated for the osd.
7383 * If `id` is specified, and the osd has been previously marked
7384 * as destroyed, then the `id` will be reused.
7386 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
7387 ss
<< "requires the OSD's UUID to be specified.";
7389 } else if (!uuid
.parse(uuidstr
.c_str())) {
7390 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
7394 if (cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id
) &&
7396 ss
<< "invalid OSD id; must be greater or equal than zero.";
7400 // are we running an `osd create`-like command, or recreating
7401 // a previously destroyed osd?
7403 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
7405 // we will care about `id` to assess whether osd is `destroyed`, or
7406 // to create a new osd.
7407 // we will need an `id` by the time we reach auth.
7409 int32_t existing_id
= -1;
7410 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
7413 bool may_be_idempotent
= false;
7414 if (err
== EEXIST
) {
7415 // this is idempotent from the osdmon's point-of-view
7416 may_be_idempotent
= true;
7417 assert(existing_id
>= 0);
7419 } else if (err
< 0) {
7423 if (!may_be_idempotent
) {
7424 // idempotency is out of the window. We are either creating a new
7425 // osd or recreating a destroyed osd.
7427 // We now need to figure out if we have an `id` (and if it's valid),
7428 // of find an `id` if we don't have one.
7430 // NOTE: we need to consider the case where the `id` is specified for
7431 // `osd create`, and we must honor it. So this means checking if
7432 // the `id` is destroyed, and if so assume the destroy; otherwise,
7433 // check if it `exists` - in which case we complain about not being
7434 // `destroyed`. In the end, if nothing fails, we must allow the
7435 // creation, so that we are compatible with `create`.
7436 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
7437 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
7438 ss
<< "OSD " << id
<< " has not yet been destroyed";
7440 } else if (id
< 0) {
7442 id
= _allocate_osd_id(&existing_id
);
7444 assert(existing_id
>= 0);
7447 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
7448 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
7449 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
7451 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
7455 assert(osdmap
.exists(id
));
7458 // we are now able to either create a brand new osd or reuse an existing
7459 // osd that has been previously destroyed.
7461 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
7463 if (may_be_idempotent
&& params
.empty()) {
7464 // nothing to do, really.
7465 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
7468 f
->open_object_section("created_osd");
7469 f
->dump_int("osdid", id
);
7477 string device_class
;
7478 auto p
= params
.find("crush_device_class");
7479 if (p
!= params
.end()) {
7480 device_class
= p
->second
;
7481 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
7483 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
7484 bool has_lockbox
= false;
7485 bool has_secrets
= params
.count("cephx_secret")
7486 || params
.count("cephx_lockbox_secret")
7487 || params
.count("dmcrypt_key");
7489 ConfigKeyService
*svc
= nullptr;
7490 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
7493 if (params
.count("cephx_secret") == 0) {
7494 ss
<< "requires a cephx secret.";
7497 cephx_secret
= params
.at("cephx_secret");
7499 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
7500 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
7502 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
7503 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
7505 if (has_lockbox_secret
&& has_dmcrypt_key
) {
7507 lockbox_secret
= params
.at("cephx_lockbox_secret");
7508 dmcrypt_key
= params
.at("dmcrypt_key");
7509 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
7510 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
7514 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
7516 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
7524 } else if (may_be_idempotent
&& err
!= EEXIST
) {
7525 // for this to be idempotent, `id` should already be >= 0; no need
7526 // to use validate_id.
7528 ss
<< "osd." << id
<< " exists but secrets do not match";
7533 svc
= (ConfigKeyService
*)mon
->config_key_service
;
7534 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
7537 } else if (may_be_idempotent
&& err
!= EEXIST
) {
7539 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
7544 assert(!has_secrets
|| !cephx_secret
.empty());
7545 assert(!has_lockbox
|| !lockbox_secret
.empty());
7547 if (may_be_idempotent
) {
7548 // we have nothing to do for either the osdmon or the authmon,
7549 // and we have no lockbox - so the config key service will not be
7550 // touched. This is therefore an idempotent operation, and we can
7551 // just return right away.
7552 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
7555 f
->open_object_section("created_osd");
7556 f
->dump_int("osdid", id
);
7563 assert(!may_be_idempotent
);
7567 assert(!cephx_secret
.empty());
7568 assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
7569 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
7571 err
= mon
->authmon()->do_osd_new(cephx_entity
,
7577 assert(nullptr != svc
);
7578 svc
->do_osd_new(uuid
, dmcrypt_key
);
7582 if (is_recreate_destroyed
) {
7584 assert(osdmap
.is_destroyed(id
));
7585 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
7586 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
| CEPH_OSD_NEW
;
7587 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
7588 // due to http://tracker.ceph.com/issues/20751 some clusters may
7589 // have UP set for non-existent OSDs; make sure it is cleared
7590 // for a newly created osd.
7591 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
7593 pending_inc
.new_uuid
[id
] = uuid
;
7596 int32_t new_id
= -1;
7597 do_osd_create(id
, uuid
, device_class
, &new_id
);
7598 assert(new_id
>= 0);
7599 assert(id
== new_id
);
7603 f
->open_object_section("created_osd");
7604 f
->dump_int("osdid", id
);
7613 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
7615 op
->mark_osdmon_event(__func__
);
7616 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
7618 map
<string
, cmd_vartype
> cmdmap
;
7619 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
7620 string rs
= ss
.str();
7621 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
7625 MonSession
*session
= m
->get_session();
7627 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
7631 return prepare_command_impl(op
, cmdmap
);
7634 static int parse_reweights(CephContext
*cct
,
7635 const map
<string
,cmd_vartype
> &cmdmap
,
7636 const OSDMap
& osdmap
,
7637 map
<int32_t, uint32_t>* weights
)
7640 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "weights", weights_str
)) {
7643 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
7644 json_spirit::mValue json_value
;
7645 if (!json_spirit::read(weights_str
, json_value
)) {
7648 if (json_value
.type() != json_spirit::obj_type
) {
7651 const auto obj
= json_value
.get_obj();
7653 for (auto& osd_weight
: obj
) {
7654 auto osd_id
= std::stoi(osd_weight
.first
);
7655 if (!osdmap
.exists(osd_id
)) {
7658 if (osd_weight
.second
.type() != json_spirit::str_type
) {
7661 auto weight
= std::stoul(osd_weight
.second
.get_str());
7662 weights
->insert({osd_id
, weight
});
7664 } catch (const std::logic_error
& e
) {
7670 int OSDMonitor::prepare_command_osd_destroy(
7674 assert(paxos
->is_plugged());
7676 // we check if the osd exists for the benefit of `osd purge`, which may
7677 // have previously removed the osd. If the osd does not exist, return
7678 // -ENOENT to convey this, and let the caller deal with it.
7680 // we presume that all auth secrets and config keys were removed prior
7681 // to this command being called. if they exist by now, we also assume
7682 // they must have been created by some other command and do not pertain
7683 // to this non-existent osd.
7684 if (!osdmap
.exists(id
)) {
7685 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
7689 uuid_d uuid
= osdmap
.get_uuid(id
);
7690 dout(10) << __func__
<< " destroying osd." << id
7691 << " uuid " << uuid
<< dendl
;
7693 // if it has been destroyed, we assume our work here is done.
7694 if (osdmap
.is_destroyed(id
)) {
7695 ss
<< "destroyed osd." << id
;
7699 EntityName cephx_entity
, lockbox_entity
;
7700 bool idempotent_auth
= false, idempotent_cks
= false;
7702 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
7707 if (err
== -ENOENT
) {
7708 idempotent_auth
= true;
7714 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
7715 err
= svc
->validate_osd_destroy(id
, uuid
);
7717 assert(err
== -ENOENT
);
7719 idempotent_cks
= true;
7722 if (!idempotent_auth
) {
7723 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
7727 if (!idempotent_cks
) {
7728 svc
->do_osd_destroy(id
, uuid
);
7731 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
7732 pending_inc
.new_uuid
[id
] = uuid_d();
7734 // we can only propose_pending() once per service, otherwise we'll be
7735 // defying PaxosService and all laws of nature. Therefore, as we may
7736 // be used during 'osd purge', let's keep the caller responsible for
7742 int OSDMonitor::prepare_command_osd_purge(
7746 assert(paxos
->is_plugged());
7747 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
7749 assert(!osdmap
.is_up(id
));
7752 * This may look a bit weird, but this is what's going to happen:
7754 * 1. we make sure that removing from crush works
7755 * 2. we call `prepare_command_osd_destroy()`. If it returns an
7756 * error, then we abort the whole operation, as no updates
7757 * have been made. However, we this function will have
7758 * side-effects, thus we need to make sure that all operations
7759 * performed henceforth will *always* succeed.
7760 * 3. we call `prepare_command_osd_remove()`. Although this
7761 * function can return an error, it currently only checks if the
7762 * osd is up - and we have made sure that it is not so, so there
7763 * is no conflict, and it is effectively an update.
7764 * 4. finally, we call `do_osd_crush_remove()`, which will perform
7765 * the crush update we delayed from before.
7768 CrushWrapper newcrush
;
7769 _get_pending_crush(newcrush
);
7771 bool may_be_idempotent
= false;
7773 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
7774 if (err
== -ENOENT
) {
7776 may_be_idempotent
= true;
7777 } else if (err
< 0) {
7778 ss
<< "error removing osd." << id
<< " from crush";
7782 // no point destroying the osd again if it has already been marked destroyed
7783 if (!osdmap
.is_destroyed(id
)) {
7784 err
= prepare_command_osd_destroy(id
, ss
);
7786 if (err
== -ENOENT
) {
7792 may_be_idempotent
= false;
7797 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
7798 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
7799 << "we are idempotent." << dendl
;
7803 err
= prepare_command_osd_remove(id
);
7804 // we should not be busy, as we should have made sure this id is not up.
7807 do_osd_crush_remove(newcrush
);
7811 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
7812 map
<string
,cmd_vartype
> &cmdmap
)
7814 op
->mark_osdmon_event(__func__
);
7815 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
7823 cmd_getval_throws(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
7824 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
7827 cmd_getval_throws(g_ceph_context
, cmdmap
, "prefix", prefix
);
7831 bool osdid_present
= false;
7832 if (prefix
!= "osd pg-temp" &&
7833 prefix
!= "osd pg-upmap" &&
7834 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
7835 osdid_present
= cmd_getval_throws(g_ceph_context
, cmdmap
, "id", osdid
);
7837 if (osdid_present
) {
7839 oss
<< "osd." << osdid
;
7843 // Even if there's a pending state with changes that could affect
7844 // a command, considering that said state isn't yet committed, we
7845 // just don't care about those changes if the command currently being
7846 // handled acts as a no-op against the current committed state.
7847 // In a nutshell, we assume this command happens *before*.
7849 // Let me make this clearer:
7851 // - If we have only one client, and that client issues some
7852 // operation that would conflict with this operation but is
7853 // still on the pending state, then we would be sure that said
7854 // operation wouldn't have returned yet, so the client wouldn't
7855 // issue this operation (unless the client didn't wait for the
7856 // operation to finish, and that would be the client's own fault).
7858 // - If we have more than one client, each client will observe
7859 // whatever is the state at the moment of the commit. So, if we
7860 // have two clients, one issuing an unlink and another issuing a
7861 // link, and if the link happens while the unlink is still on the
7862 // pending state, from the link's point-of-view this is a no-op.
7863 // If different clients are issuing conflicting operations and
7864 // they care about that, then the clients should make sure they
7865 // enforce some kind of concurrency mechanism -- from our
7866 // perspective that's what Douglas Adams would call an SEP.
7868 // This should be used as a general guideline for most commands handled
7869 // in this function. Adapt as you see fit, but please bear in mind that
7870 // this is the expected behavior.
7873 if (prefix
== "osd setcrushmap" ||
7874 (prefix
== "osd crush set" && !osdid_present
)) {
7875 if (pending_inc
.crush
.length()) {
7876 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
7877 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7880 dout(10) << "prepare_command setting new crush map" << dendl
;
7881 bufferlist
data(m
->get_data());
7884 bufferlist::iterator
bl(data
.begin());
7887 catch (const std::exception
&e
) {
7889 ss
<< "Failed to parse crushmap: " << e
.what();
7893 int64_t prior_version
= 0;
7894 if (cmd_getval_throws(g_ceph_context
, cmdmap
, "prior_version", prior_version
)) {
7895 if (prior_version
== osdmap
.get_crush_version() - 1) {
7896 // see if we are a resend of the last update. this is imperfect
7897 // (multiple racing updaters may not both get reliable success)
7898 // but we expect crush updaters (via this interface) to be rare-ish.
7899 bufferlist current
, proposed
;
7900 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
7901 crush
.encode(proposed
, mon
->get_quorum_con_features());
7902 if (current
.contents_equal(proposed
)) {
7903 dout(10) << __func__
7904 << " proposed matches current and version equals previous"
7907 ss
<< osdmap
.get_crush_version();
7911 if (prior_version
!= osdmap
.get_crush_version()) {
7913 ss
<< "prior_version " << prior_version
<< " != crush version "
7914 << osdmap
.get_crush_version();
7919 if (crush
.has_legacy_rule_ids()) {
7921 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
7924 if (!validate_crush_against_features(&crush
, ss
)) {
7929 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
7934 if (g_conf
->mon_osd_crush_smoke_test
) {
7935 // sanity check: test some inputs to make sure this map isn't
7937 dout(10) << " testing map" << dendl
;
7939 CrushTester
tester(crush
, ess
);
7940 tester
.set_min_x(0);
7941 tester
.set_max_x(50);
7942 auto start
= ceph::coarse_mono_clock::now();
7943 int r
= tester
.test_with_fork(g_conf
->mon_lease
);
7944 auto duration
= ceph::coarse_mono_clock::now() - start
;
7946 dout(10) << " tester.test_with_fork returns " << r
7947 << ": " << ess
.str() << dendl
;
7948 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
7952 dout(10) << __func__
<< " crush somke test duration: "
7953 << duration
<< ", result: " << ess
.str() << dendl
;
7956 pending_inc
.crush
= data
;
7957 ss
<< osdmap
.get_crush_version() + 1;
7960 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
7961 CrushWrapper newcrush
;
7962 _get_pending_crush(newcrush
);
7963 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
7965 if (newcrush
.bucket_exists(bid
) &&
7966 newcrush
.get_bucket_alg(bid
)) {
7967 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
7968 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
7971 if (!validate_crush_against_features(&newcrush
, ss
)) {
7975 pending_inc
.crush
.clear();
7976 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7977 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7978 get_last_committed() + 1));
7980 } else if (prefix
== "osd crush set-device-class") {
7981 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7982 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7983 << "luminous' before using crush device classes";
7988 string device_class
;
7989 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "class", device_class
)) {
7990 err
= -EINVAL
; // no value!
7995 vector
<string
> idvec
;
7996 cmd_getval_throws(g_ceph_context
, cmdmap
, "ids", idvec
);
7997 CrushWrapper newcrush
;
7998 _get_pending_crush(newcrush
);
8000 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8004 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8005 osdmap
.get_all_osds(osds
);
8008 // try traditional single osd way
8009 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8011 // ss has reason for failure
8012 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8019 for (auto &osd
: osds
) {
8020 if (!osdmap
.exists(osd
)) {
8021 ss
<< "osd." << osd
<< " does not exist. ";
8026 oss
<< "osd." << osd
;
8027 string name
= oss
.str();
8029 if (newcrush
.get_max_devices() < osd
+ 1) {
8030 newcrush
.set_max_devices(osd
+ 1);
8033 if (newcrush
.item_exists(osd
)) {
8034 action
= "updating";
8036 action
= "creating";
8037 newcrush
.set_item_name(osd
, name
);
8040 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
8041 << "' device_class '" << device_class
<< "'"
8043 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
8047 if (err
== 0 && !_have_pending_crush()) {
8049 // for single osd only, wildcard makes too much noise
8050 ss
<< "set-device-class item id " << osd
<< " name '" << name
8051 << "' device_class '" << device_class
<< "': no change";
8054 updated
.insert(osd
);
8059 if (!updated
.empty()) {
8060 pending_inc
.crush
.clear();
8061 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8062 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
8064 wait_for_finished_proposal(op
,
8065 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
8069 } else if (prefix
== "osd crush rm-device-class") {
8071 vector
<string
> idvec
;
8072 cmd_getval_throws(g_ceph_context
, cmdmap
, "ids", idvec
);
8073 CrushWrapper newcrush
;
8074 _get_pending_crush(newcrush
);
8077 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8082 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8083 osdmap
.get_all_osds(osds
);
8086 // try traditional single osd way
8087 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8089 // ss has reason for failure
8090 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8097 for (auto &osd
: osds
) {
8098 if (!osdmap
.exists(osd
)) {
8099 ss
<< "osd." << osd
<< " does not exist. ";
8103 auto class_name
= newcrush
.get_item_class(osd
);
8105 ss
<< "osd." << osd
<< " belongs to no class, ";
8108 // note that we do not verify if class_is_in_use here
8109 // in case the device is misclassified and user wants
8110 // to forcibly reset...
8112 err
= newcrush
.remove_device_class(g_ceph_context
, osd
, &ss
);
8114 // ss has reason for failure
8117 updated
.insert(osd
);
8121 if (!updated
.empty()) {
8122 pending_inc
.crush
.clear();
8123 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8124 ss
<< "done removing class of osd(s): " << updated
;
8126 wait_for_finished_proposal(op
,
8127 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
8130 } else if (prefix
== "osd crush class rename") {
8131 string srcname
, dstname
;
8132 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "srcname", srcname
)) {
8136 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "dstname", dstname
)) {
8141 CrushWrapper newcrush
;
8142 _get_pending_crush(newcrush
);
8143 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
8144 // suppose this is a replay and return success
8145 // so command is idempotent
8146 ss
<< "already renamed to '" << dstname
<< "'";
8151 err
= newcrush
.rename_class(srcname
, dstname
);
8153 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
8154 << cpp_strerror(err
);
8158 pending_inc
.crush
.clear();
8159 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8160 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
8162 } else if (prefix
== "osd crush add-bucket") {
8163 // osd crush add-bucket <name> <type>
8164 string name
, typestr
;
8165 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8166 cmd_getval_throws(g_ceph_context
, cmdmap
, "type", typestr
);
8168 if (!_have_pending_crush() &&
8169 _get_stable_crush().name_exists(name
)) {
8170 ss
<< "bucket '" << name
<< "' already exists";
8174 CrushWrapper newcrush
;
8175 _get_pending_crush(newcrush
);
8177 if (newcrush
.name_exists(name
)) {
8178 ss
<< "bucket '" << name
<< "' already exists";
8181 int type
= newcrush
.get_type_id(typestr
);
8183 ss
<< "type '" << typestr
<< "' does not exist";
8188 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
8193 err
= newcrush
.add_bucket(0, 0,
8194 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
8197 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
8200 err
= newcrush
.set_item_name(bucketno
, name
);
8202 ss
<< "error setting bucket name to '" << name
<< "'";
8206 pending_inc
.crush
.clear();
8207 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8208 ss
<< "added bucket " << name
<< " type " << typestr
8211 } else if (prefix
== "osd crush rename-bucket") {
8212 string srcname
, dstname
;
8213 cmd_getval_throws(g_ceph_context
, cmdmap
, "srcname", srcname
);
8214 cmd_getval_throws(g_ceph_context
, cmdmap
, "dstname", dstname
);
8216 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
8217 if (err
== -EALREADY
) // equivalent to success for idempotency
8223 } else if (prefix
== "osd crush weight-set create" ||
8224 prefix
== "osd crush weight-set create-compat") {
8225 CrushWrapper newcrush
;
8226 _get_pending_crush(newcrush
);
8229 if (newcrush
.has_non_straw2_buckets()) {
8230 ss
<< "crush map contains one or more bucket(s) that are not straw2";
8234 if (prefix
== "osd crush weight-set create") {
8235 if (osdmap
.require_min_compat_client
> 0 &&
8236 osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
8237 ss
<< "require_min_compat_client "
8238 << ceph_release_name(osdmap
.require_min_compat_client
)
8239 << " < luminous, which is required for per-pool weight-sets. "
8240 << "Try 'ceph osd set-require-min-compat-client luminous' "
8241 << "before using the new interface";
8245 string poolname
, mode
;
8246 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolname
);
8247 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
8249 ss
<< "pool '" << poolname
<< "' not found";
8253 cmd_getval_throws(g_ceph_context
, cmdmap
, "mode", mode
);
8254 if (mode
!= "flat" && mode
!= "positional") {
8255 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
8259 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
8261 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
8264 newcrush
.create_choose_args(pool
, positions
);
8265 pending_inc
.crush
.clear();
8266 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8269 } else if (prefix
== "osd crush weight-set rm" ||
8270 prefix
== "osd crush weight-set rm-compat") {
8271 CrushWrapper newcrush
;
8272 _get_pending_crush(newcrush
);
8274 if (prefix
== "osd crush weight-set rm") {
8276 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolname
);
8277 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
8279 ss
<< "pool '" << poolname
<< "' not found";
8284 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
8286 newcrush
.rm_choose_args(pool
);
8287 pending_inc
.crush
.clear();
8288 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8291 } else if (prefix
== "osd crush weight-set reweight" ||
8292 prefix
== "osd crush weight-set reweight-compat") {
8293 string poolname
, item
;
8294 vector
<double> weight
;
8295 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolname
);
8296 cmd_getval_throws(g_ceph_context
, cmdmap
, "item", item
);
8297 cmd_getval_throws(g_ceph_context
, cmdmap
, "weight", weight
);
8298 CrushWrapper newcrush
;
8299 _get_pending_crush(newcrush
);
8301 if (prefix
== "osd crush weight-set reweight") {
8302 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
8304 ss
<< "pool '" << poolname
<< "' not found";
8308 if (!newcrush
.have_choose_args(pool
)) {
8309 ss
<< "no weight-set for pool '" << poolname
<< "'";
8313 auto arg_map
= newcrush
.choose_args_get(pool
);
8314 int positions
= newcrush
.get_choose_args_positions(arg_map
);
8315 if (weight
.size() != (size_t)positions
) {
8316 ss
<< "must specify exact " << positions
<< " weight values";
8321 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
8322 if (!newcrush
.have_choose_args(pool
)) {
8323 ss
<< "no backward-compatible weight-set";
8328 if (!newcrush
.name_exists(item
)) {
8329 ss
<< "item '" << item
<< "' does not exist";
8333 err
= newcrush
.choose_args_adjust_item_weightf(
8335 newcrush
.choose_args_get(pool
),
8336 newcrush
.get_item_id(item
),
8343 pending_inc
.crush
.clear();
8344 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8346 } else if (osdid_present
&&
8347 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
8348 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
8349 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
8350 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
8352 if (!osdmap
.exists(osdid
)) {
8354 ss
<< name
<< " does not exist. Create it before updating the crush map";
8359 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "weight", weight
)) {
8360 ss
<< "unable to parse weight value '"
8361 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8367 vector
<string
> argvec
;
8368 cmd_getval_throws(g_ceph_context
, cmdmap
, "args", argvec
);
8369 map
<string
,string
> loc
;
8370 CrushWrapper::parse_loc_map(argvec
, &loc
);
8372 if (prefix
== "osd crush set"
8373 && !_get_stable_crush().item_exists(osdid
)) {
8375 ss
<< "unable to set item id " << osdid
<< " name '" << name
8376 << "' weight " << weight
<< " at location " << loc
8377 << ": does not exist";
8381 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
8382 << name
<< "' weight " << weight
<< " at location "
8384 CrushWrapper newcrush
;
8385 _get_pending_crush(newcrush
);
8388 if (prefix
== "osd crush set" ||
8389 newcrush
.check_item_loc(g_ceph_context
, osdid
, loc
, (int *)NULL
)) {
8391 err
= newcrush
.update_item(g_ceph_context
, osdid
, weight
, name
, loc
);
8394 err
= newcrush
.insert_item(g_ceph_context
, osdid
, weight
, name
, loc
);
8402 if (err
== 0 && !_have_pending_crush()) {
8403 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
8404 << weight
<< " at location " << loc
<< ": no change";
8408 pending_inc
.crush
.clear();
8409 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8410 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
8411 << weight
<< " at location " << loc
<< " to crush map";
8413 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8414 get_last_committed() + 1));
8417 } else if (prefix
== "osd crush create-or-move") {
8419 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
8420 if (!osdmap
.exists(osdid
)) {
8422 ss
<< name
<< " does not exist. create it before updating the crush map";
8427 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "weight", weight
)) {
8428 ss
<< "unable to parse weight value '"
8429 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8435 vector
<string
> argvec
;
8436 cmd_getval_throws(g_ceph_context
, cmdmap
, "args", argvec
);
8437 map
<string
,string
> loc
;
8438 CrushWrapper::parse_loc_map(argvec
, &loc
);
8440 dout(0) << "create-or-move crush item name '" << name
<< "' initial_weight " << weight
8441 << " at location " << loc
<< dendl
;
8443 CrushWrapper newcrush
;
8444 _get_pending_crush(newcrush
);
8446 err
= newcrush
.create_or_move_item(g_ceph_context
, osdid
, weight
, name
, loc
);
8448 ss
<< "create-or-move updated item name '" << name
<< "' weight " << weight
8449 << " at location " << loc
<< " to crush map";
8453 pending_inc
.crush
.clear();
8454 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8455 ss
<< "create-or-move updating item name '" << name
<< "' weight " << weight
8456 << " at location " << loc
<< " to crush map";
8458 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8459 get_last_committed() + 1));
8464 } else if (prefix
== "osd crush move") {
8466 // osd crush move <name> <loc1> [<loc2> ...]
8469 vector
<string
> argvec
;
8470 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8471 cmd_getval_throws(g_ceph_context
, cmdmap
, "args", argvec
);
8472 map
<string
,string
> loc
;
8473 CrushWrapper::parse_loc_map(argvec
, &loc
);
8475 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
8476 CrushWrapper newcrush
;
8477 _get_pending_crush(newcrush
);
8479 if (!newcrush
.name_exists(name
)) {
8481 ss
<< "item " << name
<< " does not exist";
8484 int id
= newcrush
.get_item_id(name
);
8486 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
8488 err
= newcrush
.create_or_move_item(g_ceph_context
, id
, 0, name
, loc
);
8490 err
= newcrush
.move_bucket(g_ceph_context
, id
, loc
);
8493 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
8494 pending_inc
.crush
.clear();
8495 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8497 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8498 get_last_committed() + 1));
8502 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
8506 } else if (prefix
== "osd crush swap-bucket") {
8507 string source
, dest
, force
;
8508 cmd_getval_throws(g_ceph_context
, cmdmap
, "source", source
);
8509 cmd_getval_throws(g_ceph_context
, cmdmap
, "dest", dest
);
8510 cmd_getval_throws(g_ceph_context
, cmdmap
, "force", force
);
8511 CrushWrapper newcrush
;
8512 _get_pending_crush(newcrush
);
8513 if (!newcrush
.name_exists(source
)) {
8514 ss
<< "source item " << source
<< " does not exist";
8518 if (!newcrush
.name_exists(dest
)) {
8519 ss
<< "dest item " << dest
<< " does not exist";
8523 int sid
= newcrush
.get_item_id(source
);
8524 int did
= newcrush
.get_item_id(dest
);
8526 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 &&
8527 force
!= "--yes-i-really-mean-it") {
8528 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8532 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
8533 force
!= "--yes-i-really-mean-it") {
8534 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
8535 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
8536 << "; pass --yes-i-really-mean-it to proceed anyway";
8540 int r
= newcrush
.swap_bucket(g_ceph_context
, sid
, did
);
8542 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
8546 ss
<< "swapped bucket of " << source
<< " to " << dest
;
8547 pending_inc
.crush
.clear();
8548 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8549 wait_for_finished_proposal(op
,
8550 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
8551 get_last_committed() + 1));
8553 } else if (prefix
== "osd crush link") {
8554 // osd crush link <name> <loc1> [<loc2> ...]
8556 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8557 vector
<string
> argvec
;
8558 cmd_getval_throws(g_ceph_context
, cmdmap
, "args", argvec
);
8559 map
<string
,string
> loc
;
8560 CrushWrapper::parse_loc_map(argvec
, &loc
);
8562 // Need an explicit check for name_exists because get_item_id returns
8564 int id
= osdmap
.crush
->get_item_id(name
);
8565 if (!osdmap
.crush
->name_exists(name
)) {
8567 ss
<< "item " << name
<< " does not exist";
8570 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
8572 if (osdmap
.crush
->check_item_loc(g_ceph_context
, id
, loc
, (int*) NULL
)) {
8573 ss
<< "no need to move item id " << id
<< " name '" << name
8574 << "' to location " << loc
<< " in crush map";
8579 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
8580 CrushWrapper newcrush
;
8581 _get_pending_crush(newcrush
);
8583 if (!newcrush
.name_exists(name
)) {
8585 ss
<< "item " << name
<< " does not exist";
8588 int id
= newcrush
.get_item_id(name
);
8589 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
8590 err
= newcrush
.link_bucket(g_ceph_context
, id
, loc
);
8592 ss
<< "linked item id " << id
<< " name '" << name
8593 << "' to location " << loc
<< " in crush map";
8594 pending_inc
.crush
.clear();
8595 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8597 ss
<< "cannot link item id " << id
<< " name '" << name
8598 << "' to location " << loc
;
8602 ss
<< "no need to move item id " << id
<< " name '" << name
8603 << "' to location " << loc
<< " in crush map";
8607 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
8608 get_last_committed() + 1));
8610 } else if (prefix
== "osd crush rm" ||
8611 prefix
== "osd crush remove" ||
8612 prefix
== "osd crush unlink") {
8614 // osd crush rm <id> [ancestor]
8615 CrushWrapper newcrush
;
8616 _get_pending_crush(newcrush
);
8619 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8621 if (!osdmap
.crush
->name_exists(name
)) {
8623 ss
<< "device '" << name
<< "' does not appear in the crush map";
8626 if (!newcrush
.name_exists(name
)) {
8628 ss
<< "device '" << name
<< "' does not appear in the crush map";
8630 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8631 get_last_committed() + 1));
8634 int id
= newcrush
.get_item_id(name
);
8637 bool unlink_only
= prefix
== "osd crush unlink";
8638 string ancestor_str
;
8639 if (cmd_getval_throws(g_ceph_context
, cmdmap
, "ancestor", ancestor_str
)) {
8640 if (!newcrush
.name_exists(ancestor_str
)) {
8642 ss
<< "ancestor item '" << ancestor_str
8643 << "' does not appear in the crush map";
8646 ancestor
= newcrush
.get_item_id(ancestor_str
);
8649 err
= prepare_command_osd_crush_remove(
8652 (ancestor
< 0), unlink_only
);
8654 if (err
== -ENOENT
) {
8655 ss
<< "item " << id
<< " does not appear in that position";
8660 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
8662 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8663 get_last_committed() + 1));
8668 } else if (prefix
== "osd crush reweight-all") {
8669 CrushWrapper newcrush
;
8670 _get_pending_crush(newcrush
);
8672 newcrush
.reweight(g_ceph_context
);
8673 pending_inc
.crush
.clear();
8674 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8675 ss
<< "reweighted crush hierarchy";
8677 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8678 get_last_committed() + 1));
8680 } else if (prefix
== "osd crush reweight") {
8681 // osd crush reweight <name> <weight>
8682 CrushWrapper newcrush
;
8683 _get_pending_crush(newcrush
);
8686 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8687 if (!newcrush
.name_exists(name
)) {
8689 ss
<< "device '" << name
<< "' does not appear in the crush map";
8693 int id
= newcrush
.get_item_id(name
);
8695 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
8700 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "weight", w
)) {
8701 ss
<< "unable to parse weight value '"
8702 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8707 err
= newcrush
.adjust_item_weightf(g_ceph_context
, id
, w
);
8710 pending_inc
.crush
.clear();
8711 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8712 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
8715 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8716 get_last_committed() + 1));
8718 } else if (prefix
== "osd crush reweight-subtree") {
8719 // osd crush reweight <name> <weight>
8720 CrushWrapper newcrush
;
8721 _get_pending_crush(newcrush
);
8724 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8725 if (!newcrush
.name_exists(name
)) {
8727 ss
<< "device '" << name
<< "' does not appear in the crush map";
8731 int id
= newcrush
.get_item_id(name
);
8733 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
8738 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "weight", w
)) {
8739 ss
<< "unable to parse weight value '"
8740 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8745 err
= newcrush
.adjust_subtree_weightf(g_ceph_context
, id
, w
);
8748 pending_inc
.crush
.clear();
8749 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8750 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
8753 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8754 get_last_committed() + 1));
8756 } else if (prefix
== "osd crush tunables") {
8757 CrushWrapper newcrush
;
8758 _get_pending_crush(newcrush
);
8762 cmd_getval_throws(g_ceph_context
, cmdmap
, "profile", profile
);
8763 if (profile
== "legacy" || profile
== "argonaut") {
8764 newcrush
.set_tunables_legacy();
8765 } else if (profile
== "bobtail") {
8766 newcrush
.set_tunables_bobtail();
8767 } else if (profile
== "firefly") {
8768 newcrush
.set_tunables_firefly();
8769 } else if (profile
== "hammer") {
8770 newcrush
.set_tunables_hammer();
8771 } else if (profile
== "jewel") {
8772 newcrush
.set_tunables_jewel();
8773 } else if (profile
== "optimal") {
8774 newcrush
.set_tunables_optimal();
8775 } else if (profile
== "default") {
8776 newcrush
.set_tunables_default();
8778 ss
<< "unrecognized profile '" << profile
<< "'";
8783 if (!validate_crush_against_features(&newcrush
, ss
)) {
8788 pending_inc
.crush
.clear();
8789 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8790 ss
<< "adjusted tunables profile to " << profile
;
8792 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8793 get_last_committed() + 1));
8795 } else if (prefix
== "osd crush set-tunable") {
8796 CrushWrapper newcrush
;
8797 _get_pending_crush(newcrush
);
8801 cmd_getval_throws(g_ceph_context
, cmdmap
, "tunable", tunable
);
8804 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "value", value
)) {
8806 ss
<< "failed to parse integer value " << cmd_vartype_stringify(cmdmap
["value"]);
8810 if (tunable
== "straw_calc_version") {
8811 if (value
!= 0 && value
!= 1) {
8812 ss
<< "value must be 0 or 1; got " << value
;
8816 newcrush
.set_straw_calc_version(value
);
8818 ss
<< "unrecognized tunable '" << tunable
<< "'";
8823 if (!validate_crush_against_features(&newcrush
, ss
)) {
8828 pending_inc
.crush
.clear();
8829 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8830 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
8832 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8833 get_last_committed() + 1));
8836 } else if (prefix
== "osd crush rule create-simple") {
8837 string name
, root
, type
, mode
;
8838 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8839 cmd_getval_throws(g_ceph_context
, cmdmap
, "root", root
);
8840 cmd_getval_throws(g_ceph_context
, cmdmap
, "type", type
);
8841 cmd_getval_throws(g_ceph_context
, cmdmap
, "mode", mode
);
8845 if (osdmap
.crush
->rule_exists(name
)) {
8846 // The name is uniquely associated to a ruleid and the rule it contains
8847 // From the user point of view, the rule is more meaningful.
8848 ss
<< "rule " << name
<< " already exists";
8853 CrushWrapper newcrush
;
8854 _get_pending_crush(newcrush
);
8856 if (newcrush
.rule_exists(name
)) {
8857 // The name is uniquely associated to a ruleid and the rule it contains
8858 // From the user point of view, the rule is more meaningful.
8859 ss
<< "rule " << name
<< " already exists";
8862 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
8863 pg_pool_t::TYPE_REPLICATED
, &ss
);
8869 pending_inc
.crush
.clear();
8870 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8873 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8874 get_last_committed() + 1));
8877 } else if (prefix
== "osd crush rule create-replicated") {
8878 string name
, root
, type
, device_class
;
8879 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8880 cmd_getval_throws(g_ceph_context
, cmdmap
, "root", root
);
8881 cmd_getval_throws(g_ceph_context
, cmdmap
, "type", type
);
8882 cmd_getval_throws(g_ceph_context
, cmdmap
, "class", device_class
);
8884 if (!device_class
.empty()) {
8885 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8886 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8887 << "luminous' before using crush device classes";
8893 if (osdmap
.crush
->rule_exists(name
)) {
8894 // The name is uniquely associated to a ruleid and the rule it contains
8895 // From the user point of view, the rule is more meaningful.
8896 ss
<< "rule " << name
<< " already exists";
8901 CrushWrapper newcrush
;
8902 _get_pending_crush(newcrush
);
8904 if (newcrush
.rule_exists(name
)) {
8905 // The name is uniquely associated to a ruleid and the rule it contains
8906 // From the user point of view, the rule is more meaningful.
8907 ss
<< "rule " << name
<< " already exists";
8910 int ruleno
= newcrush
.add_simple_rule(
8911 name
, root
, type
, device_class
,
8912 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
8918 pending_inc
.crush
.clear();
8919 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8922 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8923 get_last_committed() + 1));
8926 } else if (prefix
== "osd erasure-code-profile rm") {
8928 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8930 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
8933 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
8938 if (osdmap
.has_erasure_code_profile(name
) ||
8939 pending_inc
.new_erasure_code_profiles
.count(name
)) {
8940 if (osdmap
.has_erasure_code_profile(name
)) {
8941 pending_inc
.old_erasure_code_profiles
.push_back(name
);
8943 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
8944 pending_inc
.new_erasure_code_profiles
.erase(name
);
8948 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8949 get_last_committed() + 1));
8952 ss
<< "erasure-code-profile " << name
<< " does not exist";
8957 } else if (prefix
== "osd erasure-code-profile set") {
8959 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
8960 vector
<string
> profile
;
8961 cmd_getval_throws(g_ceph_context
, cmdmap
, "profile", profile
);
8963 if (profile
.size() > 0 && profile
.back() == "--force") {
8969 map
<string
,string
> profile_map
;
8970 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
8973 if (profile_map
.find("plugin") == profile_map
.end()) {
8974 ss
<< "erasure-code-profile " << profile_map
8975 << " must contain a plugin entry" << std::endl
;
8979 string plugin
= profile_map
["plugin"];
8981 if (pending_inc
.has_erasure_code_profile(name
)) {
8982 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
8985 if (plugin
== "isa" || plugin
== "lrc") {
8986 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
, ss
);
8991 } else if (plugin
== "shec") {
8992 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
, ss
);
8998 err
= normalize_profile(name
, profile_map
, force
, &ss
);
9002 if (osdmap
.has_erasure_code_profile(name
)) {
9003 ErasureCodeProfile existing_profile_map
=
9004 osdmap
.get_erasure_code_profile(name
);
9005 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
9009 if (existing_profile_map
== profile_map
) {
9015 ss
<< "will not override erasure code profile " << name
9016 << " because the existing profile "
9017 << existing_profile_map
9018 << " is different from the proposed profile "
9024 dout(20) << "erasure code profile set " << name
<< "="
9025 << profile_map
<< dendl
;
9026 pending_inc
.set_erasure_code_profile(name
, profile_map
);
9030 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9031 get_last_committed() + 1));
9034 } else if (prefix
== "osd crush rule create-erasure") {
9035 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
9040 string name
, poolstr
;
9041 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
9043 cmd_getval_throws(g_ceph_context
, cmdmap
, "profile", profile
);
9045 profile
= "default";
9046 if (profile
== "default") {
9047 if (!osdmap
.has_erasure_code_profile(profile
)) {
9048 if (pending_inc
.has_erasure_code_profile(profile
)) {
9049 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
9053 map
<string
,string
> profile_map
;
9054 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
9059 err
= normalize_profile(name
, profile_map
, true, &ss
);
9062 dout(20) << "erasure code profile set " << profile
<< "="
9063 << profile_map
<< dendl
;
9064 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
9070 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
9073 case -EEXIST
: // return immediately
9074 ss
<< "rule " << name
<< " already exists";
9078 case -EALREADY
: // wait for pending to be proposed
9079 ss
<< "rule " << name
<< " already exists";
9082 default: // non recoverable error
9087 ss
<< "created rule " << name
<< " at " << rule
;
9091 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9092 get_last_committed() + 1));
9095 } else if (prefix
== "osd crush rule rm") {
9097 cmd_getval_throws(g_ceph_context
, cmdmap
, "name", name
);
9099 if (!osdmap
.crush
->rule_exists(name
)) {
9100 ss
<< "rule " << name
<< " does not exist";
9105 CrushWrapper newcrush
;
9106 _get_pending_crush(newcrush
);
9108 if (!newcrush
.rule_exists(name
)) {
9109 ss
<< "rule " << name
<< " does not exist";
9112 int ruleno
= newcrush
.get_rule_id(name
);
9113 assert(ruleno
>= 0);
9115 // make sure it is not in use.
9116 // FIXME: this is ok in some situations, but let's not bother with that
9118 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
9119 if (osdmap
.crush_rule_in_use(ruleset
)) {
9120 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
9125 err
= newcrush
.remove_rule(ruleno
);
9130 pending_inc
.crush
.clear();
9131 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9134 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9135 get_last_committed() + 1));
9138 } else if (prefix
== "osd crush rule rename") {
9141 cmd_getval_throws(g_ceph_context
, cmdmap
, "srcname", srcname
);
9142 cmd_getval_throws(g_ceph_context
, cmdmap
, "dstname", dstname
);
9143 if (srcname
.empty() || dstname
.empty()) {
9144 ss
<< "must specify both source rule name and destination rule name";
9148 if (srcname
== dstname
) {
9149 ss
<< "destination rule name is equal to source rule name";
9154 CrushWrapper newcrush
;
9155 _get_pending_crush(newcrush
);
9156 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
9157 // srcname does not exist and dstname already exists
9158 // suppose this is a replay and return success
9159 // (so this command is idempotent)
9160 ss
<< "already renamed to '" << dstname
<< "'";
9165 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
9167 // ss has reason for failure
9170 pending_inc
.crush
.clear();
9171 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9173 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9174 get_last_committed() + 1));
9177 } else if (prefix
== "osd setmaxosd") {
9179 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "newmax", newmax
)) {
9180 ss
<< "unable to parse 'newmax' value '"
9181 << cmd_vartype_stringify(cmdmap
["newmax"]) << "'";
9186 if (newmax
> g_conf
->mon_max_osd
) {
9188 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
9189 << g_conf
->mon_max_osd
<< ")";
9193 // Don't allow shrinking OSD number as this will cause data loss
9194 // and may cause kernel crashes.
9195 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
9196 if (newmax
< osdmap
.get_max_osd()) {
9197 // Check if the OSDs exist between current max and new value.
9198 // If there are any OSDs exist, then don't allow shrinking number
9200 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
9201 if (osdmap
.exists(i
)) {
9203 ss
<< "cannot shrink max_osd to " << newmax
9204 << " because osd." << i
<< " (and possibly others) still in use";
9210 pending_inc
.new_max_osd
= newmax
;
9211 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
9213 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9214 get_last_committed() + 1));
9217 } else if (prefix
== "osd set-full-ratio" ||
9218 prefix
== "osd set-backfillfull-ratio" ||
9219 prefix
== "osd set-nearfull-ratio") {
9220 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
9221 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9222 << "luminous' before using the new interface";
9227 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "ratio", n
)) {
9228 ss
<< "unable to parse 'ratio' value '"
9229 << cmd_vartype_stringify(cmdmap
["ratio"]) << "'";
9233 if (prefix
== "osd set-full-ratio")
9234 pending_inc
.new_full_ratio
= n
;
9235 else if (prefix
== "osd set-backfillfull-ratio")
9236 pending_inc
.new_backfillfull_ratio
= n
;
9237 else if (prefix
== "osd set-nearfull-ratio")
9238 pending_inc
.new_nearfull_ratio
= n
;
9239 ss
<< prefix
<< " " << n
;
9241 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9242 get_last_committed() + 1));
9244 } else if (prefix
== "osd set-require-min-compat-client") {
9245 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
9246 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9247 << "luminous' before using the new interface";
9252 cmd_getval_throws(g_ceph_context
, cmdmap
, "version", v
);
9253 int vno
= ceph_release_from_name(v
.c_str());
9255 ss
<< "version " << v
<< " is not recognized";
9260 newmap
.deepish_copy_from(osdmap
);
9261 newmap
.apply_incremental(pending_inc
);
9262 newmap
.require_min_compat_client
= vno
;
9263 auto mvno
= newmap
.get_min_compat_client();
9265 ss
<< "osdmap current utilizes features that require "
9266 << ceph_release_name(mvno
)
9267 << "; cannot set require_min_compat_client below that to "
9268 << ceph_release_name(vno
);
9273 cmd_getval_throws(g_ceph_context
, cmdmap
, "sure", sure
);
9274 if (sure
!= "--yes-i-really-mean-it") {
9276 mon
->get_combined_feature_map(&m
);
9277 uint64_t features
= ceph_release_features(vno
);
9281 CEPH_ENTITY_TYPE_CLIENT
,
9282 CEPH_ENTITY_TYPE_MDS
,
9283 CEPH_ENTITY_TYPE_MGR
}) {
9284 auto p
= m
.m
.find(type
);
9285 if (p
== m
.m
.end()) {
9288 for (auto& q
: p
->second
) {
9289 uint64_t missing
= ~q
.first
& features
;
9292 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
9297 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
9298 << "(s) look like " << ceph_release_name(
9299 ceph_release_from_features(q
.first
))
9300 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
9306 ss
<< "; add --yes-i-really-mean-it to do it anyway";
9311 ss
<< "set require_min_compat_client to " << ceph_release_name(vno
);
9312 pending_inc
.new_require_min_compat_client
= vno
;
9314 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9315 get_last_committed() + 1));
9317 } else if (prefix
== "osd pause") {
9318 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9320 } else if (prefix
== "osd unpause") {
9321 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9323 } else if (prefix
== "osd set") {
9325 cmd_getval_throws(g_ceph_context
, cmdmap
, "sure", sure
);
9327 cmd_getval_throws(g_ceph_context
, cmdmap
, "key", key
);
9329 return prepare_set_flag(op
, CEPH_OSDMAP_FULL
);
9330 else if (key
== "pause")
9331 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9332 else if (key
== "noup")
9333 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
9334 else if (key
== "nodown")
9335 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
9336 else if (key
== "noout")
9337 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
9338 else if (key
== "noin")
9339 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
9340 else if (key
== "nobackfill")
9341 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
9342 else if (key
== "norebalance")
9343 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
9344 else if (key
== "norecover")
9345 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
9346 else if (key
== "noscrub")
9347 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
9348 else if (key
== "nodeep-scrub")
9349 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
9350 else if (key
== "notieragent")
9351 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
9352 else if (key
== "sortbitwise") {
9353 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9354 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9355 << "--yes-i-really-mean-it if you really wish to continue.";
9359 if ((osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)
9360 || sure
== "--yes-i-really-mean-it") {
9361 return prepare_set_flag(op
, CEPH_OSDMAP_SORTBITWISE
);
9363 ss
<< "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
9367 } else if (key
== "recovery_deletes") {
9368 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9369 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9370 << "--yes-i-really-mean-it if you really wish to continue.";
9374 if (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_RECOVERY_DELETES
)
9375 || sure
== "--yes-i-really-mean-it") {
9376 return prepare_set_flag(op
, CEPH_OSDMAP_RECOVERY_DELETES
);
9378 ss
<< "not all up OSDs have OSD_RECOVERY_DELETES feature";
9382 } else if (key
== "pglog_hardlimit") {
9383 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9384 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9385 << "--yes-i-really-mean-it if you really wish to continue.";
9389 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
9390 // we are reusing a jewel feature bit that was retired in luminous.
9391 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
9392 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
9393 || sure
== "--yes-i-really-mean-it")) {
9394 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
9396 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
9400 } else if (key
== "require_jewel_osds") {
9401 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9402 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9403 << "--yes-i-really-mean-it if you really wish to continue.";
9407 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
9408 ss
<< "the sortbitwise flag must be set before require_jewel_osds";
9411 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9412 ss
<< "require_osd_release is already >= jewel";
9415 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
)
9416 || sure
== "--yes-i-really-mean-it") {
9417 return prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_JEWEL
);
9419 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
9422 } else if (key
== "require_kraken_osds") {
9423 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9424 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9425 << "--yes-i-really-mean-it if you really wish to continue.";
9429 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
9430 ss
<< "the sortbitwise flag must be set before require_kraken_osds";
9433 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
9434 ss
<< "require_osd_release is already >= kraken";
9437 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
)
9438 || sure
== "--yes-i-really-mean-it") {
9439 bool r
= prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_KRAKEN
);
9440 // ensure JEWEL is also set
9441 pending_inc
.new_flags
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
9444 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
9448 ss
<< "unrecognized flag '" << key
<< "'";
9452 } else if (prefix
== "osd unset") {
9454 cmd_getval_throws(g_ceph_context
, cmdmap
, "key", key
);
9456 return prepare_unset_flag(op
, CEPH_OSDMAP_FULL
);
9457 else if (key
== "pause")
9458 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9459 else if (key
== "noup")
9460 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
9461 else if (key
== "nodown")
9462 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
9463 else if (key
== "noout")
9464 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
9465 else if (key
== "noin")
9466 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
9467 else if (key
== "nobackfill")
9468 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
9469 else if (key
== "norebalance")
9470 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
9471 else if (key
== "norecover")
9472 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
9473 else if (key
== "noscrub")
9474 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
9475 else if (key
== "nodeep-scrub")
9476 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
9477 else if (key
== "notieragent")
9478 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
9480 ss
<< "unrecognized flag '" << key
<< "'";
9484 } else if (prefix
== "osd require-osd-release") {
9486 cmd_getval_throws(g_ceph_context
, cmdmap
, "release", release
);
9488 cmd_getval_throws(g_ceph_context
, cmdmap
, "sure", sure
);
9489 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
9490 ss
<< "the sortbitwise flag must be set first";
9494 int rel
= ceph_release_from_name(release
.c_str());
9496 ss
<< "unrecognized release " << release
;
9500 if (rel
< CEPH_RELEASE_LUMINOUS
) {
9501 ss
<< "use this command only for luminous and later";
9505 if (rel
== osdmap
.require_osd_release
) {
9510 if (rel
== CEPH_RELEASE_LUMINOUS
) {
9511 if (!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
)) {
9512 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
9517 ss
<< "not supported for this release yet";
9521 if (rel
< osdmap
.require_osd_release
) {
9522 ss
<< "require_osd_release cannot be lowered once it has been set";
9526 pending_inc
.new_require_osd_release
= rel
;
9527 if (rel
>= CEPH_RELEASE_LUMINOUS
&&
9528 !osdmap
.test_flag(CEPH_OSDMAP_RECOVERY_DELETES
)) {
9529 return prepare_set_flag(op
, CEPH_OSDMAP_RECOVERY_DELETES
);
9532 } else if (prefix
== "osd cluster_snap") {
9533 // ** DISABLE THIS FOR NOW **
9534 ss
<< "cluster snapshot currently disabled (broken implementation)";
9535 // ** DISABLE THIS FOR NOW **
9537 } else if (prefix
== "osd down" ||
9538 prefix
== "osd out" ||
9539 prefix
== "osd in" ||
9540 prefix
== "osd rm") {
9544 bool verbose
= true;
9546 vector
<string
> idvec
;
9547 cmd_getval_throws(g_ceph_context
, cmdmap
, "ids", idvec
);
9548 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9553 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9554 if (prefix
== "osd in") {
9555 // touch out osds only
9556 osdmap
.get_out_osds(osds
);
9558 osdmap
.get_all_osds(osds
);
9561 verbose
= false; // so the output is less noisy.
9563 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9565 ss
<< "invalid osd id" << osd
;
9568 } else if (!osdmap
.exists(osd
)) {
9569 ss
<< "osd." << osd
<< " does not exist. ";
9576 for (auto &osd
: osds
) {
9577 if (prefix
== "osd down") {
9578 if (osdmap
.is_down(osd
)) {
9580 ss
<< "osd." << osd
<< " is already down. ";
9582 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
9583 ss
<< "marked down osd." << osd
<< ". ";
9586 } else if (prefix
== "osd out") {
9587 if (osdmap
.is_out(osd
)) {
9589 ss
<< "osd." << osd
<< " is already out. ";
9591 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
9592 if (osdmap
.osd_weight
[osd
]) {
9593 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
9594 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
9596 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
9598 ss
<< "marked out osd." << osd
<< ". ";
9599 std::ostringstream msg
;
9600 msg
<< "Client " << op
->get_session()->entity_name
9601 << " marked osd." << osd
<< " out";
9602 if (osdmap
.is_up(osd
)) {
9603 msg
<< ", while it was still marked up";
9605 auto period
= ceph_clock_now() - down_pending_out
[osd
];
9606 msg
<< ", after it was down for " << int(period
.sec())
9610 mon
->clog
->info() << msg
.str();
9613 } else if (prefix
== "osd in") {
9614 if (osdmap
.is_in(osd
)) {
9616 ss
<< "osd." << osd
<< " is already in. ";
9618 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
9619 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
9620 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
9621 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
9623 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
9625 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
9627 ss
<< "marked in osd." << osd
<< ". ";
9630 } else if (prefix
== "osd rm") {
9631 err
= prepare_command_osd_remove(osd
);
9633 if (err
== -EBUSY
) {
9636 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
9640 ss
<< ", osd." << osd
;
9642 ss
<< "removed osd." << osd
;
9651 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9652 get_last_committed() + 1));
9655 } else if (prefix
== "osd add-noup" ||
9656 prefix
== "osd add-nodown" ||
9657 prefix
== "osd add-noin" ||
9658 prefix
== "osd add-noout") {
9667 if (prefix
== "osd add-noup") {
9669 } else if (prefix
== "osd add-nodown") {
9671 } else if (prefix
== "osd add-noin") {
9680 vector
<string
> idvec
;
9681 cmd_getval_throws(g_ceph_context
, cmdmap
, "ids", idvec
);
9682 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9688 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9689 osdmap
.get_all_osds(osds
);
9692 // try traditional single osd way
9694 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9696 // ss has reason for failure
9697 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9705 for (auto &osd
: osds
) {
9707 if (!osdmap
.exists(osd
)) {
9708 ss
<< "osd." << osd
<< " does not exist. ";
9714 if (osdmap
.is_up(osd
)) {
9715 ss
<< "osd." << osd
<< " is already up. ";
9719 if (osdmap
.is_noup(osd
)) {
9720 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
))
9723 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
9730 if (osdmap
.is_down(osd
)) {
9731 ss
<< "osd." << osd
<< " is already down. ";
9735 if (osdmap
.is_nodown(osd
)) {
9736 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
))
9739 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
9746 if (osdmap
.is_in(osd
)) {
9747 ss
<< "osd." << osd
<< " is already in. ";
9751 if (osdmap
.is_noin(osd
)) {
9752 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
))
9755 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
9762 if (osdmap
.is_out(osd
)) {
9763 ss
<< "osd." << osd
<< " is already out. ";
9767 if (osdmap
.is_noout(osd
)) {
9768 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
))
9771 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
9778 assert(0 == "invalid option");
9785 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9786 get_last_committed() + 1));
9789 } else if (prefix
== "osd rm-noup" ||
9790 prefix
== "osd rm-nodown" ||
9791 prefix
== "osd rm-noin" ||
9792 prefix
== "osd rm-noout") {
9801 if (prefix
== "osd rm-noup") {
9803 } else if (prefix
== "osd rm-nodown") {
9805 } else if (prefix
== "osd rm-noin") {
9814 vector
<string
> idvec
;
9815 cmd_getval_throws(g_ceph_context
, cmdmap
, "ids", idvec
);
9817 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9823 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9825 // touch previous noup/nodown/noin/noout osds only
9828 osdmap
.get_noup_osds(&osds
);
9831 osdmap
.get_nodown_osds(&osds
);
9834 osdmap
.get_noin_osds(&osds
);
9837 osdmap
.get_noout_osds(&osds
);
9840 assert(0 == "invalid option");
9843 // cancel any pending noup/nodown/noin/noout requests too
9844 vector
<int> pending_state_osds
;
9845 (void) pending_inc
.get_pending_state_osds(&pending_state_osds
);
9846 for (auto &p
: pending_state_osds
) {
9850 if (!osdmap
.is_noup(p
) &&
9851 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOUP
)) {
9857 if (!osdmap
.is_nodown(p
) &&
9858 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NODOWN
)) {
9864 if (!osdmap
.is_noin(p
) &&
9865 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOIN
)) {
9871 if (!osdmap
.is_noout(p
) &&
9872 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOOUT
)) {
9878 assert(0 == "invalid option");
9884 // try traditional single osd way
9886 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9888 // ss has reason for failure
9889 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9894 osds
.push_back(osd
);
9897 for (auto &osd
: osds
) {
9899 if (!osdmap
.exists(osd
)) {
9900 ss
<< "osd." << osd
<< " does not exist. ";
9906 if (osdmap
.is_noup(osd
)) {
9907 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
9909 } else if (pending_inc
.pending_osd_state_clear(
9910 osd
, CEPH_OSD_NOUP
)) {
9916 if (osdmap
.is_nodown(osd
)) {
9917 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
9919 } else if (pending_inc
.pending_osd_state_clear(
9920 osd
, CEPH_OSD_NODOWN
)) {
9926 if (osdmap
.is_noin(osd
)) {
9927 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
9929 } else if (pending_inc
.pending_osd_state_clear(
9930 osd
, CEPH_OSD_NOIN
)) {
9936 if (osdmap
.is_noout(osd
)) {
9937 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
9939 } else if (pending_inc
.pending_osd_state_clear(
9940 osd
, CEPH_OSD_NOOUT
)) {
9946 assert(0 == "invalid option");
9953 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9954 get_last_committed() + 1));
9957 } else if (prefix
== "osd pg-temp") {
9959 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
9960 ss
<< "unable to parse 'pgid' value '"
9961 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
9966 if (!pgid
.parse(pgidstr
.c_str())) {
9967 ss
<< "invalid pgid '" << pgidstr
<< "'";
9971 if (!osdmap
.pg_exists(pgid
)) {
9972 ss
<< "pg " << pgid
<< " does not exist";
9976 if (pending_inc
.new_pg_temp
.count(pgid
)) {
9977 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
9978 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9982 vector
<int64_t> id_vec
;
9983 vector
<int32_t> new_pg_temp
;
9984 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9985 ss
<< "unable to parse 'id' value(s) '"
9986 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9990 for (auto osd
: id_vec
) {
9991 if (!osdmap
.exists(osd
)) {
9992 ss
<< "osd." << osd
<< " does not exist";
9996 new_pg_temp
.push_back(osd
);
9999 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
10000 if ((int)new_pg_temp
.size() < pool_min_size
) {
10001 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
10002 << pool_min_size
<< ")";
10007 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10008 if ((int)new_pg_temp
.size() > pool_size
) {
10009 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
10010 << pool_size
<< ")";
10015 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
10016 new_pg_temp
.begin(), new_pg_temp
.end());
10017 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
10019 } else if (prefix
== "osd primary-temp") {
10021 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
10022 ss
<< "unable to parse 'pgid' value '"
10023 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
10028 if (!pgid
.parse(pgidstr
.c_str())) {
10029 ss
<< "invalid pgid '" << pgidstr
<< "'";
10033 if (!osdmap
.pg_exists(pgid
)) {
10034 ss
<< "pg " << pgid
<< " does not exist";
10040 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", osd
)) {
10041 ss
<< "unable to parse 'id' value '"
10042 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10046 if (osd
!= -1 && !osdmap
.exists(osd
)) {
10047 ss
<< "osd." << osd
<< " does not exist";
10052 if (osdmap
.require_min_compat_client
> 0 &&
10053 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
10054 ss
<< "require_min_compat_client "
10055 << ceph_release_name(osdmap
.require_min_compat_client
)
10056 << " < firefly, which is required for primary-temp";
10059 } else if (!g_conf
->mon_osd_allow_primary_temp
) {
10060 ss
<< "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
10065 pending_inc
.new_primary_temp
[pgid
] = osd
;
10066 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
10068 } else if (prefix
== "osd pg-upmap" ||
10069 prefix
== "osd rm-pg-upmap" ||
10070 prefix
== "osd pg-upmap-items" ||
10071 prefix
== "osd rm-pg-upmap-items") {
10072 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
10073 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10074 << "luminous' before using the new interface";
10078 if (osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
10079 ss
<< "min_compat_client "
10080 << ceph_release_name(osdmap
.require_min_compat_client
)
10081 << " < luminous, which is required for pg-upmap. "
10082 << "Try 'ceph osd set-require-min-compat-client luminous' "
10083 << "before using the new interface";
10087 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
10088 if (err
== -EAGAIN
)
10093 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
10094 ss
<< "unable to parse 'pgid' value '"
10095 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
10100 if (!pgid
.parse(pgidstr
.c_str())) {
10101 ss
<< "invalid pgid '" << pgidstr
<< "'";
10105 if (!osdmap
.pg_exists(pgid
)) {
10106 ss
<< "pg " << pgid
<< " does not exist";
10110 if (pending_inc
.old_pools
.count(pgid
.pool())) {
10111 ss
<< "pool of " << pgid
<< " is pending removal";
10114 wait_for_finished_proposal(op
,
10115 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
10123 OP_RM_PG_UPMAP_ITEMS
,
10126 if (prefix
== "osd pg-upmap") {
10127 option
= OP_PG_UPMAP
;
10128 } else if (prefix
== "osd rm-pg-upmap") {
10129 option
= OP_RM_PG_UPMAP
;
10130 } else if (prefix
== "osd pg-upmap-items") {
10131 option
= OP_PG_UPMAP_ITEMS
;
10133 option
= OP_RM_PG_UPMAP_ITEMS
;
10136 // check pending upmap changes
10138 case OP_PG_UPMAP
: // fall through
10139 case OP_RM_PG_UPMAP
:
10140 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
10141 pending_inc
.old_pg_upmap
.count(pgid
)) {
10142 dout(10) << __func__
<< " waiting for pending update on "
10144 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10149 case OP_PG_UPMAP_ITEMS
: // fall through
10150 case OP_RM_PG_UPMAP_ITEMS
:
10151 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
10152 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
10153 dout(10) << __func__
<< " waiting for pending update on "
10155 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10161 assert(0 == "invalid option");
10167 vector
<int64_t> id_vec
;
10168 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id_vec
)) {
10169 ss
<< "unable to parse 'id' value(s) '"
10170 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10175 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
10176 if ((int)id_vec
.size() < pool_min_size
) {
10177 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
10178 << pool_min_size
<< ")";
10183 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10184 if ((int)id_vec
.size() > pool_size
) {
10185 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
10186 << pool_size
<< ")";
10191 vector
<int32_t> new_pg_upmap
;
10192 for (auto osd
: id_vec
) {
10193 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
10194 ss
<< "osd." << osd
<< " does not exist";
10198 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
10199 if (it
!= new_pg_upmap
.end()) {
10200 ss
<< "osd." << osd
<< " already exists, ";
10203 new_pg_upmap
.push_back(osd
);
10206 if (new_pg_upmap
.empty()) {
10207 ss
<< "no valid upmap items(pairs) is specified";
10212 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
10213 new_pg_upmap
.begin(), new_pg_upmap
.end());
10214 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
10218 case OP_RM_PG_UPMAP
:
10220 pending_inc
.old_pg_upmap
.insert(pgid
);
10221 ss
<< "clear " << pgid
<< " pg_upmap mapping";
10225 case OP_PG_UPMAP_ITEMS
:
10227 vector
<int64_t> id_vec
;
10228 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id_vec
)) {
10229 ss
<< "unable to parse 'id' value(s) '"
10230 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10235 if (id_vec
.size() % 2) {
10236 ss
<< "you must specify pairs of osd ids to be remapped";
10241 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10242 if ((int)(id_vec
.size() / 2) > pool_size
) {
10243 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
10244 << pool_size
<< ")";
10249 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
10250 ostringstream items
;
10252 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
10256 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
10259 if (!osdmap
.exists(from
)) {
10260 ss
<< "osd." << from
<< " does not exist";
10264 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
10265 ss
<< "osd." << to
<< " does not exist";
10269 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
10270 auto it
= std::find(new_pg_upmap_items
.begin(),
10271 new_pg_upmap_items
.end(), entry
);
10272 if (it
!= new_pg_upmap_items
.end()) {
10273 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
10276 new_pg_upmap_items
.push_back(entry
);
10277 items
<< from
<< "->" << to
<< ",";
10279 string
out(items
.str());
10280 out
.resize(out
.size() - 1); // drop last ','
10283 if (new_pg_upmap_items
.empty()) {
10284 ss
<< "no valid upmap items(pairs) is specified";
10289 pending_inc
.new_pg_upmap_items
[pgid
] =
10290 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
10291 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
10292 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
10296 case OP_RM_PG_UPMAP_ITEMS
:
10298 pending_inc
.old_pg_upmap_items
.insert(pgid
);
10299 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
10304 assert(0 == "invalid option");
10308 } else if (prefix
== "osd primary-affinity") {
10310 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id
)) {
10311 ss
<< "invalid osd id value '"
10312 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10317 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "weight", w
)) {
10318 ss
<< "unable to parse 'weight' value '"
10319 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
10323 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
10325 ss
<< "weight must be >= 0";
10329 if (osdmap
.require_min_compat_client
> 0 &&
10330 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
10331 ss
<< "require_min_compat_client "
10332 << ceph_release_name(osdmap
.require_min_compat_client
)
10333 << " < firefly, which is required for primary-affinity";
10336 } else if (!g_conf
->mon_osd_allow_primary_affinity
) {
10337 ss
<< "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
10341 err
= check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY
, ss
);
10342 if (err
== -EAGAIN
)
10346 if (osdmap
.exists(id
)) {
10347 pending_inc
.new_primary_affinity
[id
] = ww
;
10348 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
10350 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10351 get_last_committed() + 1));
10354 ss
<< "osd." << id
<< " does not exist";
10358 } else if (prefix
== "osd reweight") {
10360 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id
)) {
10361 ss
<< "unable to parse osd id value '"
10362 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10367 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "weight", w
)) {
10368 ss
<< "unable to parse weight value '"
10369 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
10373 long ww
= (int)((double)CEPH_OSD_IN
*w
);
10375 ss
<< "weight must be >= 0";
10379 if (osdmap
.exists(id
)) {
10380 pending_inc
.new_weight
[id
] = ww
;
10381 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
10383 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10384 get_last_committed() + 1));
10387 ss
<< "osd." << id
<< " does not exist";
10391 } else if (prefix
== "osd reweightn") {
10392 map
<int32_t, uint32_t> weights
;
10393 err
= parse_reweights(g_ceph_context
, cmdmap
, osdmap
, &weights
);
10395 ss
<< "unable to parse 'weights' value '"
10396 << cmd_vartype_stringify(cmdmap
["weights"]) << "'";
10399 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
10400 wait_for_finished_proposal(
10402 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
10404 } else if (prefix
== "osd lost") {
10406 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id
)) {
10407 ss
<< "unable to parse osd id value '"
10408 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10413 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "sure", sure
) || sure
!= "--yes-i-really-mean-it") {
10414 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
10415 "--yes-i-really-mean-it if you really do.";
10418 } else if (!osdmap
.exists(id
)) {
10419 ss
<< "osd." << id
<< " does not exist";
10422 } else if (!osdmap
.is_down(id
)) {
10423 ss
<< "osd." << id
<< " is not down";
10427 epoch_t e
= osdmap
.get_info(id
).down_at
;
10428 pending_inc
.new_lost
[id
] = e
;
10429 ss
<< "marked osd lost in epoch " << e
;
10431 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10432 get_last_committed() + 1));
10436 } else if (prefix
== "osd destroy" || prefix
== "osd purge") {
10437 /* Destroying an OSD means that we don't expect to further make use of
10438 * the OSDs data (which may even become unreadable after this operation),
10439 * and that we are okay with scrubbing all its cephx keys and config-key
10440 * data (which may include lockbox keys, thus rendering the osd's data
10443 * The OSD will not be removed. Instead, we will mark it as destroyed,
10444 * such that a subsequent call to `create` will not reuse the osd id.
10445 * This will play into being able to recreate the OSD, at the same
10446 * crush location, with minimal data movement.
10449 // make sure authmon is writeable.
10450 if (!mon
->authmon()->is_writeable()) {
10451 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
10452 << "osd destroy" << dendl
;
10453 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
10458 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "id", id
)) {
10459 ss
<< "unable to parse osd id value '"
10460 << cmd_vartype_stringify(cmdmap
["id"]) << "";
10465 bool is_destroy
= (prefix
== "osd destroy");
10467 assert("osd purge" == prefix
);
10471 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "sure", sure
) ||
10472 sure
!= "--yes-i-really-mean-it") {
10473 ss
<< "Are you SURE? This will mean real, permanent data loss, as well "
10474 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
10478 } else if (!osdmap
.exists(id
)) {
10479 ss
<< "osd." << id
<< " does not exist";
10480 err
= 0; // idempotent
10482 } else if (osdmap
.is_up(id
)) {
10483 ss
<< "osd." << id
<< " is not `down`.";
10486 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
10487 ss
<< "destroyed osd." << id
;
10492 bool goto_reply
= false;
10496 err
= prepare_command_osd_destroy(id
, ss
);
10497 // we checked above that it should exist.
10498 assert(err
!= -ENOENT
);
10500 err
= prepare_command_osd_purge(id
, ss
);
10501 if (err
== -ENOENT
) {
10503 ss
<< "osd." << id
<< " does not exist.";
10509 if (err
< 0 || goto_reply
) {
10514 ss
<< "destroyed osd." << id
;
10516 ss
<< "purged osd." << id
;
10520 wait_for_finished_proposal(op
,
10521 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
10522 force_immediate_propose();
10525 } else if (prefix
== "osd new") {
10527 // make sure authmon is writeable.
10528 if (!mon
->authmon()->is_writeable()) {
10529 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
10530 << "osd new" << dendl
;
10531 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
10535 map
<string
,string
> param_map
;
10537 bufferlist bl
= m
->get_data();
10538 string param_json
= bl
.to_str();
10539 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
10541 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
10545 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
10548 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
10561 if (err
== EEXIST
) {
10562 // idempotent operation
10567 wait_for_finished_proposal(op
,
10568 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
10569 get_last_committed() + 1));
10570 force_immediate_propose();
10573 } else if (prefix
== "osd create") {
10575 // optional id provided?
10576 int64_t id
= -1, cmd_id
= -1;
10577 if (cmd_getval_throws(g_ceph_context
, cmdmap
, "id", cmd_id
)) {
10579 ss
<< "invalid osd id value '" << cmd_id
<< "'";
10583 dout(10) << " osd create got id " << cmd_id
<< dendl
;
10588 if (cmd_getval_throws(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
10589 if (!uuid
.parse(uuidstr
.c_str())) {
10590 ss
<< "invalid uuid value '" << uuidstr
<< "'";
10594 // we only care about the id if we also have the uuid, to
10595 // ensure the operation's idempotency.
10599 int32_t new_id
= -1;
10600 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
10602 if (err
== -EAGAIN
) {
10603 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10606 // a check has failed; reply to the user.
10609 } else if (err
== EEXIST
) {
10610 // this is an idempotent operation; we can go ahead and reply.
10612 f
->open_object_section("created_osd");
10613 f
->dump_int("osdid", new_id
);
10614 f
->close_section();
10624 string empty_device_class
;
10625 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
10628 f
->open_object_section("created_osd");
10629 f
->dump_int("osdid", new_id
);
10630 f
->close_section();
10636 wait_for_finished_proposal(op
,
10637 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
10638 get_last_committed() + 1));
10641 } else if (prefix
== "osd blacklist clear") {
10642 pending_inc
.new_blacklist
.clear();
10643 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
10644 osdmap
.get_blacklist(&blacklist
);
10645 for (const auto &entry
: blacklist
) {
10646 pending_inc
.old_blacklist
.push_back(entry
.first
);
10648 ss
<< " removed all blacklist entries";
10650 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10651 get_last_committed() + 1));
10653 } else if (prefix
== "osd blacklist") {
10655 cmd_getval_throws(g_ceph_context
, cmdmap
, "addr", addrstr
);
10656 entity_addr_t addr
;
10657 if (!addr
.parse(addrstr
.c_str(), 0)) {
10658 ss
<< "unable to parse address " << addrstr
;
10663 string blacklistop
;
10664 cmd_getval_throws(g_ceph_context
, cmdmap
, "blacklistop", blacklistop
);
10665 if (blacklistop
== "add") {
10666 utime_t expires
= ceph_clock_now();
10668 // default one hour
10669 cmd_getval_throws(g_ceph_context
, cmdmap
, "expire", d
,
10670 g_conf
->mon_osd_blacklist_default_expire
);
10673 pending_inc
.new_blacklist
[addr
] = expires
;
10676 // cancel any pending un-blacklisting request too
10677 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
10678 pending_inc
.old_blacklist
.end(), addr
);
10679 if (it
!= pending_inc
.old_blacklist
.end()) {
10680 pending_inc
.old_blacklist
.erase(it
);
10684 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
10686 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10687 get_last_committed() + 1));
10689 } else if (blacklistop
== "rm") {
10690 if (osdmap
.is_blacklisted(addr
) ||
10691 pending_inc
.new_blacklist
.count(addr
)) {
10692 if (osdmap
.is_blacklisted(addr
))
10693 pending_inc
.old_blacklist
.push_back(addr
);
10695 pending_inc
.new_blacklist
.erase(addr
);
10696 ss
<< "un-blacklisting " << addr
;
10698 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10699 get_last_committed() + 1));
10702 ss
<< addr
<< " isn't blacklisted";
10707 } else if (prefix
== "osd pool mksnap") {
10709 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
10710 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10712 ss
<< "unrecognized pool '" << poolstr
<< "'";
10717 cmd_getval_throws(g_ceph_context
, cmdmap
, "snap", snapname
);
10718 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
10719 if (p
->is_unmanaged_snaps_mode()) {
10720 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
10723 } else if (p
->snap_exists(snapname
.c_str())) {
10724 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
10727 } else if (p
->is_tier()) {
10728 ss
<< "pool " << poolstr
<< " is a cache tier";
10733 if (pending_inc
.new_pools
.count(pool
))
10734 pp
= &pending_inc
.new_pools
[pool
];
10736 pp
= &pending_inc
.new_pools
[pool
];
10739 if (pp
->snap_exists(snapname
.c_str())) {
10740 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
10742 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
10743 pp
->set_snap_epoch(pending_inc
.epoch
);
10744 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
10747 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10748 get_last_committed() + 1));
10750 } else if (prefix
== "osd pool rmsnap") {
10752 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
10753 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10755 ss
<< "unrecognized pool '" << poolstr
<< "'";
10760 cmd_getval_throws(g_ceph_context
, cmdmap
, "snap", snapname
);
10761 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
10762 if (p
->is_unmanaged_snaps_mode()) {
10763 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
10766 } else if (!p
->snap_exists(snapname
.c_str())) {
10767 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
10772 if (pending_inc
.new_pools
.count(pool
))
10773 pp
= &pending_inc
.new_pools
[pool
];
10775 pp
= &pending_inc
.new_pools
[pool
];
10778 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
10780 pp
->remove_snap(sn
);
10781 pp
->set_snap_epoch(pending_inc
.epoch
);
10782 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
10784 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
10787 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10788 get_last_committed() + 1));
10790 } else if (prefix
== "osd pool create") {
10793 cmd_getval_throws(g_ceph_context
, cmdmap
, "pg_num", pg_num
, int64_t(0));
10794 cmd_getval_throws(g_ceph_context
, cmdmap
, "pgp_num", pgp_num
, pg_num
);
10796 string pool_type_str
;
10797 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool_type", pool_type_str
);
10798 if (pool_type_str
.empty())
10799 pool_type_str
= g_conf
->osd_pool_default_type
;
10802 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
10803 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10804 if (pool_id
>= 0) {
10805 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10806 if (pool_type_str
!= p
->get_type_name()) {
10807 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
10810 ss
<< "pool '" << poolstr
<< "' already exists";
10817 if (pool_type_str
== "replicated") {
10818 pool_type
= pg_pool_t::TYPE_REPLICATED
;
10819 } else if (pool_type_str
== "erasure") {
10820 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
|
10821 CEPH_FEATURE_OSD_ERASURE_CODES
,
10823 if (err
== -EAGAIN
)
10827 pool_type
= pg_pool_t::TYPE_ERASURE
;
10829 ss
<< "unknown pool type '" << pool_type_str
<< "'";
10834 bool implicit_rule_creation
= false;
10835 int64_t expected_num_objects
= 0;
10837 cmd_getval_throws(g_ceph_context
, cmdmap
, "rule", rule_name
);
10838 string erasure_code_profile
;
10839 cmd_getval_throws(g_ceph_context
, cmdmap
, "erasure_code_profile", erasure_code_profile
);
10841 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
10842 if (erasure_code_profile
== "")
10843 erasure_code_profile
= "default";
10844 //handle the erasure code profile
10845 if (erasure_code_profile
== "default") {
10846 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
10847 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
10848 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
10852 map
<string
,string
> profile_map
;
10853 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
10858 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
10859 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
10863 if (rule_name
== "") {
10864 implicit_rule_creation
= true;
10865 if (erasure_code_profile
== "default") {
10866 rule_name
= "erasure-code";
10868 dout(1) << "implicitly use rule named after the pool: "
10869 << poolstr
<< dendl
;
10870 rule_name
= poolstr
;
10873 cmd_getval_throws(g_ceph_context
, cmdmap
, "expected_num_objects",
10874 expected_num_objects
, int64_t(0));
10876 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
10877 // and put expected_num_objects to rule field
10878 if (erasure_code_profile
!= "") { // cmd is from CLI
10879 if (rule_name
!= "") {
10881 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
10882 if (interr
.length()) {
10883 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
10888 rule_name
= erasure_code_profile
;
10889 } else { // cmd is well-formed
10890 cmd_getval_throws(g_ceph_context
, cmdmap
, "expected_num_objects",
10891 expected_num_objects
, int64_t(0));
10895 if (!implicit_rule_creation
&& rule_name
!= "") {
10897 err
= get_crush_rule(rule_name
, &rule
, &ss
);
10898 if (err
== -EAGAIN
) {
10899 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10906 if (expected_num_objects
< 0) {
10907 ss
<< "'expected_num_objects' must be non-negative";
10912 if (expected_num_objects
> 0 &&
10913 cct
->_conf
->osd_objectstore
== "filestore" &&
10914 cct
->_conf
->filestore_merge_threshold
> 0) {
10915 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
10920 if (expected_num_objects
== 0 &&
10921 cct
->_conf
->osd_objectstore
== "filestore" &&
10922 cct
->_conf
->filestore_merge_threshold
< 0) {
10923 int osds
= osdmap
.get_num_osds();
10924 if (osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
10925 ss
<< "For better initial performance on pools expected to store a "
10926 << "large number of objects, consider supplying the "
10927 << "expected_num_objects parameter when creating the pool.\n";
10931 int64_t fast_read_param
;
10932 cmd_getval_throws(g_ceph_context
, cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
10933 FastReadType fast_read
= FAST_READ_DEFAULT
;
10934 if (fast_read_param
== 0)
10935 fast_read
= FAST_READ_OFF
;
10936 else if (fast_read_param
> 0)
10937 fast_read
= FAST_READ_ON
;
10939 err
= prepare_new_pool(poolstr
, 0, // auid=0 for admin created pool
10940 -1, // default crush rule
10943 erasure_code_profile
, pool_type
,
10944 (uint64_t)expected_num_objects
,
10950 ss
<< "pool '" << poolstr
<< "' already exists";
10953 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10962 ss
<< "pool '" << poolstr
<< "' created";
10965 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10966 get_last_committed() + 1));
10969 } else if (prefix
== "osd pool delete" ||
10970 prefix
== "osd pool rm") {
10971 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10972 string poolstr
, poolstr2
, sure
;
10973 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
10974 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool2", poolstr2
);
10975 cmd_getval_throws(g_ceph_context
, cmdmap
, "sure", sure
);
10976 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10978 ss
<< "pool '" << poolstr
<< "' does not exist";
10983 bool force_no_fake
= sure
== "--yes-i-really-really-mean-it-not-faking";
10984 if (poolstr2
!= poolstr
||
10985 (sure
!= "--yes-i-really-really-mean-it" && !force_no_fake
)) {
10986 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10987 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10988 << "followed by --yes-i-really-really-mean-it.";
10992 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
10993 if (err
== -EAGAIN
) {
10994 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11000 } else if (prefix
== "osd pool rename") {
11001 string srcpoolstr
, destpoolstr
;
11002 cmd_getval_throws(g_ceph_context
, cmdmap
, "srcpool", srcpoolstr
);
11003 cmd_getval_throws(g_ceph_context
, cmdmap
, "destpool", destpoolstr
);
11004 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
11005 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
11007 if (pool_src
< 0) {
11008 if (pool_dst
>= 0) {
11009 // src pool doesn't exist, dst pool does exist: to ensure idempotency
11010 // of operations, assume this rename succeeded, as it is not changing
11011 // the current state. Make sure we output something understandable
11012 // for whoever is issuing the command, if they are paying attention,
11013 // in case it was not intentional; or to avoid a "wtf?" and a bug
11014 // report in case it was intentional, while expecting a failure.
11015 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
11016 << destpoolstr
<< "' does -- assuming successful rename";
11019 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
11023 } else if (pool_dst
>= 0) {
11024 // source pool exists and so does the destination pool
11025 ss
<< "pool '" << destpoolstr
<< "' already exists";
11030 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
11032 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
11034 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
11035 << cpp_strerror(ret
);
11038 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
11039 get_last_committed() + 1));
11042 } else if (prefix
== "osd pool set") {
11043 err
= prepare_command_pool_set(cmdmap
, ss
);
11044 if (err
== -EAGAIN
)
11050 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11051 get_last_committed() + 1));
11053 } else if (prefix
== "osd tier add") {
11054 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11055 if (err
== -EAGAIN
)
11060 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
11061 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11063 ss
<< "unrecognized pool '" << poolstr
<< "'";
11067 string tierpoolstr
;
11068 cmd_getval_throws(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
11069 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11070 if (tierpool_id
< 0) {
11071 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11075 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11077 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11080 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
11084 // make sure new tier is empty
11085 string force_nonempty
;
11086 cmd_getval_throws(g_ceph_context
, cmdmap
, "force_nonempty", force_nonempty
);
11087 const pool_stat_t
*pstats
= mon
->pgservice
->get_pool_stat(tierpool_id
);
11088 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
11089 force_nonempty
!= "--force-nonempty") {
11090 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
11094 if (tp
->ec_pool()) {
11095 ss
<< "tier pool '" << tierpoolstr
11096 << "' is an ec pool, which cannot be a tier";
11100 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
11101 ((force_nonempty
!= "--force-nonempty") ||
11102 (!g_conf
->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
11103 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
11108 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11109 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11110 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
11111 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11114 np
->tiers
.insert(tierpool_id
);
11115 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
11116 ntp
->tier_of
= pool_id
;
11117 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
11118 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11119 get_last_committed() + 1));
11121 } else if (prefix
== "osd tier remove" ||
11122 prefix
== "osd tier rm") {
11124 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
11125 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11127 ss
<< "unrecognized pool '" << poolstr
<< "'";
11131 string tierpoolstr
;
11132 cmd_getval_throws(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
11133 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11134 if (tierpool_id
< 0) {
11135 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11139 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11141 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11144 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
11148 if (p
->tiers
.count(tierpool_id
) == 0) {
11149 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
11153 if (tp
->tier_of
!= pool_id
) {
11154 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
11155 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
11156 // be scary about it; this is an inconsistency and bells must go off
11157 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11161 if (p
->read_tier
== tierpool_id
) {
11162 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
11167 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11168 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11169 if (np
->tiers
.count(tierpool_id
) == 0 ||
11170 ntp
->tier_of
!= pool_id
||
11171 np
->read_tier
== tierpool_id
) {
11172 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11175 np
->tiers
.erase(tierpool_id
);
11177 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
11178 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11179 get_last_committed() + 1));
11181 } else if (prefix
== "osd tier set-overlay") {
11182 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11183 if (err
== -EAGAIN
)
11188 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
11189 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11191 ss
<< "unrecognized pool '" << poolstr
<< "'";
11195 string overlaypoolstr
;
11196 cmd_getval_throws(g_ceph_context
, cmdmap
, "overlaypool", overlaypoolstr
);
11197 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
11198 if (overlaypool_id
< 0) {
11199 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
11203 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11205 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
11207 if (p
->tiers
.count(overlaypool_id
) == 0) {
11208 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
11212 if (p
->read_tier
== overlaypool_id
) {
11214 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
11217 if (p
->has_read_tier()) {
11218 ss
<< "pool '" << poolstr
<< "' has overlay '"
11219 << osdmap
.get_pool_name(p
->read_tier
)
11220 << "'; please remove-overlay first";
11226 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11227 np
->read_tier
= overlaypool_id
;
11228 np
->write_tier
= overlaypool_id
;
11229 np
->set_last_force_op_resend(pending_inc
.epoch
);
11230 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
11231 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
11232 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
11233 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
11234 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
11235 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11236 get_last_committed() + 1));
11238 } else if (prefix
== "osd tier remove-overlay" ||
11239 prefix
== "osd tier rm-overlay") {
11241 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
11242 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11244 ss
<< "unrecognized pool '" << poolstr
<< "'";
11248 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11250 if (!p
->has_read_tier()) {
11252 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
11256 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
11261 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11262 if (np
->has_read_tier()) {
11263 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
11264 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
11265 nop
->set_last_force_op_resend(pending_inc
.epoch
);
11267 if (np
->has_write_tier()) {
11268 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
11269 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
11270 nop
->set_last_force_op_resend(pending_inc
.epoch
);
11272 np
->clear_read_tier();
11273 np
->clear_write_tier();
11274 np
->set_last_force_op_resend(pending_inc
.epoch
);
11275 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
11276 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11277 get_last_committed() + 1));
11279 } else if (prefix
== "osd tier cache-mode") {
11280 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11281 if (err
== -EAGAIN
)
11286 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
11287 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11289 ss
<< "unrecognized pool '" << poolstr
<< "'";
11293 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11295 if (!p
->is_tier()) {
11296 ss
<< "pool '" << poolstr
<< "' is not a tier";
11301 cmd_getval_throws(g_ceph_context
, cmdmap
, "mode", modestr
);
11302 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
11304 ss
<< "'" << modestr
<< "' is not a valid cache mode";
11310 cmd_getval_throws(g_ceph_context
, cmdmap
, "sure", sure
);
11311 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11312 mode
!= pg_pool_t::CACHEMODE_NONE
&&
11313 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11314 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
11315 sure
!= "--yes-i-really-mean-it") {
11316 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
11317 << "corrupt your data. pass --yes-i-really-mean-it to force.";
11322 // pool already has this cache-mode set and there are no pending changes
11323 if (p
->cache_mode
== mode
&&
11324 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
11325 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
11326 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
11327 << " to " << pg_pool_t::get_cache_mode_name(mode
);
11332 /* Mode description:
11334 * none: No cache-mode defined
11335 * forward: Forward all reads and writes to base pool
11336 * writeback: Cache writes, promote reads from base pool
11337 * readonly: Forward writes to base pool
11338 * readforward: Writes are in writeback mode, Reads are in forward mode
11339 * proxy: Proxy all reads and writes to base pool
11340 * readproxy: Writes are in writeback mode, Reads are in proxy mode
11342 * Hence, these are the allowed transitions:
11345 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11346 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11347 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
11348 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
11349 * writeback -> readforward || readproxy || forward || proxy
11353 // We check if the transition is valid against the current pool mode, as
11354 // it is the only committed state thus far. We will blantly squash
11355 // whatever mode is on the pending state.
11357 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
11358 (mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11359 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11360 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11361 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
11362 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
11363 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
11364 << "' pool; only '"
11365 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD
)
11367 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
11369 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD
)
11371 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
11376 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
11377 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11378 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11379 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11380 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
11382 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
11383 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11384 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11385 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11386 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
11388 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
11389 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11390 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11391 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11392 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
11394 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
11395 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11396 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11397 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11398 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
11400 const pool_stat_t
* pstats
=
11401 mon
->pgservice
->get_pool_stat(pool_id
);
11403 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
11404 ss
<< "unable to set cache-mode '"
11405 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
11406 << "': dirty objects found";
11412 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11413 np
->cache_mode
= mode
;
11414 // set this both when moving to and from cache_mode NONE. this is to
11415 // capture legacy pools that were set up before this flag existed.
11416 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
11417 ss
<< "set cache-mode for pool '" << poolstr
11418 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
11419 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
11420 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
11422 if (base_pool
->read_tier
== pool_id
||
11423 base_pool
->write_tier
== pool_id
)
11424 ss
<<" (WARNING: pool is still configured as read or write tier)";
11426 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11427 get_last_committed() + 1));
11429 } else if (prefix
== "osd tier add-cache") {
11430 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11431 if (err
== -EAGAIN
)
11436 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
11437 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11439 ss
<< "unrecognized pool '" << poolstr
<< "'";
11443 string tierpoolstr
;
11444 cmd_getval_throws(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
11445 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11446 if (tierpool_id
< 0) {
11447 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11451 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11453 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11456 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
11461 if (!cmd_getval_throws(g_ceph_context
, cmdmap
, "size", size
)) {
11462 ss
<< "unable to parse 'size' value '"
11463 << cmd_vartype_stringify(cmdmap
["size"]) << "'";
11467 // make sure new tier is empty
11468 const pool_stat_t
*pstats
=
11469 mon
->pgservice
->get_pool_stat(tierpool_id
);
11470 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
11471 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
11475 string modestr
= g_conf
->osd_tier_default_cache_mode
;
11476 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
11478 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
11482 HitSet::Params hsp
;
11483 if (g_conf
->osd_tier_default_cache_hit_set_type
== "bloom") {
11484 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
11485 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
11486 hsp
= HitSet::Params(bsp
);
11487 } else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_hash") {
11488 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
11490 else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_object") {
11491 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
11493 ss
<< "osd tier cache default hit set type '" <<
11494 g_conf
->osd_tier_default_cache_hit_set_type
<< "' is not a known type";
11499 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11500 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11501 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
11502 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11505 np
->tiers
.insert(tierpool_id
);
11506 np
->read_tier
= np
->write_tier
= tierpool_id
;
11507 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
11508 np
->set_last_force_op_resend(pending_inc
.epoch
);
11509 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
11510 ntp
->tier_of
= pool_id
;
11511 ntp
->cache_mode
= mode
;
11512 ntp
->hit_set_count
= g_conf
->osd_tier_default_cache_hit_set_count
;
11513 ntp
->hit_set_period
= g_conf
->osd_tier_default_cache_hit_set_period
;
11514 ntp
->min_read_recency_for_promote
= g_conf
->osd_tier_default_cache_min_read_recency_for_promote
;
11515 ntp
->min_write_recency_for_promote
= g_conf
->osd_tier_default_cache_min_write_recency_for_promote
;
11516 ntp
->hit_set_grade_decay_rate
= g_conf
->osd_tier_default_cache_hit_set_grade_decay_rate
;
11517 ntp
->hit_set_search_last_n
= g_conf
->osd_tier_default_cache_hit_set_search_last_n
;
11518 ntp
->hit_set_params
= hsp
;
11519 ntp
->target_max_bytes
= size
;
11520 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
11521 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11522 get_last_committed() + 1));
11524 } else if (prefix
== "osd pool set-quota") {
11526 cmd_getval_throws(g_ceph_context
, cmdmap
, "pool", poolstr
);
11527 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11529 ss
<< "unrecognized pool '" << poolstr
<< "'";
11535 cmd_getval_throws(g_ceph_context
, cmdmap
, "field", field
);
11536 if (field
!= "max_objects" && field
!= "max_bytes") {
11537 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
11542 // val could contain unit designations, so we treat as a string
11544 cmd_getval_throws(g_ceph_context
, cmdmap
, "val", val
);
11547 if (field
== "max_objects") {
11548 value
= strict_sistrtoll(val
.c_str(), &tss
);
11549 } else if (field
== "max_bytes") {
11550 value
= strict_iecstrtoll(val
.c_str(), &tss
);
11552 assert(0 == "unrecognized option");
11554 if (!tss
.empty()) {
11555 ss
<< "error parsing value '" << val
<< "': " << tss
;
11560 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
11561 if (field
== "max_objects") {
11562 pi
->quota_max_objects
= value
;
11563 } else if (field
== "max_bytes") {
11564 pi
->quota_max_bytes
= value
;
11566 assert(0 == "unrecognized option");
11568 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
11570 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11571 get_last_committed() + 1));
11573 } else if (prefix
== "osd pool application enable" ||
11574 prefix
== "osd pool application disable" ||
11575 prefix
== "osd pool application set" ||
11576 prefix
== "osd pool application rm") {
11577 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
11578 if (err
== -EAGAIN
)
11584 wait_for_finished_proposal(
11585 op
, new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
11587 } else if (prefix
== "osd reweight-by-pg" ||
11588 prefix
== "osd reweight-by-utilization" ||
11589 prefix
== "osd test-reweight-by-pg" ||
11590 prefix
== "osd test-reweight-by-utilization") {
11592 prefix
== "osd reweight-by-pg" || prefix
== "osd test-reweight-by-pg";
11594 prefix
== "osd test-reweight-by-pg" ||
11595 prefix
== "osd test-reweight-by-utilization";
11597 cmd_getval(g_ceph_context
, cmdmap
, "oload", oload
, int64_t(120));
11598 set
<int64_t> pools
;
11599 vector
<string
> poolnamevec
;
11600 cmd_getval(g_ceph_context
, cmdmap
, "pools", poolnamevec
);
11601 for (unsigned j
= 0; j
< poolnamevec
.size(); j
++) {
11602 int64_t pool
= osdmap
.lookup_pg_pool_name(poolnamevec
[j
]);
11604 ss
<< "pool '" << poolnamevec
[j
] << "' does not exist";
11608 pools
.insert(pool
);
11610 double max_change
= g_conf
->mon_reweight_max_change
;
11611 cmd_getval(g_ceph_context
, cmdmap
, "max_change", max_change
);
11612 if (max_change
<= 0.0) {
11613 ss
<< "max_change " << max_change
<< " must be positive";
11617 int64_t max_osds
= g_conf
->mon_reweight_max_osds
;
11618 cmd_getval(g_ceph_context
, cmdmap
, "max_osds", max_osds
);
11619 if (max_osds
<= 0) {
11620 ss
<< "max_osds " << max_osds
<< " must be positive";
11624 string no_increasing
;
11625 cmd_getval(g_ceph_context
, cmdmap
, "no_increasing", no_increasing
);
11627 mempool::osdmap::map
<int32_t, uint32_t> new_weights
;
11628 err
= mon
->pgservice
->reweight_by_utilization(osdmap
,
11633 pools
.empty() ? NULL
: &pools
,
11634 no_increasing
== "--no-increasing",
11636 &ss
, &out_str
, f
.get());
11638 dout(10) << "reweight::by_utilization: finished with " << out_str
<< dendl
;
11643 rdata
.append(out_str
);
11645 ss
<< "FAILED reweight-by-pg";
11646 } else if (err
== 0 || dry_run
) {
11649 ss
<< "SUCCESSFUL reweight-by-pg";
11650 pending_inc
.new_weight
= std::move(new_weights
);
11651 wait_for_finished_proposal(
11653 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
11656 } else if (prefix
== "osd force-create-pg") {
11659 cmd_getval_throws(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
11660 if (!pgid
.parse(pgidstr
.c_str())) {
11661 ss
<< "invalid pgid '" << pgidstr
<< "'";
11665 if (!osdmap
.pg_exists(pgid
)) {
11666 ss
<< "pg " << pgid
<< " should not exist";
11672 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
11673 auto emplaced
= creating_pgs
.pgs
.emplace(pgid
,
11674 make_pair(osdmap
.get_epoch(),
11675 ceph_clock_now()));
11676 creating_now
= emplaced
.second
;
11678 if (creating_now
) {
11679 ss
<< "pg " << pgidstr
<< " now creating, ok";
11683 ss
<< "pg " << pgid
<< " already creating";
11693 if (err
< 0 && rs
.length() == 0)
11694 rs
= cpp_strerror(err
);
11695 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
11700 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11701 get_last_committed() + 1));
11705 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11709 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
11711 op
->mark_osdmon_event(__func__
);
11713 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11714 MonSession
*session
= m
->get_session();
11716 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
11721 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11722 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11724 const std::string
* pool_name
= nullptr;
11725 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
11726 if (pg_pool
!= nullptr) {
11727 pool_name
= &osdmap
.get_pool_name(m
->pool
);
11730 if (!is_unmanaged_snap_op_permitted(cct
, mon
->key_server
,
11731 session
->entity_name
, session
->caps
,
11733 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
11734 << "privileges. message: " << *m
<< std::endl
11735 << "caps: " << session
->caps
<< dendl
;
11736 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
11742 if (!session
->is_capable("osd", MON_CAP_W
)) {
11743 dout(0) << "got pool op from entity with insufficient privileges. "
11744 << "message: " << *m
<< std::endl
11745 << "caps: " << session
->caps
<< dendl
;
11746 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
11755 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
11757 op
->mark_osdmon_event(__func__
);
11758 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11760 if (enforce_pool_op_caps(op
)) {
11764 if (m
->fsid
!= mon
->monmap
->fsid
) {
11765 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
11766 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
11767 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11771 if (m
->op
== POOL_OP_CREATE
)
11772 return preprocess_pool_op_create(op
);
11774 if (!osdmap
.get_pg_pool(m
->pool
)) {
11775 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
11776 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11780 // check if the snap and snapname exist
11781 bool snap_exists
= false;
11782 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
11783 if (p
->snap_exists(m
->name
.c_str()))
11784 snap_exists
= true;
11787 case POOL_OP_CREATE_SNAP
:
11788 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
11789 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11793 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11797 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11798 if (p
->is_pool_snaps_mode()) {
11799 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11803 case POOL_OP_DELETE_SNAP
:
11804 if (p
->is_unmanaged_snaps_mode()) {
11805 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11808 if (!snap_exists
) {
11809 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11813 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11814 if (p
->is_pool_snaps_mode()) {
11815 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11818 if (p
->is_removed_snap(m
->snapid
)) {
11819 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11823 case POOL_OP_DELETE
:
11824 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
11825 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11829 case POOL_OP_AUID_CHANGE
:
11839 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
11841 op
->mark_osdmon_event(__func__
);
11842 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11843 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
11845 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11852 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
11854 op
->mark_osdmon_event(__func__
);
11855 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11856 dout(10) << "prepare_pool_op " << *m
<< dendl
;
11857 if (m
->op
== POOL_OP_CREATE
) {
11858 return prepare_pool_op_create(op
);
11859 } else if (m
->op
== POOL_OP_DELETE
) {
11860 return prepare_pool_op_delete(op
);
11864 bool changed
= false;
11866 if (!osdmap
.have_pg_pool(m
->pool
)) {
11867 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
11871 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
11874 case POOL_OP_CREATE_SNAP
:
11875 if (pool
->is_tier()) {
11877 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
11879 } // else, fall through
11880 case POOL_OP_DELETE_SNAP
:
11881 if (!pool
->is_unmanaged_snaps_mode()) {
11882 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
11883 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
11884 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
11892 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
11895 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11896 // we won't allow removal of an unmanaged snapshot from a pool
11897 // not in unmanaged snaps mode.
11898 if (!pool
->is_unmanaged_snaps_mode()) {
11899 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
11903 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11904 // but we will allow creating an unmanaged snapshot on any pool
11905 // as long as it is not in 'pool' snaps mode.
11906 if (pool
->is_pool_snaps_mode()) {
11907 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11912 // projected pool info
11914 if (pending_inc
.new_pools
.count(m
->pool
))
11915 pp
= pending_inc
.new_pools
[m
->pool
];
11917 pp
= *osdmap
.get_pg_pool(m
->pool
);
11919 bufferlist reply_data
;
11921 // pool snaps vs unmanaged snaps are mutually exclusive
11923 case POOL_OP_CREATE_SNAP
:
11924 case POOL_OP_DELETE_SNAP
:
11925 if (pp
.is_unmanaged_snaps_mode()) {
11931 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11932 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11933 if (pp
.is_pool_snaps_mode()) {
11940 case POOL_OP_CREATE_SNAP
:
11941 if (!pp
.snap_exists(m
->name
.c_str())) {
11942 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
11943 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
<< " seq " << pp
.get_snap_epoch() << dendl
;
11948 case POOL_OP_DELETE_SNAP
:
11950 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
11958 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11961 pp
.add_unmanaged_snap(snapid
);
11962 ::encode(snapid
, reply_data
);
11967 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11968 if (!pp
.is_removed_snap(m
->snapid
)) {
11969 if (m
->snapid
> pp
.get_snap_seq()) {
11970 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
11973 pp
.remove_unmanaged_snap(m
->snapid
);
11978 case POOL_OP_AUID_CHANGE
:
11979 if (pp
.auid
!= m
->auid
) {
11991 pp
.set_snap_epoch(pending_inc
.epoch
);
11992 pending_inc
.new_pools
[m
->pool
] = pp
;
11996 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
12000 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
12002 op
->mark_osdmon_event(__func__
);
12003 int err
= prepare_new_pool(op
);
12004 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
12008 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
12011 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
12013 // If the Pool is in use by CephFS, refuse to delete it
12014 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12015 if (pending_fsmap
.pool_in_use(pool_id
)) {
12016 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
12020 if (pool
.tier_of
>= 0) {
12021 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
12022 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
12025 if (!pool
.tiers
.empty()) {
12026 *ss
<< "pool '" << poolstr
<< "' has tiers";
12027 for(auto tier
: pool
.tiers
) {
12028 *ss
<< " " << osdmap
.get_pool_name(tier
);
12033 if (!g_conf
->mon_allow_pool_delete
) {
12034 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
12038 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
12039 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
12043 *ss
<< "pool '" << poolstr
<< "' removed";
12048 * Check if it is safe to add a tier to a base pool
12051 * True if the operation should proceed, false if we should abort here
12052 * (abort doesn't necessarily mean error, could be idempotency)
12054 bool OSDMonitor::_check_become_tier(
12055 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
12056 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
12060 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
12061 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
12063 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12064 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
12065 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
12070 if (base_pool
->tiers
.count(tier_pool_id
)) {
12071 assert(tier_pool
->tier_of
== base_pool_id
);
12073 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
12074 << base_pool_name
<< "'";
12078 if (base_pool
->is_tier()) {
12079 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
12080 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
12081 << "multiple tiers are not yet supported.";
12086 if (tier_pool
->has_tiers()) {
12087 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
12088 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
12089 it
!= tier_pool
->tiers
.end(); ++it
)
12090 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
12091 *ss
<< " multiple tiers are not yet supported.";
12096 if (tier_pool
->is_tier()) {
12097 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
12098 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
12109 * Check if it is safe to remove a tier from this base pool
12112 * True if the operation should proceed, false if we should abort here
12113 * (abort doesn't necessarily mean error, could be idempotency)
12115 bool OSDMonitor::_check_remove_tier(
12116 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
12117 const pg_pool_t
*tier_pool
,
12118 int *err
, ostream
*ss
) const
12120 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
12122 // Apply CephFS-specific checks
12123 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
12124 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
12125 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
12126 // If the underlying pool is erasure coded and does not allow EC
12127 // overwrites, we can't permit the removal of the replicated tier that
12128 // CephFS relies on to access it
12129 *ss
<< "pool '" << base_pool_name
<<
12130 "' does not allow EC overwrites and is in use by CephFS"
12136 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
12137 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
12138 "tier is still in use as a writeback cache. Change the cache "
12139 "mode and flush the cache before removing it";
12149 int OSDMonitor::_prepare_remove_pool(
12150 int64_t pool
, ostream
*ss
, bool no_fake
)
12152 dout(10) << __func__
<< " " << pool
<< dendl
;
12153 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12154 int r
= _check_remove_pool(pool
, *p
, ss
);
12158 auto new_pool
= pending_inc
.new_pools
.find(pool
);
12159 if (new_pool
!= pending_inc
.new_pools
.end()) {
12160 // if there is a problem with the pending info, wait and retry
12162 const auto& p
= new_pool
->second
;
12163 int r
= _check_remove_pool(pool
, p
, ss
);
12168 if (pending_inc
.old_pools
.count(pool
)) {
12169 dout(10) << __func__
<< " " << pool
<< " already pending removal"
12174 if (g_conf
->mon_fake_pool_delete
&& !no_fake
) {
12175 string old_name
= osdmap
.get_pool_name(pool
);
12176 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
12177 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
12178 << old_name
<< " -> " << new_name
<< dendl
;
12179 pending_inc
.new_pool_names
[pool
] = new_name
;
12184 pending_inc
.old_pools
.insert(pool
);
12186 // remove any pg_temp mappings for this pool
12187 for (auto p
= osdmap
.pg_temp
->begin();
12188 p
!= osdmap
.pg_temp
->end();
12190 if (p
->first
.pool() == (uint64_t)pool
) {
12191 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
12192 << p
->first
<< dendl
;
12193 pending_inc
.new_pg_temp
[p
->first
].clear();
12196 // remove any primary_temp mappings for this pool
12197 for (auto p
= osdmap
.primary_temp
->begin();
12198 p
!= osdmap
.primary_temp
->end();
12200 if (p
->first
.pool() == (uint64_t)pool
) {
12201 dout(10) << __func__
<< " " << pool
12202 << " removing obsolete primary_temp" << p
->first
<< dendl
;
12203 pending_inc
.new_primary_temp
[p
->first
] = -1;
12206 // remove any pg_upmap mappings for this pool
12207 for (auto& p
: osdmap
.pg_upmap
) {
12208 if (p
.first
.pool() == (uint64_t)pool
) {
12209 dout(10) << __func__
<< " " << pool
12210 << " removing obsolete pg_upmap "
12211 << p
.first
<< dendl
;
12212 pending_inc
.old_pg_upmap
.insert(p
.first
);
12215 // remove any pending pg_upmap mappings for this pool
12217 auto it
= pending_inc
.new_pg_upmap
.begin();
12218 while (it
!= pending_inc
.new_pg_upmap
.end()) {
12219 if (it
->first
.pool() == (uint64_t)pool
) {
12220 dout(10) << __func__
<< " " << pool
12221 << " removing pending pg_upmap "
12222 << it
->first
<< dendl
;
12223 it
= pending_inc
.new_pg_upmap
.erase(it
);
12229 // remove any pg_upmap_items mappings for this pool
12230 for (auto& p
: osdmap
.pg_upmap_items
) {
12231 if (p
.first
.pool() == (uint64_t)pool
) {
12232 dout(10) << __func__
<< " " << pool
12233 << " removing obsolete pg_upmap_items " << p
.first
12235 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
12238 // remove any pending pg_upmap mappings for this pool
12240 auto it
= pending_inc
.new_pg_upmap_items
.begin();
12241 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
12242 if (it
->first
.pool() == (uint64_t)pool
) {
12243 dout(10) << __func__
<< " " << pool
12244 << " removing pending pg_upmap_items "
12245 << it
->first
<< dendl
;
12246 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
12253 // remove any choose_args for this pool
12254 CrushWrapper newcrush
;
12255 _get_pending_crush(newcrush
);
12256 if (newcrush
.have_choose_args(pool
)) {
12257 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
12258 newcrush
.rm_choose_args(pool
);
12259 pending_inc
.crush
.clear();
12260 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
12265 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
12267 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
12268 if (pending_inc
.old_pools
.count(pool
)) {
12269 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
12272 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
12273 p
!= pending_inc
.new_pool_names
.end();
12275 if (p
->second
== newname
&& p
->first
!= pool
) {
12280 pending_inc
.new_pool_names
[pool
] = newname
;
12284 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
12286 op
->mark_osdmon_event(__func__
);
12287 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12289 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
12290 if (ret
== -EAGAIN
) {
12291 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12295 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
12296 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
12297 pending_inc
.epoch
));
12301 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
12302 int ret
, epoch_t epoch
, bufferlist
*blp
)
12304 op
->mark_osdmon_event(__func__
);
12305 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12306 dout(20) << "_pool_op_reply " << ret
<< dendl
;
12307 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
12308 ret
, epoch
, get_last_committed(), blp
);
12309 mon
->send_reply(op
, reply
);