1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
20 #include <boost/algorithm/string.hpp>
24 #include "mon/OSDMonitor.h"
25 #include "mon/Monitor.h"
26 #include "mon/MDSMonitor.h"
27 #include "mon/PGMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreated.h"
51 #include "messages/MOSDPGTemp.h"
52 #include "messages/MMonCommand.h"
53 #include "messages/MRemoveSnaps.h"
54 #include "messages/MOSDScrub.h"
55 #include "messages/MRoute.h"
57 #include "common/TextTable.h"
58 #include "common/Timer.h"
59 #include "common/ceph_argparse.h"
60 #include "common/perf_counters.h"
61 #include "common/strtol.h"
63 #include "common/config.h"
64 #include "common/errno.h"
66 #include "erasure-code/ErasureCodePlugin.h"
67 #include "compressor/Compressor.h"
68 #include "common/Checksummer.h"
70 #include "include/compat.h"
71 #include "include/assert.h"
72 #include "include/stringify.h"
73 #include "include/util.h"
74 #include "common/cmdparse.h"
75 #include "include/str_list.h"
76 #include "include/str_map.h"
77 #include "include/scope_guard.h"
79 #include "json_spirit/json_spirit_reader.h"
81 #define dout_subsys ceph_subsys_mon
82 #define OSD_PG_CREATING_PREFIX "osd_pg_creating"
84 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
86 if (epoch_by_pg
.size() <= ps
) {
87 epoch_by_pg
.resize(ps
+ 1, 0);
89 const auto old_lec
= epoch_by_pg
[ps
];
90 if (old_lec
>= last_epoch_clean
) {
94 epoch_by_pg
[ps
] = last_epoch_clean
;
95 if (last_epoch_clean
< floor
) {
96 floor
= last_epoch_clean
;
97 } else if (last_epoch_clean
> floor
) {
98 if (old_lec
== floor
) {
99 // probably should increase floor?
100 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
101 std::end(epoch_by_pg
));
105 if (ps
!= next_missing
) {
108 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
109 if (epoch_by_pg
[next_missing
] == 0) {
115 void LastEpochClean::remove_pool(uint64_t pool
)
117 report_by_pool
.erase(pool
);
120 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
122 auto& lec
= report_by_pool
[pg
.pool()];
123 return lec
.report(pg
.ps(), last_epoch_clean
);
126 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
128 auto floor
= latest
.get_epoch();
129 for (auto& pool
: latest
.get_pools()) {
130 auto reported
= report_by_pool
.find(pool
.first
);
131 if (reported
== report_by_pool
.end()) {
134 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
137 if (reported
->second
.floor
< floor
) {
138 floor
= reported
->second
.floor
;
145 struct C_UpdateCreatingPGs
: public Context
{
149 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
150 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
151 void finish(int r
) override
{
153 utime_t end
= ceph_clock_now();
154 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
155 << (end
- start
) << " seconds" << dendl
;
156 osdmon
->update_creating_pgs();
157 osdmon
->check_pg_creates_subs();
163 #define dout_prefix _prefix(_dout, mon, osdmap)
164 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
165 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
166 << "(" << mon
->get_state_name()
167 << ").osd e" << osdmap
.get_epoch() << " ";
170 OSDMonitor::OSDMonitor(
174 const string
& service_name
)
175 : PaxosService(mn
, p
, service_name
),
177 inc_osd_cache(g_conf
->mon_osd_cache_size
),
178 full_osd_cache(g_conf
->mon_osd_cache_size
),
179 last_attempted_minwait_time(utime_t()),
180 mapper(mn
->cct
, &mn
->cpu_tp
),
181 op_tracker(cct
, true, 1)
184 bool OSDMonitor::_have_pending_crush()
186 return pending_inc
.crush
.length() > 0;
189 CrushWrapper
&OSDMonitor::_get_stable_crush()
191 return *osdmap
.crush
;
194 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
197 if (pending_inc
.crush
.length())
198 bl
= pending_inc
.crush
;
200 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
202 bufferlist::iterator p
= bl
.begin();
206 void OSDMonitor::create_initial()
208 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
213 mon
->store
->get("mkfs", "osdmap", bl
);
217 newmap
.set_fsid(mon
->monmap
->fsid
);
219 newmap
.build_simple(g_ceph_context
, 0, mon
->monmap
->fsid
, 0);
222 newmap
.created
= newmap
.modified
= ceph_clock_now();
224 // new clusters should sort bitwise by default.
225 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
227 // new cluster should require latest by default
228 if (g_conf
->mon_debug_no_require_luminous
) {
229 newmap
.require_osd_release
= CEPH_RELEASE_KRAKEN
;
230 derr
<< __func__
<< " mon_debug_no_require_luminous=true" << dendl
;
232 newmap
.require_osd_release
= CEPH_RELEASE_LUMINOUS
;
233 newmap
.full_ratio
= g_conf
->mon_osd_full_ratio
;
234 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
235 newmap
.backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
236 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
237 newmap
.nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
238 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
239 int r
= ceph_release_from_name(
240 g_conf
->mon_osd_initial_require_min_compat_client
.c_str());
242 assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
244 newmap
.require_min_compat_client
= r
;
247 // encode into pending incremental
248 newmap
.encode(pending_inc
.fullmap
,
249 mon
->get_quorum_con_features() | CEPH_FEATURE_RESERVED
);
250 pending_inc
.full_crc
= newmap
.get_crc();
251 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
254 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
)
256 s
.insert(service_name
);
257 s
.insert(OSD_PG_CREATING_PREFIX
);
260 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
262 version_t version
= get_last_committed();
263 if (version
== osdmap
.epoch
)
265 assert(version
> osdmap
.epoch
);
267 dout(15) << "update_from_paxos paxos e " << version
268 << ", my e " << osdmap
.epoch
<< dendl
;
271 if (!mapping_job
->is_done()) {
272 dout(1) << __func__
<< " mapping job "
273 << mapping_job
.get() << " did not complete, "
274 << mapping_job
->shards
<< " left, canceling" << dendl
;
275 mapping_job
->abort();
283 * We will possibly have a stashed latest that *we* wrote, and we will
284 * always be sure to have the oldest full map in the first..last range
285 * due to encode_trim_extra(), which includes the oldest full map in the trim
288 * encode_trim_extra() does not however write the full map's
289 * version to 'full_latest'. This is only done when we are building the
290 * full maps from the incremental versions. But don't panic! We make sure
291 * that the following conditions find whichever full map version is newer.
293 version_t latest_full
= get_version_latest_full();
294 if (latest_full
== 0 && get_first_committed() > 1)
295 latest_full
= get_first_committed();
297 if (get_first_committed() > 1 &&
298 latest_full
< get_first_committed()) {
299 // the monitor could be just sync'ed with its peer, and the latest_full key
300 // is not encoded in the paxos commits in encode_pending(), so we need to
301 // make sure we get it pointing to a proper version.
302 version_t lc
= get_last_committed();
303 version_t fc
= get_first_committed();
305 dout(10) << __func__
<< " looking for valid full map in interval"
306 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
309 for (version_t v
= lc
; v
>= fc
; v
--) {
310 string full_key
= "full_" + stringify(v
);
311 if (mon
->store
->exists(get_service_name(), full_key
)) {
312 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
318 assert(latest_full
> 0);
319 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
320 put_version_latest_full(t
, latest_full
);
321 mon
->store
->apply_transaction(t
);
322 dout(10) << __func__
<< " updated the on-disk full map version to "
323 << latest_full
<< dendl
;
326 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
327 bufferlist latest_bl
;
328 get_version_full(latest_full
, latest_bl
);
329 assert(latest_bl
.length() != 0);
330 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
331 osdmap
.decode(latest_bl
);
334 if (mon
->monmap
->get_required_features().contains_all(
335 ceph::features::mon::FEATURE_LUMINOUS
)) {
337 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
339 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
340 creating_pgs
.decode(p
);
341 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
342 << creating_pgs
.last_scan_epoch
343 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
345 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
350 // make sure we're using the right pg service.. remove me post-luminous!
351 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
352 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
353 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
355 dout(10) << __func__
<< " pgservice is pg" << dendl
;
356 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
359 // walk through incrementals
360 MonitorDBStore::TransactionRef t
;
362 while (version
> osdmap
.epoch
) {
364 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
366 assert(inc_bl
.length());
368 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
370 OSDMap::Incremental
inc(inc_bl
);
371 err
= osdmap
.apply_incremental(inc
);
375 t
.reset(new MonitorDBStore::Transaction
);
377 // Write out the full map for all past epochs. Encode the full
378 // map with the same features as the incremental. If we don't
379 // know, use the quorum features. If we don't know those either,
380 // encode with all features.
381 uint64_t f
= inc
.encode_features
;
383 f
= mon
->get_quorum_con_features();
387 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
388 tx_size
+= full_bl
.length();
390 bufferlist orig_full_bl
;
391 get_version_full(osdmap
.epoch
, orig_full_bl
);
392 if (orig_full_bl
.length()) {
393 // the primary provided the full map
394 assert(inc
.have_crc
);
395 if (inc
.full_crc
!= osdmap
.crc
) {
396 // This will happen if the mons were running mixed versions in
397 // the past or some other circumstance made the full encoded
398 // maps divergent. Reloading here will bring us back into
399 // sync with the primary for this and all future maps. OSDs
400 // will also be brought back into sync when they discover the
401 // crc mismatch and request a full map from a mon.
402 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
405 osdmap
.decode(orig_full_bl
);
408 assert(!inc
.have_crc
);
409 put_version_full(t
, osdmap
.epoch
, full_bl
);
411 put_version_latest_full(t
, osdmap
.epoch
);
414 dout(1) << osdmap
<< dendl
;
416 if (osdmap
.epoch
== 1) {
417 t
->erase("mkfs", "osdmap");
420 // make sure we're using the right pg service.. remove me post-luminous!
421 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
422 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
423 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
425 dout(10) << __func__
<< " pgservice is pg" << dendl
;
426 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
429 if (tx_size
> g_conf
->mon_sync_max_payload_size
*2) {
430 mon
->store
->apply_transaction(t
);
431 t
= MonitorDBStore::TransactionRef();
434 if (mon
->monmap
->get_required_features().contains_all(
435 ceph::features::mon::FEATURE_LUMINOUS
)) {
436 for (const auto &osd_state
: inc
.new_state
) {
437 if (osd_state
.second
& CEPH_OSD_UP
) {
438 // could be marked up *or* down, but we're too lazy to check which
439 last_osd_report
.erase(osd_state
.first
);
441 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
442 // could be created *or* destroyed, but we can safely drop it
443 osd_epochs
.erase(osd_state
.first
);
450 mon
->store
->apply_transaction(t
);
453 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
454 if (osdmap
.is_out(o
))
456 auto found
= down_pending_out
.find(o
);
457 if (osdmap
.is_down(o
)) {
458 // populate down -> out map
459 if (found
== down_pending_out
.end()) {
460 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
461 down_pending_out
[o
] = ceph_clock_now();
464 if (found
!= down_pending_out
.end()) {
465 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
466 down_pending_out
.erase(found
);
470 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
472 if (mon
->is_leader()) {
473 // kick pgmon, make sure it's seen the latest map
474 mon
->pgmon()->check_osd_map(osdmap
.epoch
);
478 check_pg_creates_subs();
480 share_map_with_random_osd();
485 // make sure our feature bits reflect the latest map
486 update_msgr_features();
488 if (!mon
->is_leader()) {
489 // will be called by on_active() on the leader, avoid doing so twice
494 void OSDMonitor::start_mapping()
496 // initiate mapping job
498 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
500 mapping_job
->abort();
502 if (!osdmap
.get_pools().empty()) {
503 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
504 mapping_job
= mapping
.start_update(osdmap
, mapper
,
505 g_conf
->mon_osd_mapping_pgs_per_chunk
);
506 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
507 << " at " << fin
->start
<< dendl
;
508 mapping_job
->set_finish_event(fin
);
510 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
511 mapping_job
= nullptr;
515 void OSDMonitor::update_msgr_features()
518 types
.insert((int)entity_name_t::TYPE_OSD
);
519 types
.insert((int)entity_name_t::TYPE_CLIENT
);
520 types
.insert((int)entity_name_t::TYPE_MDS
);
521 types
.insert((int)entity_name_t::TYPE_MON
);
522 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
524 uint64_t features
= osdmap
.get_features(*q
, &mask
);
525 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
526 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
527 Messenger::Policy p
= mon
->messenger
->get_policy(*q
);
528 p
.features_required
= (p
.features_required
& ~mask
) | features
;
529 mon
->messenger
->set_policy(*q
, p
);
534 void OSDMonitor::on_active()
538 if (mon
->is_leader()) {
539 mon
->clog
->debug() << "osdmap " << osdmap
;
541 list
<MonOpRequestRef
> ls
;
542 take_all_failures(ls
);
543 while (!ls
.empty()) {
544 MonOpRequestRef op
= ls
.front();
545 op
->mark_osdmon_event(__func__
);
553 void OSDMonitor::on_restart()
555 last_osd_report
.clear();
557 if (mon
->is_leader()) {
558 // fix ruleset != ruleid
559 if (osdmap
.crush
->has_legacy_rulesets() &&
560 !osdmap
.crush
->has_multirule_rulesets()) {
561 CrushWrapper newcrush
;
562 _get_pending_crush(newcrush
);
563 int r
= newcrush
.renumber_rules_by_ruleset();
565 dout(1) << __func__
<< " crush map has ruleset != rule id; fixing" << dendl
;
566 pending_inc
.crush
.clear();
567 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
569 dout(10) << __func__
<< " unable to renumber rules by ruleset" << dendl
;
575 void OSDMonitor::on_shutdown()
577 dout(10) << __func__
<< dendl
;
579 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
581 mapping_job
->abort();
584 // discard failure info, waiters
585 list
<MonOpRequestRef
> ls
;
586 take_all_failures(ls
);
590 void OSDMonitor::update_logger()
592 dout(10) << "update_logger" << dendl
;
594 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
595 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
596 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
597 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
600 void OSDMonitor::create_pending()
602 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
603 pending_inc
.fsid
= mon
->monmap
->fsid
;
605 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
607 // clean up pg_temp, primary_temp
608 OSDMap::clean_temps(g_ceph_context
, osdmap
, &pending_inc
);
609 dout(10) << "create_pending did clean_temps" << dendl
;
611 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
612 // instead of osd_backfill_full_ratio config
613 if (osdmap
.backfillfull_ratio
<= 0) {
614 pending_inc
.new_backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
615 if (pending_inc
.new_backfillfull_ratio
> 1.0)
616 pending_inc
.new_backfillfull_ratio
/= 100;
617 dout(1) << __func__
<< " setting backfillfull_ratio = "
618 << pending_inc
.new_backfillfull_ratio
<< dendl
;
620 if (osdmap
.get_epoch() > 0 &&
621 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
622 // transition full ratios from PGMap to OSDMap (on upgrade)
623 float full_ratio
= mon
->pgservice
->get_full_ratio();
624 float nearfull_ratio
= mon
->pgservice
->get_nearfull_ratio();
625 if (osdmap
.full_ratio
!= full_ratio
) {
626 dout(10) << __func__
<< " full_ratio " << osdmap
.full_ratio
627 << " -> " << full_ratio
<< " (from pgmap)" << dendl
;
628 pending_inc
.new_full_ratio
= full_ratio
;
630 if (osdmap
.nearfull_ratio
!= nearfull_ratio
) {
631 dout(10) << __func__
<< " nearfull_ratio " << osdmap
.nearfull_ratio
632 << " -> " << nearfull_ratio
<< " (from pgmap)" << dendl
;
633 pending_inc
.new_nearfull_ratio
= nearfull_ratio
;
636 // safety check (this shouldn't really happen)
637 if (osdmap
.full_ratio
<= 0) {
638 pending_inc
.new_full_ratio
= g_conf
->mon_osd_full_ratio
;
639 if (pending_inc
.new_full_ratio
> 1.0)
640 pending_inc
.new_full_ratio
/= 100;
641 dout(1) << __func__
<< " setting full_ratio = "
642 << pending_inc
.new_full_ratio
<< dendl
;
644 if (osdmap
.nearfull_ratio
<= 0) {
645 pending_inc
.new_nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
646 if (pending_inc
.new_nearfull_ratio
> 1.0)
647 pending_inc
.new_nearfull_ratio
/= 100;
648 dout(1) << __func__
<< " setting nearfull_ratio = "
649 << pending_inc
.new_nearfull_ratio
<< dendl
;
655 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
)
657 dout(10) << __func__
<< dendl
;
658 creating_pgs_t pending_creatings
;
660 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
661 pending_creatings
= creating_pgs
;
663 // check for new or old pools
664 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
665 if (osdmap
.get_epoch() &&
666 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
668 mon
->pgservice
->maybe_add_creating_pgs(creating_pgs
.last_scan_epoch
,
671 dout(7) << __func__
<< " " << added
<< " pgs added from pgmap" << dendl
;
674 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
678 queued
+= scan_for_creating_pgs(inc
.new_pools
,
682 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
683 for (auto deleted_pool
: inc
.old_pools
) {
684 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
685 dout(10) << __func__
<< " " << removed
686 << " pg removed because containing pool deleted: "
687 << deleted_pool
<< dendl
;
688 last_epoch_clean
.remove_pool(deleted_pool
);
690 // pgmon updates its creating_pgs in check_osd_map() which is called by
691 // on_active() and check_osd_map() could be delayed if lease expires, so its
692 // creating_pgs could be stale in comparison with the one of osdmon. let's
693 // trim them here. otherwise, they will be added back after being erased.
694 unsigned removed
= 0;
695 for (auto& pg
: pending_created_pgs
) {
696 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
697 pending_creatings
.created_pools
.insert(pg
.pool());
698 removed
+= pending_creatings
.pgs
.erase(pg
);
700 pending_created_pgs
.clear();
701 dout(10) << __func__
<< " " << removed
702 << " pgs removed because they're created" << dendl
;
703 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
707 unsigned max
= MAX(1, g_conf
->mon_osd_max_creating_pgs
);
708 const auto total
= pending_creatings
.pgs
.size();
709 while (pending_creatings
.pgs
.size() < max
&&
710 !pending_creatings
.queue
.empty()) {
711 auto p
= pending_creatings
.queue
.begin();
712 int64_t poolid
= p
->first
;
713 dout(10) << __func__
<< " pool " << poolid
714 << " created " << p
->second
.created
715 << " modified " << p
->second
.modified
716 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
718 int n
= MIN(max
- pending_creatings
.pgs
.size(),
719 p
->second
.end
- p
->second
.start
);
720 ps_t first
= p
->second
.start
;
721 ps_t end
= first
+ n
;
722 for (ps_t ps
= first
; ps
< end
; ++ps
) {
723 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
724 // NOTE: use the *current* epoch as the PG creation epoch so that the
725 // OSD does not have to generate a long set of PastIntervals.
726 pending_creatings
.pgs
.emplace(pgid
, make_pair(inc
.epoch
,
727 p
->second
.modified
));
728 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
730 p
->second
.start
= end
;
731 if (p
->second
.done()) {
732 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
733 pending_creatings
.queue
.erase(p
);
735 dout(10) << __func__
<< " pool " << poolid
736 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
740 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
741 << " pools" << dendl
;
742 dout(10) << __func__
<< " " << pending_creatings
.pgs
.size() - total
743 << " pgs added from queued pools" << dendl
;
744 return pending_creatings
;
747 void OSDMonitor::maybe_prime_pg_temp()
750 if (pending_inc
.crush
.length()) {
751 dout(10) << __func__
<< " new crush map, all" << dendl
;
755 if (!pending_inc
.new_up_client
.empty()) {
756 dout(10) << __func__
<< " new up osds, all" << dendl
;
760 // check for interesting OSDs
762 for (auto p
= pending_inc
.new_state
.begin();
763 !all
&& p
!= pending_inc
.new_state
.end();
765 if ((p
->second
& CEPH_OSD_UP
) &&
766 osdmap
.is_up(p
->first
)) {
767 osds
.insert(p
->first
);
770 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
771 !all
&& p
!= pending_inc
.new_weight
.end();
773 if (p
->second
< osdmap
.get_weight(p
->first
)) {
775 osds
.insert(p
->first
);
777 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
783 if (!all
&& osds
.empty())
788 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
789 if (estimate
> mapping
.get_num_pgs() *
790 g_conf
->mon_osd_prime_pg_temp_max_estimate
) {
791 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
792 << osds
.size() << " osds >= "
793 << g_conf
->mon_osd_prime_pg_temp_max_estimate
<< " of total "
794 << mapping
.get_num_pgs() << " pgs, all"
798 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
799 << osds
.size() << " osds" << dendl
;
804 next
.deepish_copy_from(osdmap
);
805 next
.apply_incremental(pending_inc
);
807 if (next
.get_pools().empty()) {
808 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
810 PrimeTempJob
job(next
, this);
811 mapper
.queue(&job
, g_conf
->mon_osd_mapping_pgs_per_chunk
);
812 if (job
.wait_for(g_conf
->mon_osd_prime_pg_temp_max_time
)) {
813 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
815 dout(10) << __func__
<< " did not finish in "
816 << g_conf
->mon_osd_prime_pg_temp_max_time
817 << ", stopping" << dendl
;
821 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
822 utime_t stop
= ceph_clock_now();
823 stop
+= g_conf
->mon_osd_prime_pg_temp_max_time
;
824 const int chunk
= 1000;
826 std::unordered_set
<pg_t
> did_pgs
;
827 for (auto osd
: osds
) {
828 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
829 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
830 for (auto pgid
: pgs
) {
831 if (!did_pgs
.insert(pgid
).second
) {
834 prime_pg_temp(next
, pgid
);
837 if (ceph_clock_now() > stop
) {
838 dout(10) << __func__
<< " consumed more than "
839 << g_conf
->mon_osd_prime_pg_temp_max_time
840 << " seconds, stopping"
850 void OSDMonitor::prime_pg_temp(
854 if (mon
->monmap
->get_required_features().contains_all(
855 ceph::features::mon::FEATURE_LUMINOUS
)) {
856 // TODO: remove this creating_pgs direct access?
857 if (creating_pgs
.pgs
.count(pgid
)) {
861 if (mon
->pgservice
->is_creating_pg(pgid
)) {
865 if (!osdmap
.pg_exists(pgid
)) {
869 vector
<int> up
, acting
;
870 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
872 vector
<int> next_up
, next_acting
;
873 int next_up_primary
, next_acting_primary
;
874 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
875 &next_acting
, &next_acting_primary
);
876 if (acting
== next_acting
)
877 return; // no change since last epoch
880 return; // if previously empty now we can be no worse off
881 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
882 if (pool
&& acting
.size() < pool
->min_size
)
883 return; // can be no worse off than before
885 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
886 << " -> " << next_up
<< "/" << next_acting
887 << ", priming " << acting
890 Mutex::Locker
l(prime_pg_temp_lock
);
891 // do not touch a mapping if a change is pending
892 pending_inc
.new_pg_temp
.emplace(
894 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
899 * @note receiving a transaction in this function gives a fair amount of
900 * freedom to the service implementation if it does need it. It shouldn't.
902 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
904 dout(10) << "encode_pending e " << pending_inc
.epoch
907 // finalize up pending_inc
908 pending_inc
.modified
= ceph_clock_now();
910 int r
= pending_inc
.propagate_snaps_to_tiers(g_ceph_context
, osdmap
);
914 if (!mapping_job
->is_done()) {
915 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
916 << mapping_job
.get() << " did not complete, "
917 << mapping_job
->shards
<< " left" << dendl
;
918 mapping_job
->abort();
919 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
920 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
921 << mapping_job
.get() << " is prior epoch "
922 << mapping
.get_epoch() << dendl
;
924 if (g_conf
->mon_osd_prime_pg_temp
) {
925 maybe_prime_pg_temp();
928 } else if (g_conf
->mon_osd_prime_pg_temp
) {
929 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
938 tmp
.deepish_copy_from(osdmap
);
939 tmp
.apply_incremental(pending_inc
);
941 if (tmp
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
942 // set or clear full/nearfull?
943 int full
, backfill
, nearfull
;
944 tmp
.count_full_nearfull_osds(&full
, &backfill
, &nearfull
);
946 if (!tmp
.test_flag(CEPH_OSDMAP_FULL
)) {
947 dout(10) << __func__
<< " setting full flag" << dendl
;
948 add_flag(CEPH_OSDMAP_FULL
);
949 remove_flag(CEPH_OSDMAP_NEARFULL
);
952 if (tmp
.test_flag(CEPH_OSDMAP_FULL
)) {
953 dout(10) << __func__
<< " clearing full flag" << dendl
;
954 remove_flag(CEPH_OSDMAP_FULL
);
957 if (!tmp
.test_flag(CEPH_OSDMAP_NEARFULL
)) {
958 dout(10) << __func__
<< " setting nearfull flag" << dendl
;
959 add_flag(CEPH_OSDMAP_NEARFULL
);
962 if (tmp
.test_flag(CEPH_OSDMAP_NEARFULL
)) {
963 dout(10) << __func__
<< " clearing nearfull flag" << dendl
;
964 remove_flag(CEPH_OSDMAP_NEARFULL
);
969 // min_compat_client?
970 if (tmp
.require_min_compat_client
== 0) {
971 auto mv
= tmp
.get_min_compat_client();
972 dout(1) << __func__
<< " setting require_min_compat_client to currently "
973 << "required " << ceph_release_name(mv
) << dendl
;
974 mon
->clog
->info() << "setting require_min_compat_client to currently "
975 << "required " << ceph_release_name(mv
);
976 pending_inc
.new_require_min_compat_client
= mv
;
979 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
980 // convert ec profile ruleset-* -> crush-*
981 for (auto& p
: tmp
.erasure_code_profiles
) {
982 bool changed
= false;
983 map
<string
,string
> newprofile
;
984 for (auto& q
: p
.second
) {
985 if (q
.first
.find("ruleset-") == 0) {
986 string key
= "crush-";
987 key
+= q
.first
.substr(8);
988 newprofile
[key
] = q
.second
;
990 dout(20) << " updating ec profile " << p
.first
991 << " key " << q
.first
<< " -> " << key
<< dendl
;
993 newprofile
[q
.first
] = q
.second
;
997 dout(10) << " updated ec profile " << p
.first
<< ": "
998 << newprofile
<< dendl
;
999 pending_inc
.new_erasure_code_profiles
[p
.first
] = newprofile
;
1007 for (auto i
= pending_inc
.new_state
.begin();
1008 i
!= pending_inc
.new_state
.end();
1010 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1011 if (s
& CEPH_OSD_UP
)
1012 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1013 if (s
& CEPH_OSD_EXISTS
)
1014 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1016 for (map
<int32_t,entity_addr_t
>::iterator i
= pending_inc
.new_up_client
.begin();
1017 i
!= pending_inc
.new_up_client
.end();
1019 //FIXME: insert cluster addresses too
1020 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1022 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1023 i
!= pending_inc
.new_weight
.end();
1025 if (i
->second
== CEPH_OSD_OUT
) {
1026 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1027 } else if (i
->second
== CEPH_OSD_IN
) {
1028 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1030 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1034 // features for osdmap and its incremental
1035 uint64_t features
= mon
->get_quorum_con_features();
1037 // encode full map and determine its crc
1040 tmp
.deepish_copy_from(osdmap
);
1041 tmp
.apply_incremental(pending_inc
);
1043 // determine appropriate features
1044 if (tmp
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1045 dout(10) << __func__
<< " encoding without feature SERVER_LUMINOUS"
1047 features
&= ~CEPH_FEATURE_SERVER_LUMINOUS
;
1049 if (tmp
.require_osd_release
< CEPH_RELEASE_KRAKEN
) {
1050 dout(10) << __func__
<< " encoding without feature SERVER_KRAKEN | "
1051 << "MSG_ADDR2" << dendl
;
1052 features
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
1053 CEPH_FEATURE_MSG_ADDR2
);
1055 if (tmp
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
1056 dout(10) << __func__
<< " encoding without feature SERVER_JEWEL" << dendl
;
1057 features
&= ~CEPH_FEATURE_SERVER_JEWEL
;
1059 dout(10) << __func__
<< " encoding full map with " << features
<< dendl
;
1062 ::encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1063 pending_inc
.full_crc
= tmp
.get_crc();
1065 // include full map in the txn. note that old monitors will
1066 // overwrite this. new ones will now skip the local full map
1067 // encode and reload from this.
1068 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1072 assert(get_last_committed() + 1 == pending_inc
.epoch
);
1073 ::encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1075 dout(20) << " full_crc " << tmp
.get_crc()
1076 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1078 /* put everything in the transaction */
1079 put_version(t
, pending_inc
.epoch
, bl
);
1080 put_last_committed(t
, pending_inc
.epoch
);
1083 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1084 p
!= pending_metadata
.end();
1086 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1087 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1088 p
!= pending_metadata_rm
.end();
1090 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1091 pending_metadata
.clear();
1092 pending_metadata_rm
.clear();
1094 // and pg creating, also!
1095 if (mon
->monmap
->get_required_features().contains_all(
1096 ceph::features::mon::FEATURE_LUMINOUS
)) {
1097 auto pending_creatings
= update_pending_pgs(pending_inc
);
1098 if (osdmap
.get_epoch() &&
1099 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1100 dout(7) << __func__
<< " in the middle of upgrading, "
1101 << " trimming pending creating_pgs using pgmap" << dendl
;
1102 mon
->pgservice
->maybe_trim_creating_pgs(&pending_creatings
);
1104 bufferlist creatings_bl
;
1105 ::encode(pending_creatings
, creatings_bl
);
1106 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1110 health_check_map_t next
;
1111 tmp
.check_health(&next
);
1112 encode_health(next
, t
);
1115 void OSDMonitor::trim_creating_pgs(creating_pgs_t
* creating_pgs
,
1116 const ceph::unordered_map
<pg_t
,pg_stat_t
>& pg_stat
)
1118 auto p
= creating_pgs
->pgs
.begin();
1119 while (p
!= creating_pgs
->pgs
.end()) {
1120 auto q
= pg_stat
.find(p
->first
);
1121 if (q
!= pg_stat
.end() &&
1122 !(q
->second
.state
& PG_STATE_CREATING
)) {
1123 dout(20) << __func__
<< " pgmap shows " << p
->first
<< " is created"
1125 p
= creating_pgs
->pgs
.erase(p
);
1132 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
1135 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
1139 bufferlist::iterator p
= bl
.begin();
1142 catch (buffer::error
& e
) {
1144 *err
<< "osd." << osd
<< " metadata is corrupt";
1150 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
1152 map
<string
,int> by_val
;
1153 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
1154 if (osdmap
.is_up(osd
)) {
1155 map
<string
,string
> meta
;
1156 load_metadata(osd
, meta
, nullptr);
1157 auto p
= meta
.find(field
);
1158 if (p
== meta
.end()) {
1159 by_val
["unknown"]++;
1161 by_val
[p
->second
]++;
1165 f
->open_object_section(field
.c_str());
1166 for (auto& p
: by_val
) {
1167 f
->dump_int(p
.first
.c_str(), p
.second
);
1172 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
1174 map
<string
, string
> metadata
;
1175 int r
= load_metadata(osd
, metadata
, nullptr);
1179 auto it
= metadata
.find("osd_objectstore");
1180 if (it
== metadata
.end())
1186 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
1187 const pg_pool_t
&pool
,
1190 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1191 // since filestore osds could always join the pool later
1192 set
<int> checked_osds
;
1193 for (unsigned ps
= 0; ps
< MIN(8, pool
.get_pg_num()); ++ps
) {
1194 vector
<int> up
, acting
;
1195 pg_t
pgid(ps
, pool_id
, -1);
1196 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
1197 for (int osd
: up
) {
1198 if (checked_osds
.find(osd
) != checked_osds
.end())
1200 string objectstore_type
;
1201 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
1202 // allow with missing metadata, e.g. due to an osd never booting yet
1203 if (r
< 0 || objectstore_type
== "bluestore") {
1204 checked_osds
.insert(osd
);
1207 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
1214 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
1216 map
<string
,string
> m
;
1217 if (int r
= load_metadata(osd
, m
, err
))
1219 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
1220 f
->dump_string(p
->first
.c_str(), p
->second
);
1224 void OSDMonitor::print_nodes(Formatter
*f
)
1226 // group OSDs by their hosts
1227 map
<string
, list
<int> > osds
; // hostname => osd
1228 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
1229 map
<string
, string
> m
;
1230 if (load_metadata(osd
, m
, NULL
)) {
1233 map
<string
, string
>::iterator hostname
= m
.find("hostname");
1234 if (hostname
== m
.end()) {
1235 // not likely though
1238 osds
[hostname
->second
].push_back(osd
);
1241 dump_services(f
, osds
, "osd");
1244 void OSDMonitor::share_map_with_random_osd()
1246 if (osdmap
.get_num_up_osds() == 0) {
1247 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
1251 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
1253 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
1257 dout(10) << "committed, telling random " << s
->inst
<< " all about it" << dendl
;
1258 // whatev, they'll request more if they need it
1259 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch());
1260 s
->con
->send_message(m
);
1261 // NOTE: do *not* record osd has up to this epoch (as we do
1262 // elsewhere) as they may still need to request older values.
1265 version_t
OSDMonitor::get_trim_to()
1267 if (mon
->get_quorum().empty()) {
1268 dout(10) << __func__
<< ": quorum not formed" << dendl
;
1273 if (mon
->monmap
->get_required_features().contains_all(
1274 ceph::features::mon::FEATURE_LUMINOUS
)) {
1276 // TODO: Get this hidden in PGStatService
1277 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1278 if (!creating_pgs
.pgs
.empty()) {
1282 floor
= get_min_last_epoch_clean();
1284 if (!mon
->pgservice
->is_readable())
1286 if (mon
->pgservice
->have_creating_pgs()) {
1289 floor
= mon
->pgservice
->get_min_last_epoch_clean();
1292 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
1293 if (g_conf
->mon_osd_force_trim_to
> 0 &&
1294 g_conf
->mon_osd_force_trim_to
< (int)get_last_committed()) {
1295 floor
= g_conf
->mon_osd_force_trim_to
;
1296 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
1298 unsigned min
= g_conf
->mon_min_osdmap_epochs
;
1299 if (floor
+ min
> get_last_committed()) {
1300 if (min
< get_last_committed())
1301 floor
= get_last_committed() - min
;
1305 if (floor
> get_first_committed())
1311 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
1313 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
1314 // also scan osd epochs
1315 // don't trim past the oldest reported osd epoch
1316 for (auto& osd_epoch
: osd_epochs
) {
1317 if (osd_epoch
.second
< floor
) {
1318 floor
= osd_epoch
.second
;
1324 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
1327 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
1329 get_version_full(first
, bl
);
1330 put_version_full(tx
, first
, bl
);
1335 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
1337 op
->mark_osdmon_event(__func__
);
1338 Message
*m
= op
->get_req();
1339 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1341 switch (m
->get_type()) {
1343 case MSG_MON_COMMAND
:
1344 return preprocess_command(op
);
1345 case CEPH_MSG_MON_GET_OSDMAP
:
1346 return preprocess_get_osdmap(op
);
1349 case MSG_OSD_MARK_ME_DOWN
:
1350 return preprocess_mark_me_down(op
);
1352 return preprocess_full(op
);
1353 case MSG_OSD_FAILURE
:
1354 return preprocess_failure(op
);
1356 return preprocess_boot(op
);
1358 return preprocess_alive(op
);
1359 case MSG_OSD_PG_CREATED
:
1360 return preprocess_pg_created(op
);
1361 case MSG_OSD_PGTEMP
:
1362 return preprocess_pgtemp(op
);
1363 case MSG_OSD_BEACON
:
1364 return preprocess_beacon(op
);
1366 case CEPH_MSG_POOLOP
:
1367 return preprocess_pool_op(op
);
1369 case MSG_REMOVE_SNAPS
:
1370 return preprocess_remove_snaps(op
);
1378 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
1380 op
->mark_osdmon_event(__func__
);
1381 Message
*m
= op
->get_req();
1382 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1384 switch (m
->get_type()) {
1386 case MSG_OSD_MARK_ME_DOWN
:
1387 return prepare_mark_me_down(op
);
1389 return prepare_full(op
);
1390 case MSG_OSD_FAILURE
:
1391 return prepare_failure(op
);
1393 return prepare_boot(op
);
1395 return prepare_alive(op
);
1396 case MSG_OSD_PG_CREATED
:
1397 return prepare_pg_created(op
);
1398 case MSG_OSD_PGTEMP
:
1399 return prepare_pgtemp(op
);
1400 case MSG_OSD_BEACON
:
1401 return prepare_beacon(op
);
1403 case MSG_MON_COMMAND
:
1404 return prepare_command(op
);
1406 case CEPH_MSG_POOLOP
:
1407 return prepare_pool_op(op
);
1409 case MSG_REMOVE_SNAPS
:
1410 return prepare_remove_snaps(op
);
1420 bool OSDMonitor::should_propose(double& delay
)
1422 dout(10) << "should_propose" << dendl
;
1424 // if full map, propose immediately! any subsequent changes will be clobbered.
1425 if (pending_inc
.fullmap
.length())
1428 // adjust osd weights?
1429 if (!osd_weight
.empty() &&
1430 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
1431 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
1432 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
1438 // propose as fast as possible if updating up_thru or pg_temp
1439 // want to merge OSDMap changes as much as possible
1440 if ((pending_inc
.new_primary_temp
.size() == 1
1441 || pending_inc
.new_up_thru
.size() == 1)
1442 && pending_inc
.new_state
.size() < 2) {
1443 dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl
;
1445 utime_t now
= ceph_clock_now();
1446 if (now
- last_attempted_minwait_time
> g_conf
->paxos_propose_interval
1447 && now
- paxos
->get_last_commit_time() > g_conf
->paxos_min_wait
) {
1448 delay
= g_conf
->paxos_min_wait
;
1449 last_attempted_minwait_time
= now
;
1454 return PaxosService::should_propose(delay
);
1459 // ---------------------------
1462 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
1464 op
->mark_osdmon_event(__func__
);
1465 MMonGetOSDMap
*m
= static_cast<MMonGetOSDMap
*>(op
->get_req());
1466 dout(10) << __func__
<< " " << *m
<< dendl
;
1467 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
);
1468 epoch_t first
= get_first_committed();
1469 epoch_t last
= osdmap
.get_epoch();
1470 int max
= g_conf
->osd_map_message_max
;
1471 for (epoch_t e
= MAX(first
, m
->get_full_first());
1472 e
<= MIN(last
, m
->get_full_last()) && max
> 0;
1474 int r
= get_version_full(e
, reply
->maps
[e
]);
1477 for (epoch_t e
= MAX(first
, m
->get_inc_first());
1478 e
<= MIN(last
, m
->get_inc_last()) && max
> 0;
1480 int r
= get_version(e
, reply
->incremental_maps
[e
]);
1483 reply
->oldest_map
= first
;
1484 reply
->newest_map
= last
;
1485 mon
->send_reply(op
, reply
);
1490 // ---------------------------
1495 bool OSDMonitor::check_source(PaxosServiceMessage
*m
, uuid_d fsid
) {
1496 // check permissions
1497 MonSession
*session
= m
->get_session();
1500 if (!session
->is_capable("osd", MON_CAP_X
)) {
1501 dout(0) << "got MOSDFailure from entity with insufficient caps "
1502 << session
->caps
<< dendl
;
1505 if (fsid
!= mon
->monmap
->fsid
) {
1506 dout(0) << "check_source: on fsid " << fsid
1507 << " != " << mon
->monmap
->fsid
<< dendl
;
1514 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
1516 op
->mark_osdmon_event(__func__
);
1517 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1518 // who is target_osd
1519 int badboy
= m
->get_target().name
.num();
1521 // check permissions
1522 if (check_source(m
, m
->fsid
))
1525 // first, verify the reporting host is valid
1526 if (m
->get_orig_source().is_osd()) {
1527 int from
= m
->get_orig_source().num();
1528 if (!osdmap
.exists(from
) ||
1529 osdmap
.get_addr(from
) != m
->get_orig_source_inst().addr
||
1530 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
1531 dout(5) << "preprocess_failure from dead osd." << from
<< ", ignoring" << dendl
;
1532 send_incremental(op
, m
->get_epoch()+1);
1539 if (osdmap
.is_down(badboy
)) {
1540 dout(5) << "preprocess_failure dne(/dup?): " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1541 if (m
->get_epoch() < osdmap
.get_epoch())
1542 send_incremental(op
, m
->get_epoch()+1);
1545 if (osdmap
.get_inst(badboy
) != m
->get_target()) {
1546 dout(5) << "preprocess_failure wrong osd: report " << m
->get_target() << " != map's " << osdmap
.get_inst(badboy
)
1547 << ", from " << m
->get_orig_source_inst() << dendl
;
1548 if (m
->get_epoch() < osdmap
.get_epoch())
1549 send_incremental(op
, m
->get_epoch()+1);
1553 // already reported?
1554 if (osdmap
.is_down(badboy
) ||
1555 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
1556 dout(5) << "preprocess_failure dup/old: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1557 if (m
->get_epoch() < osdmap
.get_epoch())
1558 send_incremental(op
, m
->get_epoch()+1);
1562 if (!can_mark_down(badboy
)) {
1563 dout(5) << "preprocess_failure ignoring report of " << m
->get_target() << " from " << m
->get_orig_source_inst() << dendl
;
1567 dout(10) << "preprocess_failure new: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1574 class C_AckMarkedDown
: public C_MonOp
{
1580 : C_MonOp(op
), osdmon(osdmon
) {}
1582 void _finish(int) override
{
1583 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1584 osdmon
->mon
->send_reply(
1590 false)); // ACK itself does not request an ack
1592 ~C_AckMarkedDown() override
{
1596 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
1598 op
->mark_osdmon_event(__func__
);
1599 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1600 int requesting_down
= m
->get_target().name
.num();
1601 int from
= m
->get_orig_source().num();
1603 // check permissions
1604 if (check_source(m
, m
->fsid
))
1607 // first, verify the reporting host is valid
1608 if (!m
->get_orig_source().is_osd())
1611 if (!osdmap
.exists(from
) ||
1612 osdmap
.is_down(from
) ||
1613 osdmap
.get_addr(from
) != m
->get_target().addr
) {
1614 dout(5) << "preprocess_mark_me_down from dead osd."
1615 << from
<< ", ignoring" << dendl
;
1616 send_incremental(op
, m
->get_epoch()+1);
1620 // no down might be set
1621 if (!can_mark_down(requesting_down
))
1624 dout(10) << "MOSDMarkMeDown for: " << m
->get_target() << dendl
;
1628 if (m
->request_ack
) {
1629 Context
*c(new C_AckMarkedDown(this, op
));
1635 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
1637 op
->mark_osdmon_event(__func__
);
1638 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1639 int target_osd
= m
->get_target().name
.num();
1641 assert(osdmap
.is_up(target_osd
));
1642 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
1644 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
1645 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1647 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
1651 bool OSDMonitor::can_mark_down(int i
)
1653 if (osdmap
.test_flag(CEPH_OSDMAP_NODOWN
)) {
1654 dout(5) << __func__
<< " NODOWN flag set, will not mark osd." << i
1655 << " down" << dendl
;
1659 if (osdmap
.is_nodown(i
)) {
1660 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
1661 << "will not mark it down" << dendl
;
1665 int num_osds
= osdmap
.get_num_osds();
1666 if (num_osds
== 0) {
1667 dout(5) << __func__
<< " no osds" << dendl
;
1670 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
1671 float up_ratio
= (float)up
/ (float)num_osds
;
1672 if (up_ratio
< g_conf
->mon_osd_min_up_ratio
) {
1673 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
1674 << g_conf
->mon_osd_min_up_ratio
1675 << ", will not mark osd." << i
<< " down" << dendl
;
1681 bool OSDMonitor::can_mark_up(int i
)
1683 if (osdmap
.test_flag(CEPH_OSDMAP_NOUP
)) {
1684 dout(5) << __func__
<< " NOUP flag set, will not mark osd." << i
1689 if (osdmap
.is_noup(i
)) {
1690 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
1691 << "will not mark it up" << dendl
;
1699 * @note the parameter @p i apparently only exists here so we can output the
1700 * osd's id on messages.
1702 bool OSDMonitor::can_mark_out(int i
)
1704 if (osdmap
.test_flag(CEPH_OSDMAP_NOOUT
)) {
1705 dout(5) << __func__
<< " NOOUT flag set, will not mark osds out" << dendl
;
1709 if (osdmap
.is_noout(i
)) {
1710 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
1711 << "will not mark it out" << dendl
;
1715 int num_osds
= osdmap
.get_num_osds();
1716 if (num_osds
== 0) {
1717 dout(5) << __func__
<< " no osds" << dendl
;
1720 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
1721 float in_ratio
= (float)in
/ (float)num_osds
;
1722 if (in_ratio
< g_conf
->mon_osd_min_in_ratio
) {
1724 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
1725 << g_conf
->mon_osd_min_in_ratio
1726 << ", will not mark osd." << i
<< " out" << dendl
;
1728 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
1729 << g_conf
->mon_osd_min_in_ratio
1730 << ", will not mark osds out" << dendl
;
1737 bool OSDMonitor::can_mark_in(int i
)
1739 if (osdmap
.test_flag(CEPH_OSDMAP_NOIN
)) {
1740 dout(5) << __func__
<< " NOIN flag set, will not mark osd." << i
1745 if (osdmap
.is_noin(i
)) {
1746 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
1747 << "will not mark it in" << dendl
;
1754 bool OSDMonitor::check_failures(utime_t now
)
1756 bool found_failure
= false;
1757 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
1758 p
!= failure_info
.end();
1760 if (can_mark_down(p
->first
)) {
1761 found_failure
|= check_failure(now
, p
->first
, p
->second
);
1764 return found_failure
;
1767 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
1769 // already pending failure?
1770 if (pending_inc
.new_state
.count(target_osd
) &&
1771 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
1772 dout(10) << " already pending failure" << dendl
;
1776 set
<string
> reporters_by_subtree
;
1777 string reporter_subtree_level
= g_conf
->mon_osd_reporter_subtree_level
;
1778 utime_t
orig_grace(g_conf
->osd_heartbeat_grace
, 0);
1779 utime_t max_failed_since
= fi
.get_failed_since();
1780 utime_t failed_for
= now
- max_failed_since
;
1782 utime_t grace
= orig_grace
;
1783 double my_grace
= 0, peer_grace
= 0;
1785 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1786 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
1787 decay_k
= ::log(.5) / halflife
;
1789 // scale grace period based on historical probability of 'lagginess'
1790 // (false positive failures due to slowness).
1791 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
1792 double decay
= exp((double)failed_for
* decay_k
);
1793 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
1794 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
1795 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
1799 // consider the peers reporting a failure a proxy for a potential
1800 // 'subcluster' over the overall cluster that is similarly
1801 // laggy. this is clearly not true in all cases, but will sometimes
1802 // help us localize the grace correction to a subset of the system
1803 // (say, a rack with a bad switch) that is unhappy.
1804 assert(fi
.reporters
.size());
1805 for (map
<int,failure_reporter_t
>::iterator p
= fi
.reporters
.begin();
1806 p
!= fi
.reporters
.end();
1808 // get the parent bucket whose type matches with "reporter_subtree_level".
1809 // fall back to OSD if the level doesn't exist.
1810 map
<string
, string
> reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
1811 map
<string
, string
>::iterator iter
= reporter_loc
.find(reporter_subtree_level
);
1812 if (iter
== reporter_loc
.end()) {
1813 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
1815 reporters_by_subtree
.insert(iter
->second
);
1817 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1818 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
1819 utime_t elapsed
= now
- xi
.down_stamp
;
1820 double decay
= exp((double)elapsed
* decay_k
);
1821 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
1825 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1826 peer_grace
/= (double)fi
.reporters
.size();
1827 grace
+= peer_grace
;
1830 dout(10) << " osd." << target_osd
<< " has "
1831 << fi
.reporters
.size() << " reporters, "
1832 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
1833 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
1836 if (failed_for
>= grace
&&
1837 (int)reporters_by_subtree
.size() >= g_conf
->mon_osd_min_down_reporters
) {
1838 dout(1) << " we have enough reporters to mark osd." << target_osd
1839 << " down" << dendl
;
1840 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1842 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
1843 << osdmap
.crush
->get_full_location_ordered_string(
1846 << (int)reporters_by_subtree
.size()
1847 << " reporters from different "
1848 << reporter_subtree_level
<< " after "
1849 << failed_for
<< " >= grace " << grace
<< ")";
1855 void OSDMonitor::force_failure(int target_osd
, int by
)
1857 // already pending failure?
1858 if (pending_inc
.new_state
.count(target_osd
) &&
1859 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
1860 dout(10) << " already pending failure" << dendl
;
1864 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
1865 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1867 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
1868 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
1869 << ") (connection refused reported by osd." << by
<< ")";
1873 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
1875 op
->mark_osdmon_event(__func__
);
1876 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1877 dout(1) << "prepare_failure " << m
->get_target()
1878 << " from " << m
->get_orig_source_inst()
1879 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
1881 int target_osd
= m
->get_target().name
.num();
1882 int reporter
= m
->get_orig_source().num();
1883 assert(osdmap
.is_up(target_osd
));
1884 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
1886 if (m
->if_osd_failed()) {
1887 // calculate failure time
1888 utime_t now
= ceph_clock_now();
1889 utime_t failed_since
=
1890 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
1893 if (m
->is_immediate()) {
1894 mon
->clog
->debug() << m
->get_target() << " reported immediately failed by "
1895 << m
->get_orig_source_inst();
1896 force_failure(target_osd
, reporter
);
1899 mon
->clog
->debug() << m
->get_target() << " reported failed by "
1900 << m
->get_orig_source_inst();
1902 failure_info_t
& fi
= failure_info
[target_osd
];
1903 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
1905 mon
->no_reply(old_op
);
1908 return check_failure(now
, target_osd
, fi
);
1910 // remove the report
1911 mon
->clog
->debug() << m
->get_target() << " failure report canceled by "
1912 << m
->get_orig_source_inst();
1913 if (failure_info
.count(target_osd
)) {
1914 failure_info_t
& fi
= failure_info
[target_osd
];
1915 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
1917 mon
->no_reply(report_op
);
1919 if (fi
.reporters
.empty()) {
1920 dout(10) << " removing last failure_info for osd." << target_osd
1922 failure_info
.erase(target_osd
);
1924 dout(10) << " failure_info for osd." << target_osd
<< " now "
1925 << fi
.reporters
.size() << " reporters" << dendl
;
1928 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
1936 void OSDMonitor::process_failures()
1938 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
1939 while (p
!= failure_info
.end()) {
1940 if (osdmap
.is_up(p
->first
)) {
1943 dout(10) << "process_failures osd." << p
->first
<< dendl
;
1944 list
<MonOpRequestRef
> ls
;
1945 p
->second
.take_report_messages(ls
);
1946 failure_info
.erase(p
++);
1948 while (!ls
.empty()) {
1949 MonOpRequestRef o
= ls
.front();
1951 o
->mark_event(__func__
);
1952 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
1953 send_latest(o
, m
->get_epoch());
1961 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
1963 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
1965 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
1966 p
!= failure_info
.end();
1968 p
->second
.take_report_messages(ls
);
1970 failure_info
.clear();
1976 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
1978 op
->mark_osdmon_event(__func__
);
1979 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
1980 int from
= m
->get_orig_source_inst().name
.num();
1982 // check permissions, ignore if failed (no response expected)
1983 MonSession
*session
= m
->get_session();
1986 if (!session
->is_capable("osd", MON_CAP_X
)) {
1987 dout(0) << "got preprocess_boot message from entity with insufficient caps"
1988 << session
->caps
<< dendl
;
1992 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
1993 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
1994 << " != " << mon
->monmap
->fsid
<< dendl
;
1998 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
1999 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
2003 assert(m
->get_orig_source_inst().name
.is_osd());
2005 // check if osd has required features to boot
2006 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2007 CEPH_FEATURE_OSD_ERASURE_CODES
) &&
2008 !(m
->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES
)) {
2009 dout(0) << __func__
<< " osdmap requires erasure code but osd at "
2010 << m
->get_orig_source_inst()
2011 << " doesn't announce support -- ignore" << dendl
;
2015 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2016 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
) &&
2017 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
)) {
2018 dout(0) << __func__
<< " osdmap requires erasure code plugins v2 but osd at "
2019 << m
->get_orig_source_inst()
2020 << " doesn't announce support -- ignore" << dendl
;
2024 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2025 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
) &&
2026 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
)) {
2027 dout(0) << __func__
<< " osdmap requires erasure code plugins v3 but osd at "
2028 << m
->get_orig_source_inst()
2029 << " doesn't announce support -- ignore" << dendl
;
2033 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
2034 !HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
2035 mon
->clog
->info() << "disallowing boot of OSD "
2036 << m
->get_orig_source_inst()
2037 << " because the osdmap requires"
2038 << " CEPH_FEATURE_SERVER_LUMINOUS"
2039 << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
2043 if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
&&
2044 !(m
->osd_features
& CEPH_FEATURE_SERVER_JEWEL
)) {
2045 mon
->clog
->info() << "disallowing boot of OSD "
2046 << m
->get_orig_source_inst()
2047 << " because the osdmap requires"
2048 << " CEPH_FEATURE_SERVER_JEWEL"
2049 << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
2053 if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
&&
2054 !HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
2055 mon
->clog
->info() << "disallowing boot of OSD "
2056 << m
->get_orig_source_inst()
2057 << " because the osdmap requires"
2058 << " CEPH_FEATURE_SERVER_KRAKEN"
2059 << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
2063 if (osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
2064 !(m
->osd_features
& CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
2065 mon
->clog
->info() << "disallowing boot of OSD "
2066 << m
->get_orig_source_inst()
2067 << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
2071 if (any_of(osdmap
.get_pools().begin(),
2072 osdmap
.get_pools().end(),
2073 [](const std::pair
<int64_t,pg_pool_t
>& pool
)
2074 { return pool
.second
.use_gmt_hitset
; })) {
2075 assert(osdmap
.get_num_up_osds() == 0 ||
2076 osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
);
2077 if (!(m
->osd_features
& CEPH_FEATURE_OSD_HITSET_GMT
)) {
2078 dout(0) << __func__
<< " one or more pools uses GMT hitsets but osd at "
2079 << m
->get_orig_source_inst()
2080 << " doesn't announce support -- ignore" << dendl
;
2085 // make sure upgrades stop at luminous
2086 if (HAVE_FEATURE(m
->osd_features
, SERVER_M
) &&
2087 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
2088 mon
->clog
->info() << "disallowing boot of post-luminous OSD "
2089 << m
->get_orig_source_inst()
2090 << " because require_osd_release < luminous";
2094 // make sure upgrades stop at jewel
2095 if (HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
) &&
2096 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
2097 mon
->clog
->info() << "disallowing boot of post-jewel OSD "
2098 << m
->get_orig_source_inst()
2099 << " because require_osd_release < jewel";
2103 // make sure upgrades stop at hammer
2104 // * HAMMER_0_94_4 is the required hammer feature
2105 // * MON_METADATA is the first post-hammer feature
2106 if (osdmap
.get_num_up_osds() > 0) {
2107 if ((m
->osd_features
& CEPH_FEATURE_MON_METADATA
) &&
2108 !(osdmap
.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4
)) {
2109 mon
->clog
->info() << "disallowing boot of post-hammer OSD "
2110 << m
->get_orig_source_inst()
2111 << " because one or more up OSDs is pre-hammer v0.94.4";
2114 if (!(m
->osd_features
& CEPH_FEATURE_HAMMER_0_94_4
) &&
2115 (osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_METADATA
)) {
2116 mon
->clog
->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
2117 << m
->get_orig_source_inst()
2118 << " because all up OSDs are post-hammer";
2124 if (osdmap
.is_up(from
) &&
2125 osdmap
.get_inst(from
) == m
->get_orig_source_inst() &&
2126 osdmap
.get_cluster_addr(from
) == m
->cluster_addr
) {
2128 dout(7) << "preprocess_boot dup from " << m
->get_orig_source_inst()
2129 << " == " << osdmap
.get_inst(from
) << dendl
;
2134 if (osdmap
.exists(from
) &&
2135 !osdmap
.get_uuid(from
).is_zero() &&
2136 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2137 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
2138 << " clashes with existing osd: different fsid"
2139 << " (ours: " << osdmap
.get_uuid(from
)
2140 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
2144 if (osdmap
.exists(from
) &&
2145 osdmap
.get_info(from
).up_from
> m
->version
&&
2146 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) {
2147 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
2148 send_latest(op
, m
->sb
.current_epoch
+1);
2153 if (!can_mark_up(from
)) {
2154 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
2155 send_latest(op
, m
->sb
.current_epoch
+1);
2159 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
2166 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
2168 op
->mark_osdmon_event(__func__
);
2169 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2170 dout(7) << __func__
<< " from " << m
->get_orig_source_inst() << " sb " << m
->sb
2171 << " cluster_addr " << m
->cluster_addr
2172 << " hb_back_addr " << m
->hb_back_addr
2173 << " hb_front_addr " << m
->hb_front_addr
2176 assert(m
->get_orig_source().is_osd());
2177 int from
= m
->get_orig_source().num();
2179 // does this osd exist?
2180 if (from
>= osdmap
.get_max_osd()) {
2181 dout(1) << "boot from osd." << from
<< " >= max_osd "
2182 << osdmap
.get_max_osd() << dendl
;
2186 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
2187 if (pending_inc
.new_state
.count(from
))
2188 oldstate
^= pending_inc
.new_state
[from
];
2190 // already up? mark down first?
2191 if (osdmap
.is_up(from
)) {
2192 dout(7) << __func__
<< " was up, first marking down "
2193 << osdmap
.get_inst(from
) << dendl
;
2194 // preprocess should have caught these; if not, assert.
2195 assert(osdmap
.get_inst(from
) != m
->get_orig_source_inst() ||
2196 osdmap
.get_cluster_addr(from
) != m
->cluster_addr
);
2197 assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
2199 if (pending_inc
.new_state
.count(from
) == 0 ||
2200 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
2201 // mark previous guy down
2202 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
2204 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2205 } else if (pending_inc
.new_up_client
.count(from
)) {
2206 // already prepared, just wait
2207 dout(7) << __func__
<< " already prepared, waiting on "
2208 << m
->get_orig_source_addr() << dendl
;
2209 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2212 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addr();
2213 if (!m
->cluster_addr
.is_blank_ip())
2214 pending_inc
.new_up_cluster
[from
] = m
->cluster_addr
;
2215 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addr
;
2216 if (!m
->hb_front_addr
.is_blank_ip())
2217 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addr
;
2219 down_pending_out
.erase(from
); // if any
2222 osd_weight
[from
] = m
->sb
.weight
;
2225 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
2227 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2228 // preprocess should have caught this; if not, assert.
2229 assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
2230 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
2234 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
2235 const osd_info_t
& i
= osdmap
.get_info(from
);
2236 if (i
.up_from
> i
.lost_at
) {
2237 dout(10) << " fresh osd; marking lost_at too" << dendl
;
2238 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
2243 bufferlist osd_metadata
;
2244 ::encode(m
->metadata
, osd_metadata
);
2245 pending_metadata
[from
] = osd_metadata
;
2246 pending_metadata_rm
.erase(from
);
2248 // adjust last clean unmount epoch?
2249 const osd_info_t
& info
= osdmap
.get_info(from
);
2250 dout(10) << " old osd_info: " << info
<< dendl
;
2251 if (m
->sb
.mounted
> info
.last_clean_begin
||
2252 (m
->sb
.mounted
== info
.last_clean_begin
&&
2253 m
->sb
.clean_thru
> info
.last_clean_end
)) {
2254 epoch_t begin
= m
->sb
.mounted
;
2255 epoch_t end
= m
->sb
.clean_thru
;
2257 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
2258 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
2259 << ") -> [" << begin
<< "-" << end
<< ")"
2261 pending_inc
.new_last_clean_interval
[from
] =
2262 pair
<epoch_t
,epoch_t
>(begin
, end
);
2265 osd_xinfo_t xi
= osdmap
.get_xinfo(from
);
2266 if (m
->boot_epoch
== 0) {
2267 xi
.laggy_probability
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2268 xi
.laggy_interval
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2269 dout(10) << " not laggy, new xi " << xi
<< dendl
;
2271 if (xi
.down_stamp
.sec()) {
2272 int interval
= ceph_clock_now().sec() -
2273 xi
.down_stamp
.sec();
2274 if (g_conf
->mon_osd_laggy_max_interval
&&
2275 (interval
> g_conf
->mon_osd_laggy_max_interval
)) {
2276 interval
= g_conf
->mon_osd_laggy_max_interval
;
2279 interval
* g_conf
->mon_osd_laggy_weight
+
2280 xi
.laggy_interval
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2282 xi
.laggy_probability
=
2283 g_conf
->mon_osd_laggy_weight
+
2284 xi
.laggy_probability
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2285 dout(10) << " laggy, now xi " << xi
<< dendl
;
2288 // set features shared by the osd
2289 if (m
->osd_features
)
2290 xi
.features
= m
->osd_features
;
2292 xi
.features
= m
->get_connection()->get_features();
2295 if ((g_conf
->mon_osd_auto_mark_auto_out_in
&&
2296 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
2297 (g_conf
->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
2298 (g_conf
->mon_osd_auto_mark_in
)) {
2299 if (can_mark_in(from
)) {
2300 if (osdmap
.osd_xinfo
[from
].old_weight
> 0) {
2301 pending_inc
.new_weight
[from
] = osdmap
.osd_xinfo
[from
].old_weight
;
2304 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
2307 dout(7) << __func__
<< " NOIN set, will not mark in "
2308 << m
->get_orig_source_addr() << dendl
;
2312 pending_inc
.new_xinfo
[from
] = xi
;
2315 wait_for_finished_proposal(op
, new C_Booted(this, op
));
2320 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
2322 op
->mark_osdmon_event(__func__
);
2323 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2324 dout(7) << "_booted " << m
->get_orig_source_inst()
2325 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
2328 mon
->clog
->info() << m
->get_orig_source_inst() << " boot";
2331 send_latest(op
, m
->sb
.current_epoch
+1);
2338 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
2340 op
->mark_osdmon_event(__func__
);
2341 MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2342 int from
= m
->get_orig_source().num();
2344 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2346 // check permissions, ignore if failed
2347 MonSession
*session
= m
->get_session();
2350 if (!session
->is_capable("osd", MON_CAP_X
)) {
2351 dout(0) << "MOSDFull from entity with insufficient privileges:"
2352 << session
->caps
<< dendl
;
2356 // ignore a full message from the osd instance that already went down
2357 if (!osdmap
.exists(from
)) {
2358 dout(7) << __func__
<< " ignoring full message from nonexistent "
2359 << m
->get_orig_source_inst() << dendl
;
2362 if ((!osdmap
.is_up(from
) &&
2363 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) ||
2364 (osdmap
.is_up(from
) &&
2365 osdmap
.get_inst(from
) != m
->get_orig_source_inst())) {
2366 dout(7) << __func__
<< " ignoring full message from down "
2367 << m
->get_orig_source_inst() << dendl
;
2371 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
2373 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
2374 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
2375 << " " << m
->get_orig_source_inst() << dendl
;
2376 _reply_map(op
, m
->version
);
2380 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
2381 << " " << m
->get_orig_source_inst() << dendl
;
2388 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
2390 op
->mark_osdmon_event(__func__
);
2391 const MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2392 const int from
= m
->get_orig_source().num();
2394 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2395 const unsigned want_state
= m
->state
& mask
; // safety first
2397 unsigned cur_state
= osdmap
.get_state(from
);
2398 auto p
= pending_inc
.new_state
.find(from
);
2399 if (p
!= pending_inc
.new_state
.end()) {
2400 cur_state
^= p
->second
;
2404 set
<string
> want_state_set
, cur_state_set
;
2405 OSDMap::calc_state_set(want_state
, want_state_set
);
2406 OSDMap::calc_state_set(cur_state
, cur_state_set
);
2408 if (cur_state
!= want_state
) {
2409 if (p
!= pending_inc
.new_state
.end()) {
2412 pending_inc
.new_state
[from
] = 0;
2414 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
2415 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2416 << " -> " << want_state_set
<< dendl
;
2418 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2419 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
2422 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2429 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
2431 op
->mark_osdmon_event(__func__
);
2432 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2433 int from
= m
->get_orig_source().num();
2435 // check permissions, ignore if failed
2436 MonSession
*session
= m
->get_session();
2439 if (!session
->is_capable("osd", MON_CAP_X
)) {
2440 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2441 << session
->caps
<< dendl
;
2445 if (!osdmap
.is_up(from
) ||
2446 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2447 dout(7) << "preprocess_alive ignoring alive message from down " << m
->get_orig_source_inst() << dendl
;
2451 if (osdmap
.get_up_thru(from
) >= m
->want
) {
2453 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
2454 _reply_map(op
, m
->version
);
2458 dout(10) << "preprocess_alive want up_thru " << m
->want
2459 << " from " << m
->get_orig_source_inst() << dendl
;
2466 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
2468 op
->mark_osdmon_event(__func__
);
2469 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2470 int from
= m
->get_orig_source().num();
2472 if (0) { // we probably don't care much about these
2473 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
2476 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
2477 << " from " << m
->get_orig_source_inst() << dendl
;
2479 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
2480 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2484 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
2486 op
->mark_osdmon_event(__func__
);
2487 dout(7) << "_reply_map " << e
2488 << " from " << op
->get_req()->get_orig_source_inst()
2494 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
2496 op
->mark_osdmon_event(__func__
);
2497 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2498 dout(10) << __func__
<< " " << *m
<< dendl
;
2499 auto session
= m
->get_session();
2501 dout(10) << __func__
<< ": no monitor session!" << dendl
;
2504 if (!session
->is_capable("osd", MON_CAP_X
)) {
2505 derr
<< __func__
<< " received from entity "
2506 << "with insufficient privileges " << session
->caps
<< dendl
;
2509 // always forward the "created!" to the leader
2513 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
2515 op
->mark_osdmon_event(__func__
);
2516 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2517 dout(10) << __func__
<< " " << *m
<< dendl
;
2518 auto src
= m
->get_orig_source();
2519 auto from
= src
.num();
2520 if (!src
.is_osd() ||
2521 !mon
->osdmon()->osdmap
.is_up(from
) ||
2522 m
->get_orig_source_inst() != mon
->osdmon()->osdmap
.get_inst(from
)) {
2523 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
2526 pending_created_pgs
.push_back(m
->pgid
);
2533 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
2535 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2536 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
2537 mempool::osdmap::vector
<int> empty
;
2538 int from
= m
->get_orig_source().num();
2539 size_t ignore_cnt
= 0;
2542 MonSession
*session
= m
->get_session();
2545 if (!session
->is_capable("osd", MON_CAP_X
)) {
2546 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
2547 << session
->caps
<< dendl
;
2551 if (!osdmap
.is_up(from
) ||
2552 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2553 dout(7) << "ignoring pgtemp message from down " << m
->get_orig_source_inst() << dendl
;
2557 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2558 dout(20) << " " << p
->first
2559 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
2560 << " -> " << p
->second
<< dendl
;
2562 // does the pool exist?
2563 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
2565 * 1. If the osdmap does not have the pool, it means the pool has been
2566 * removed in-between the osd sending this message and us handling it.
2567 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
2568 * not exist in the pending either, as the osds would not send a
2569 * message about a pool they know nothing about (yet).
2570 * 3. However, if the pool does exist in the pending, then it must be a
2571 * new pool, and not relevant to this message (see 1).
2573 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2574 << ": pool has been removed" << dendl
;
2579 int acting_primary
= -1;
2580 osdmap
.pg_to_up_acting_osds(
2581 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
2582 if (acting_primary
!= from
) {
2583 /* If the source isn't the primary based on the current osdmap, we know
2584 * that the interval changed and that we can discard this message.
2585 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
2586 * which of two pg temp mappings on the same pg is more recent.
2588 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2589 << ": primary has changed" << dendl
;
2595 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
2596 osdmap
.primary_temp
->count(p
->first
)))
2599 // NOTE: we assume that this will clear pg_primary, so consider
2600 // an existing pg_primary field to imply a change
2601 if (p
->second
.size() &&
2602 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
2603 !vectors_equal(osdmap
.pg_temp
->get(p
->first
), p
->second
) ||
2604 osdmap
.primary_temp
->count(p
->first
)))
2608 // should we ignore all the pgs?
2609 if (ignore_cnt
== m
->pg_temp
.size())
2612 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
2613 _reply_map(op
, m
->map_epoch
);
2620 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
2622 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
2623 auto ut
= pending_inc
.new_up_thru
.find(from
);
2624 if (ut
!= pending_inc
.new_up_thru
.end()) {
2625 old_up_thru
= ut
->second
;
2627 if (up_thru
> old_up_thru
) {
2628 // set up_thru too, so the osd doesn't have to ask again
2629 pending_inc
.new_up_thru
[from
] = up_thru
;
2633 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
2635 op
->mark_osdmon_event(__func__
);
2636 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2637 int from
= m
->get_orig_source().num();
2638 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
2639 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2640 uint64_t pool
= p
->first
.pool();
2641 if (pending_inc
.old_pools
.count(pool
)) {
2642 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2643 << ": pool pending removal" << dendl
;
2646 if (!osdmap
.have_pg_pool(pool
)) {
2647 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2648 << ": pool has been removed" << dendl
;
2651 pending_inc
.new_pg_temp
[p
->first
] =
2652 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
2654 // unconditionally clear pg_primary (until this message can encode
2655 // a change for that, too.. at which point we need to also fix
2656 // preprocess_pg_temp)
2657 if (osdmap
.primary_temp
->count(p
->first
) ||
2658 pending_inc
.new_primary_temp
.count(p
->first
))
2659 pending_inc
.new_primary_temp
[p
->first
] = -1;
2662 // set up_thru too, so the osd doesn't have to ask again
2663 update_up_thru(from
, m
->map_epoch
);
2665 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
2672 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
2674 op
->mark_osdmon_event(__func__
);
2675 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
2676 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
2678 // check privilege, ignore if failed
2679 MonSession
*session
= m
->get_session();
2682 if (!session
->caps
.is_capable(
2684 CEPH_ENTITY_TYPE_MON
,
2685 session
->entity_name
,
2686 "osd", "osd pool rmsnap", {}, true, true, false)) {
2687 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
2688 << session
->caps
<< dendl
;
2692 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
2693 q
!= m
->snaps
.end();
2695 if (!osdmap
.have_pg_pool(q
->first
)) {
2696 dout(10) << " ignoring removed_snaps " << q
->second
<< " on non-existent pool " << q
->first
<< dendl
;
2699 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
2700 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
2701 p
!= q
->second
.end();
2703 if (*p
> pi
->get_snap_seq() ||
2704 !pi
->removed_snaps
.contains(*p
))
2713 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
2715 op
->mark_osdmon_event(__func__
);
2716 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
2717 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
2719 for (map
<int, vector
<snapid_t
> >::iterator p
= m
->snaps
.begin();
2720 p
!= m
->snaps
.end();
2723 if (!osdmap
.have_pg_pool(p
->first
)) {
2724 dout(10) << " ignoring removed_snaps " << p
->second
<< " on non-existent pool " << p
->first
<< dendl
;
2728 pg_pool_t
& pi
= osdmap
.pools
[p
->first
];
2729 for (vector
<snapid_t
>::iterator q
= p
->second
.begin();
2730 q
!= p
->second
.end();
2732 if (!pi
.removed_snaps
.contains(*q
) &&
2733 (!pending_inc
.new_pools
.count(p
->first
) ||
2734 !pending_inc
.new_pools
[p
->first
].removed_snaps
.contains(*q
))) {
2735 pg_pool_t
*newpi
= pending_inc
.get_new_pool(p
->first
, &pi
);
2736 newpi
->removed_snaps
.insert(*q
);
2737 dout(10) << " pool " << p
->first
<< " removed_snaps added " << *q
2738 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
2739 if (*q
> newpi
->get_snap_seq()) {
2740 dout(10) << " pool " << p
->first
<< " snap_seq " << newpi
->get_snap_seq() << " -> " << *q
<< dendl
;
2741 newpi
->set_snap_seq(*q
);
2743 newpi
->set_snap_epoch(pending_inc
.epoch
);
2751 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
2753 op
->mark_osdmon_event(__func__
);
2754 auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
2756 auto session
= beacon
->get_session();
2758 dout(10) << __func__
<< " no monitor session!" << dendl
;
2761 if (!session
->is_capable("osd", MON_CAP_X
)) {
2762 derr
<< __func__
<< " received from entity "
2763 << "with insufficient privileges " << session
->caps
<< dendl
;
2766 // Always forward the beacon to the leader, even if they are the same as
2767 // the old one. The leader will mark as down osds that haven't sent
2768 // beacon for a few minutes.
2772 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
2774 op
->mark_osdmon_event(__func__
);
2775 const auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
2776 const auto src
= beacon
->get_orig_source();
2777 dout(10) << __func__
<< " " << *beacon
2778 << " from " << src
<< dendl
;
2779 int from
= src
.num();
2781 if (!src
.is_osd() ||
2782 !osdmap
.is_up(from
) ||
2783 beacon
->get_orig_source_inst() != osdmap
.get_inst(from
)) {
2784 dout(1) << " ignoring beacon from non-active osd." << dendl
;
2788 last_osd_report
[from
] = ceph_clock_now();
2789 osd_epochs
[from
] = beacon
->version
;
2791 for (const auto& pg
: beacon
->pgs
) {
2792 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
2800 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
2802 op
->mark_osdmon_event(__func__
);
2803 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
2804 << " start " << start
<< dendl
;
2808 send_incremental(op
, start
);
2812 MOSDMap
*OSDMonitor::build_latest_full()
2814 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
);
2815 get_version_full(osdmap
.get_epoch(), r
->maps
[osdmap
.get_epoch()]);
2816 r
->oldest_map
= get_first_committed();
2817 r
->newest_map
= osdmap
.get_epoch();
2821 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
)
2823 dout(10) << "build_incremental [" << from
<< ".." << to
<< "]" << dendl
;
2824 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
);
2825 m
->oldest_map
= get_first_committed();
2826 m
->newest_map
= osdmap
.get_epoch();
2828 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
2830 int err
= get_version(e
, bl
);
2832 assert(bl
.length());
2833 // if (get_version(e, bl) > 0) {
2834 dout(20) << "build_incremental inc " << e
<< " "
2835 << bl
.length() << " bytes" << dendl
;
2836 m
->incremental_maps
[e
] = bl
;
2838 assert(err
== -ENOENT
);
2839 assert(!bl
.length());
2840 get_version_full(e
, bl
);
2841 if (bl
.length() > 0) {
2842 //else if (get_version("full", e, bl) > 0) {
2843 dout(20) << "build_incremental full " << e
<< " "
2844 << bl
.length() << " bytes" << dendl
;
2847 ceph_abort(); // we should have all maps.
2854 void OSDMonitor::send_full(MonOpRequestRef op
)
2856 op
->mark_osdmon_event(__func__
);
2857 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
2858 mon
->send_reply(op
, build_latest_full());
2861 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
2863 op
->mark_osdmon_event(__func__
);
2865 MonSession
*s
= op
->get_session();
2869 s
->proxy_con
->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP
)) {
2870 // oh, we can tell the other mon to do it
2871 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
2873 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
2874 r
->send_osdmap_first
= first
;
2875 s
->proxy_con
->send_message(r
);
2876 op
->mark_event("reply: send routed send_osdmap_first reply");
2879 send_incremental(first
, s
, false, op
);
2883 void OSDMonitor::send_incremental(epoch_t first
,
2884 MonSession
*session
,
2886 MonOpRequestRef req
)
2888 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
2889 << " to " << session
->inst
<< dendl
;
2891 if (first
<= session
->osd_epoch
) {
2892 dout(10) << __func__
<< " " << session
->inst
<< " should already have epoch "
2893 << session
->osd_epoch
<< dendl
;
2894 first
= session
->osd_epoch
+ 1;
2897 if (first
< get_first_committed()) {
2898 first
= get_first_committed();
2900 int err
= get_version_full(first
, bl
);
2902 assert(bl
.length());
2904 dout(20) << "send_incremental starting with base full "
2905 << first
<< " " << bl
.length() << " bytes" << dendl
;
2907 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid());
2908 m
->oldest_map
= get_first_committed();
2909 m
->newest_map
= osdmap
.get_epoch();
2910 m
->maps
[first
] = bl
;
2913 mon
->send_reply(req
, m
);
2914 session
->osd_epoch
= first
;
2917 session
->con
->send_message(m
);
2918 session
->osd_epoch
= first
;
2923 while (first
<= osdmap
.get_epoch()) {
2924 epoch_t last
= MIN(first
+ g_conf
->osd_map_message_max
- 1,
2925 osdmap
.get_epoch());
2926 MOSDMap
*m
= build_incremental(first
, last
);
2929 // send some maps. it may not be all of them, but it will get them
2931 mon
->send_reply(req
, m
);
2933 session
->con
->send_message(m
);
2936 session
->osd_epoch
= last
;
2942 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
2944 if (inc_osd_cache
.lookup(ver
, &bl
)) {
2947 int ret
= PaxosService::get_version(ver
, bl
);
2949 inc_osd_cache
.add(ver
, bl
);
2954 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
2956 if (full_osd_cache
.lookup(ver
, &bl
)) {
2959 int ret
= PaxosService::get_version_full(ver
, bl
);
2961 full_osd_cache
.add(ver
, bl
);
2966 epoch_t
OSDMonitor::blacklist(const entity_addr_t
& a
, utime_t until
)
2968 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
2969 pending_inc
.new_blacklist
[a
] = until
;
2970 return pending_inc
.epoch
;
2974 void OSDMonitor::check_osdmap_subs()
2976 dout(10) << __func__
<< dendl
;
2977 if (!osdmap
.get_epoch()) {
2980 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
2981 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
2984 auto p
= osdmap_subs
->second
->begin();
2988 check_osdmap_sub(sub
);
2992 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
2994 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
2995 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
2996 if (sub
->next
<= osdmap
.get_epoch()) {
2998 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
3000 sub
->session
->con
->send_message(build_latest_full());
3002 mon
->session_map
.remove_sub(sub
);
3004 sub
->next
= osdmap
.get_epoch() + 1;
3008 void OSDMonitor::check_pg_creates_subs()
3010 if (!mon
->monmap
->get_required_features().contains_all(
3011 ceph::features::mon::FEATURE_LUMINOUS
)) {
3012 // PGMonitor takes care of this in pre-luminous era.
3015 if (!osdmap
.get_num_up_osds()) {
3018 assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
3019 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
3020 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
3021 if (pg_creates_subs
== session_map
.subs
.end()) {
3024 for (auto sub
: *pg_creates_subs
->second
) {
3025 check_pg_creates_sub(sub
);
3030 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
3032 dout(20) << __func__
<< " .. " << sub
->session
->inst
<< dendl
;
3033 assert(sub
->type
== "osd_pg_creates");
3034 // only send these if the OSD is up. we will check_subs() when they do
3035 // come up so they will get the creates then.
3036 if (sub
->session
->inst
.name
.is_osd() &&
3037 mon
->osdmon()->osdmap
.is_up(sub
->session
->inst
.name
.num())) {
3038 sub
->next
= send_pg_creates(sub
->session
->inst
.name
.num(),
3039 sub
->session
->con
.get(),
3044 unsigned OSDMonitor::scan_for_creating_pgs(
3045 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
3046 const mempool::osdmap::set
<int64_t>& removed_pools
,
3048 creating_pgs_t
* creating_pgs
) const
3050 unsigned queued
= 0;
3051 for (auto& p
: pools
) {
3052 int64_t poolid
= p
.first
;
3053 const pg_pool_t
& pool
= p
.second
;
3054 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
3055 pool
.get_type(), pool
.get_size());
3056 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
3059 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
3060 const auto created
= pool
.get_last_change();
3061 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
3062 dout(10) << __func__
<< " no change in pool " << poolid
3063 << " " << pool
<< dendl
;
3066 if (removed_pools
.count(poolid
)) {
3067 dout(10) << __func__
<< " pool is being removed: " << poolid
3068 << " " << pool
<< dendl
;
3071 dout(10) << __func__
<< " queueing pool create for " << poolid
3072 << " " << pool
<< dendl
;
3073 if (creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
3074 created
, modified
)) {
3081 void OSDMonitor::update_creating_pgs()
3083 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
3084 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
3085 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
3086 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3087 for (auto& pg
: creating_pgs
.pgs
) {
3088 int acting_primary
= -1;
3089 auto pgid
= pg
.first
;
3090 auto mapped
= pg
.second
.first
;
3091 dout(20) << __func__
<< " looking up " << pgid
<< dendl
;
3092 mapping
.get(pgid
, nullptr, nullptr, nullptr, &acting_primary
);
3093 // check the previous creating_pgs, look for the target to whom the pg was
3094 // previously mapped
3095 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
3096 const auto last_acting_primary
= pgs_by_epoch
.first
;
3097 for (auto& pgs
: pgs_by_epoch
.second
) {
3098 if (pgs
.second
.count(pgid
)) {
3099 if (last_acting_primary
== acting_primary
) {
3102 dout(20) << __func__
<< " " << pgid
<< " "
3103 << " acting_primary:" << last_acting_primary
3104 << " -> " << acting_primary
<< dendl
;
3105 // note epoch if the target of the create message changed.
3106 mapped
= mapping
.get_epoch();
3111 mapped
= mapping
.get_epoch();
3115 dout(10) << __func__
<< " will instruct osd." << acting_primary
3116 << " to create " << pgid
<< dendl
;
3117 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(pgid
);
3119 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
3120 creating_pgs_epoch
= mapping
.get_epoch();
3123 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
)
3125 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
3126 << " " << creating_pgs_by_osd_epoch
<< dendl
;
3127 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3128 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
3129 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
3131 assert(!creating_pgs_by_epoch
->second
.empty());
3133 MOSDPGCreate
*m
= nullptr;
3135 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
3136 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
3137 auto epoch
= epoch_pgs
->first
;
3138 auto& pgs
= epoch_pgs
->second
;
3139 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3140 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
3142 for (auto& pg
: pgs
) {
3144 m
= new MOSDPGCreate(creating_pgs_epoch
);
3145 // Need the create time from the monitor using its clock to set
3146 // last_scrub_stamp upon pg creation.
3147 const auto& creation
= creating_pgs
.pgs
[pg
];
3148 m
->mkpg
.emplace(pg
, pg_create_t
{creation
.first
, pg
, 0});
3149 m
->ctimes
.emplace(pg
, creation
.second
);
3150 dout(20) << __func__
<< " will create " << pg
3151 << " at " << creation
.first
<< dendl
;
3155 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3156 << " has nothing to send" << dendl
;
3159 con
->send_message(m
);
3160 // sub is current through last + 1
3167 void OSDMonitor::tick()
3169 if (!is_active()) return;
3171 dout(10) << osdmap
<< dendl
;
3173 if (!mon
->is_leader()) return;
3175 bool do_propose
= false;
3176 utime_t now
= ceph_clock_now();
3178 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3179 mon
->monmap
->get_required_features().contains_all(
3180 ceph::features::mon::FEATURE_LUMINOUS
)) {
3181 if (handle_osd_timeouts(now
, last_osd_report
)) {
3187 if (check_failures(now
))
3190 // mark down osds out?
3192 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
3193 * influence at all. The decision is made based on the ratio of "in" osds,
3194 * and the function returns false if this ratio is lower that the minimum
3195 * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
3197 if (can_mark_out(-1)) {
3198 set
<int> down_cache
; // quick cache of down subtrees
3200 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
3201 while (i
!= down_pending_out
.end()) {
3207 if (osdmap
.is_down(o
) &&
3210 utime_t
orig_grace(g_conf
->mon_osd_down_out_interval
, 0);
3211 utime_t grace
= orig_grace
;
3212 double my_grace
= 0.0;
3214 if (g_conf
->mon_osd_adjust_down_out_interval
) {
3215 // scale grace period the same way we do the heartbeat grace.
3216 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
3217 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
3218 double decay_k
= ::log(.5) / halflife
;
3219 double decay
= exp((double)down
* decay_k
);
3220 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
3221 << " down for " << down
<< " decay " << decay
<< dendl
;
3222 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3226 // is this an entire large subtree down?
3227 if (g_conf
->mon_osd_down_out_subtree_limit
.length()) {
3228 int type
= osdmap
.crush
->get_type_id(g_conf
->mon_osd_down_out_subtree_limit
);
3230 if (osdmap
.containing_subtree_is_down(g_ceph_context
, o
, type
, &down_cache
)) {
3231 dout(10) << "tick entire containing " << g_conf
->mon_osd_down_out_subtree_limit
3232 << " subtree for osd." << o
<< " is down; resetting timer" << dendl
;
3233 // reset timer, too.
3234 down_pending_out
[o
] = now
;
3240 if (g_conf
->mon_osd_down_out_interval
> 0 &&
3241 down
.sec() >= grace
) {
3242 dout(10) << "tick marking osd." << o
<< " OUT after " << down
3243 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
3244 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
3246 // set the AUTOOUT bit.
3247 if (pending_inc
.new_state
.count(o
) == 0)
3248 pending_inc
.new_state
[o
] = 0;
3249 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
3251 // remember previous weight
3252 if (pending_inc
.new_xinfo
.count(o
) == 0)
3253 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
3254 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
3258 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
3259 << int(down
.sec()) << " seconds)";
3264 down_pending_out
.erase(o
);
3267 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
3270 // expire blacklisted items?
3271 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
3272 p
!= osdmap
.blacklist
.end();
3274 if (p
->second
< now
) {
3275 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
3276 pending_inc
.old_blacklist
.push_back(p
->first
);
3281 // if map full setting has changed, get that info out there!
3282 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
&&
3283 mon
->pgservice
->is_readable()) {
3284 // for pre-luminous compat only!
3285 if (mon
->pgservice
->have_full_osds()) {
3286 dout(5) << "There are full osds, setting full flag" << dendl
;
3287 add_flag(CEPH_OSDMAP_FULL
);
3288 } else if (osdmap
.test_flag(CEPH_OSDMAP_FULL
)){
3289 dout(10) << "No full osds, removing full flag" << dendl
;
3290 remove_flag(CEPH_OSDMAP_FULL
);
3293 if (mon
->pgservice
->have_nearfull_osds()) {
3294 dout(5) << "There are near full osds, setting nearfull flag" << dendl
;
3295 add_flag(CEPH_OSDMAP_NEARFULL
);
3296 } else if (osdmap
.test_flag(CEPH_OSDMAP_NEARFULL
)){
3297 dout(10) << "No near full osds, removing nearfull flag" << dendl
;
3298 remove_flag(CEPH_OSDMAP_NEARFULL
);
3300 if (pending_inc
.new_flags
!= -1 &&
3301 (pending_inc
.new_flags
^ osdmap
.flags
) & (CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
3302 dout(1) << "New setting for" <<
3303 (pending_inc
.new_flags
& CEPH_OSDMAP_FULL
? " CEPH_OSDMAP_FULL" : "") <<
3304 (pending_inc
.new_flags
& CEPH_OSDMAP_NEARFULL
? " CEPH_OSDMAP_NEARFULL" : "")
3305 << " -- doing propose" << dendl
;
3310 if (update_pools_status())
3314 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
3318 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
3319 std::map
<int,utime_t
> &last_osd_report
)
3321 utime_t
timeo(g_conf
->mon_osd_report_timeout
, 0);
3322 if (now
- mon
->get_leader_since() < timeo
) {
3323 // We haven't been the leader for long enough to consider OSD timeouts
3327 int max_osd
= osdmap
.get_max_osd();
3328 bool new_down
= false;
3330 for (int i
=0; i
< max_osd
; ++i
) {
3331 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
3332 if (!osdmap
.is_up(i
))
3334 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
3335 if (t
== last_osd_report
.end()) {
3336 // it wasn't in the map; start the timer.
3337 last_osd_report
[i
] = now
;
3338 } else if (can_mark_down(i
)) {
3339 utime_t diff
= now
- t
->second
;
3341 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
3342 << diff
<< " seconds";
3343 derr
<< "no beacon from osd." << i
<< " since " << t
->second
3344 << ", " << diff
<< " seconds ago. marking down" << dendl
;
3345 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
3353 void OSDMonitor::get_health(list
<pair
<health_status_t
,string
> >& summary
,
3354 list
<pair
<health_status_t
,string
> > *detail
,
3355 CephContext
*cct
) const
3357 int num_osds
= osdmap
.get_num_osds();
3359 if (num_osds
== 0) {
3360 summary
.push_back(make_pair(HEALTH_ERR
, "no osds"));
3362 int num_in_osds
= 0;
3363 int num_down_in_osds
= 0;
3365 set
<int> down_in_osds
;
3366 set
<int> up_in_osds
;
3367 set
<int> subtree_up
;
3368 unordered_map
<int, set
<int> > subtree_type_down
;
3369 unordered_map
<int, int> num_osds_subtree
;
3370 int max_type
= osdmap
.crush
->get_max_type_id();
3372 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3373 if (!osdmap
.exists(i
)) {
3374 if (osdmap
.crush
->item_exists(i
)) {
3379 if (osdmap
.is_out(i
))
3382 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
3384 if (!osdmap
.is_up(i
)) {
3385 down_in_osds
.insert(i
);
3388 for (int type
= 0; type
<= max_type
; type
++) {
3389 if (!osdmap
.crush
->get_type_name(type
))
3391 int r
= osdmap
.crush
->get_immediate_parent_id(current
, &parent_id
);
3394 // break early if this parent is already marked as up
3395 if (subtree_up
.count(parent_id
))
3397 type
= osdmap
.crush
->get_bucket_type(parent_id
);
3398 if (!osdmap
.subtree_type_is_down(
3399 g_ceph_context
, parent_id
, type
,
3400 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
3402 current
= parent_id
;
3407 // calculate the number of down osds in each down subtree and
3408 // store it in num_osds_subtree
3409 for (int type
= 1; type
<= max_type
; type
++) {
3410 if (!osdmap
.crush
->get_type_name(type
))
3412 for (auto j
= subtree_type_down
[type
].begin();
3413 j
!= subtree_type_down
[type
].end();
3417 int num
= osdmap
.crush
->get_children(*j
, &children
);
3418 num_osds_subtree
[*j
] = num
;
3422 int num_children
= osdmap
.crush
->get_children(*j
, &children
);
3423 if (num_children
== 0)
3425 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
3426 if (num_osds_subtree
[*l
] > 0) {
3427 num
= num
+ num_osds_subtree
[*l
];
3430 num_osds_subtree
[*j
] = num
;
3434 num_down_in_osds
= down_in_osds
.size();
3435 assert(num_down_in_osds
<= num_in_osds
);
3436 if (num_down_in_osds
> 0) {
3437 // summary of down subtree types and osds
3438 for (int type
= max_type
; type
> 0; type
--) {
3439 if (!osdmap
.crush
->get_type_name(type
))
3441 if (subtree_type_down
[type
].size() > 0) {
3443 ss
<< subtree_type_down
[type
].size() << " "
3444 << osdmap
.crush
->get_type_name(type
);
3445 if (subtree_type_down
[type
].size() > 1) {
3448 int sum_down_osds
= 0;
3449 for (auto j
= subtree_type_down
[type
].begin();
3450 j
!= subtree_type_down
[type
].end();
3452 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
3454 ss
<< " (" << sum_down_osds
<< " osds) down";
3455 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3459 ss
<< down_in_osds
.size() << " osds down";
3460 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3463 // details of down subtree types
3464 for (int type
= max_type
; type
> 0; type
--) {
3465 if (!osdmap
.crush
->get_type_name(type
))
3467 for (auto j
= subtree_type_down
[type
].rbegin();
3468 j
!= subtree_type_down
[type
].rend();
3471 ss
<< osdmap
.crush
->get_type_name(type
);
3473 ss
<< osdmap
.crush
->get_item_name(*j
);
3474 // at the top level, do not print location
3475 if (type
!= max_type
) {
3477 ss
<< osdmap
.crush
->get_full_location_ordered_string(*j
);
3480 int num
= num_osds_subtree
[*j
];
3481 ss
<< " (" << num
<< " osds)";
3483 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3486 // details of down osds
3487 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
3489 ss
<< "osd." << *it
<< " (";
3490 ss
<< osdmap
.crush
->get_full_location_ordered_string(*it
);
3492 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3497 if (!osds
.empty()) {
3499 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
3500 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3502 ss
<< " (osds: " << osds
<< ")";
3503 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3507 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
3508 // An osd could configure failsafe ratio, to something different
3509 // but for now assume it is the same here.
3510 float fsr
= g_conf
->osd_failsafe_full_ratio
;
3511 if (fsr
> 1.0) fsr
/= 100;
3512 float fr
= osdmap
.get_full_ratio();
3513 float br
= osdmap
.get_backfillfull_ratio();
3514 float nr
= osdmap
.get_nearfull_ratio();
3516 bool out_of_order
= false;
3517 // These checks correspond to how OSDService::check_full_status() in an OSD
3518 // handles the improper setting of these values.
3520 out_of_order
= true;
3523 ss
<< "backfillfull_ratio (" << br
<< ") < nearfull_ratio (" << nr
<< "), increased";
3524 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3529 out_of_order
= true;
3532 ss
<< "full_ratio (" << fr
<< ") < backfillfull_ratio (" << br
<< "), increased";
3533 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3538 out_of_order
= true;
3541 ss
<< "osd_failsafe_full_ratio (" << fsr
<< ") < full_ratio (" << fr
<< "), increased";
3542 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3547 ss
<< "Full ratio(s) out of order";
3548 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
3551 set
<int> full
, backfillfull
, nearfull
;
3552 osdmap
.get_full_osd_counts(&full
, &backfillfull
, &nearfull
);
3555 ss
<< full
.size() << " full osd(s)";
3556 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
3558 if (backfillfull
.size()) {
3560 ss
<< backfillfull
.size() << " backfillfull osd(s)";
3561 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3563 if (nearfull
.size()) {
3565 ss
<< nearfull
.size() << " nearfull osd(s)";
3566 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3569 for (auto& i
: full
) {
3571 ss
<< "osd." << i
<< " is full";
3572 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3574 for (auto& i
: backfillfull
) {
3576 ss
<< "osd." << i
<< " is backfill full";
3577 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3579 for (auto& i
: nearfull
) {
3581 ss
<< "osd." << i
<< " is near full";
3582 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3586 // warn if there is any noup osds.
3587 vector
<int> noup_osds
;
3588 osdmap
.get_noup_osds(&noup_osds
);
3589 if (noup_osds
.size()) {
3591 ss
<< noup_osds
.size() << " noup osd(s)";
3592 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3594 ss
<< ": " << noup_osds
;
3595 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3599 // warn if there is any nodown osds.
3600 vector
<int> nodown_osds
;
3601 osdmap
.get_nodown_osds(&nodown_osds
);
3602 if (nodown_osds
.size()) {
3604 ss
<< nodown_osds
.size() << " nodown osd(s)";
3605 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3607 ss
<< ": " << nodown_osds
;
3608 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3612 // warn if there is any noin osds.
3613 vector
<int> noin_osds
;
3614 osdmap
.get_noin_osds(&noin_osds
);
3615 if (noin_osds
.size()) {
3617 ss
<< noin_osds
.size() << " noin osd(s)";
3618 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3620 ss
<< ": " << noin_osds
;
3621 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3625 // warn if there is any noout osds.
3626 vector
<int> noout_osds
;
3627 osdmap
.get_noout_osds(&noout_osds
);
3628 if (noout_osds
.size()) {
3630 ss
<< noout_osds
.size() << " noout osd(s)";
3631 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3633 ss
<< ": " << noout_osds
;
3634 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3638 // note: we leave it to ceph-mgr to generate details health warnings
3639 // with actual osd utilizations
3642 uint64_t warn_flags
=
3644 CEPH_OSDMAP_PAUSERD
|
3645 CEPH_OSDMAP_PAUSEWR
|
3646 CEPH_OSDMAP_PAUSEREC
|
3648 CEPH_OSDMAP_NODOWN
|
3651 CEPH_OSDMAP_NOBACKFILL
|
3652 CEPH_OSDMAP_NORECOVER
|
3653 CEPH_OSDMAP_NOSCRUB
|
3654 CEPH_OSDMAP_NODEEP_SCRUB
|
3655 CEPH_OSDMAP_NOTIERAGENT
|
3656 CEPH_OSDMAP_NOREBALANCE
;
3657 if (osdmap
.test_flag(warn_flags
)) {
3659 ss
<< osdmap
.get_flag_string(osdmap
.get_flags() & warn_flags
)
3661 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3663 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3666 // old crush tunables?
3667 if (g_conf
->mon_warn_on_legacy_crush_tunables
) {
3668 string min
= osdmap
.crush
->get_min_required_version();
3669 if (min
< g_conf
->mon_crush_min_required_version
) {
3671 ss
<< "crush map has legacy tunables (require " << min
3672 << ", min is " << g_conf
->mon_crush_min_required_version
<< ")";
3673 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3675 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3676 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3680 if (g_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
3681 if (osdmap
.crush
->get_straw_calc_version() == 0) {
3683 ss
<< "crush map has straw_calc_version=0";
3684 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3686 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3687 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3692 // hit_set-less cache_mode?
3693 if (g_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
3694 int problem_cache_pools
= 0;
3695 for (map
<int64_t, pg_pool_t
>::const_iterator p
= osdmap
.pools
.begin();
3696 p
!= osdmap
.pools
.end();
3698 const pg_pool_t
& info
= p
->second
;
3699 if (info
.cache_mode_requires_hit_set() &&
3700 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
3701 ++problem_cache_pools
;
3704 ss
<< "pool '" << osdmap
.get_pool_name(p
->first
)
3705 << "' with cache_mode " << info
.get_cache_mode_name()
3706 << " needs hit_set_type to be set but it is not";
3707 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3711 if (problem_cache_pools
) {
3713 ss
<< problem_cache_pools
<< " cache pools are missing hit_sets";
3714 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3718 if (osdmap
.crush
->has_multirule_rulesets()) {
3720 ss
<< "CRUSH map contains multirule rulesets";
3721 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3723 ss
<< "; please manually fix the map";
3724 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3728 // Not using 'sortbitwise' and should be?
3729 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
3730 (osdmap
.get_up_osd_features() &
3731 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
3733 ss
<< "no legacy OSD present but 'sortbitwise' flag is not set";
3734 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3737 // Warn if 'mon_osd_down_out_interval' is set to zero.
3738 // Having this option set to zero on the leader acts much like the
3739 // 'noout' flag. It's hard to figure out what's going wrong with clusters
3740 // without the 'noout' flag set but acting like that just the same, so
3741 // we report a HEALTH_WARN in case this option is set to zero.
3742 // This is an ugly hack to get the warning out, but until we find a way
3743 // to spread global options throughout the mon cluster and have all mons
3744 // using a base set of the same options, we need to work around this sort
3746 // There's also the obvious drawback that if this is set on a single
3747 // monitor on a 3-monitor cluster, this warning will only be shown every
3748 // third monitor connection.
3749 if (g_conf
->mon_warn_on_osd_down_out_interval_zero
&&
3750 g_conf
->mon_osd_down_out_interval
== 0) {
3752 ss
<< "mon." << mon
->name
<< " has mon_osd_down_out_interval set to 0";
3753 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3755 ss
<< "; this has the same effect as the 'noout' flag";
3756 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3760 // warn about upgrade flags that can be set but are not.
3761 if (g_conf
->mon_debug_no_require_luminous
) {
3762 // ignore these checks
3763 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
) &&
3764 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
3765 string msg
= "all OSDs are running luminous or later but"
3766 " require_osd_release < luminous";
3767 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3769 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3771 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
) &&
3772 osdmap
.require_osd_release
< CEPH_RELEASE_KRAKEN
) {
3773 string msg
= "all OSDs are running kraken or later but"
3774 " require_osd_release < kraken";
3775 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3777 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3779 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
) &&
3780 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
3781 string msg
= "all OSDs are running jewel or later but"
3782 " require_osd_release < jewel";
3783 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3785 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3789 for (auto it
: osdmap
.get_pools()) {
3790 const pg_pool_t
&pool
= it
.second
;
3791 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
3792 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
3794 ss
<< "pool '" << pool_name
<< "' is full";
3795 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3797 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3803 void OSDMonitor::dump_info(Formatter
*f
)
3805 f
->open_object_section("osdmap");
3809 f
->open_array_section("osd_metadata");
3810 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
3811 if (osdmap
.exists(i
)) {
3812 f
->open_object_section("osd");
3813 f
->dump_unsigned("id", i
);
3814 dump_osd_metadata(i
, f
, NULL
);
3820 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
3821 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
3823 f
->open_object_section("crushmap");
3824 osdmap
.crush
->dump(f
);
// Keys accepted by "ceph osd pool get <pool> <var>".  Each enumerator
// corresponds to one pool property name (the string-to-enum mapping is
// built where the command is handled).  Order/values are internal only;
// nothing persists these, but keep new entries appended for tidy diffs.
enum osd_pool_get_choices {
  SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
  PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
  NODELETE, NOPGCHANGE, NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
  // cache-tier / hit-set tunables (only meaningful on tier pools)
  HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
  USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
  // erasure-coded pool only
  ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
  HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
  // scrub scheduling overrides
  SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
  // bluestore compression / checksum per-pool overrides
  COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };
3848 std::set
<osd_pool_get_choices
>
3849 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
3850 const std::set
<osd_pool_get_choices
>& second
)
3852 std::set
<osd_pool_get_choices
> result
;
3853 std::set_difference(first
.begin(), first
.end(),
3854 second
.begin(), second
.end(),
3855 std::inserter(result
, result
.end()));
3861 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
3863 op
->mark_osdmon_event(__func__
);
3864 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
3867 stringstream ss
, ds
;
3869 map
<string
, cmd_vartype
> cmdmap
;
3870 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
3871 string rs
= ss
.str();
3872 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
3876 MonSession
*session
= m
->get_session();
3878 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
3883 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
3886 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
3887 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
3889 if (prefix
== "osd stat") {
3890 osdmap
.print_summary(f
.get(), ds
, "");
3896 else if (prefix
== "osd perf" ||
3897 prefix
== "osd blocked-by") {
3898 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
3899 osdmap
, f
.get(), &ss
, &rdata
);
3901 else if (prefix
== "osd dump" ||
3902 prefix
== "osd tree" ||
3903 prefix
== "osd ls" ||
3904 prefix
== "osd getmap" ||
3905 prefix
== "osd getcrushmap" ||
3906 prefix
== "osd ls-tree") {
3911 cmd_getval(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
3914 bufferlist osdmap_bl
;
3915 int err
= get_version_full(epoch
, osdmap_bl
);
3916 if (err
== -ENOENT
) {
3918 ss
<< "there is no map for epoch " << epoch
;
3922 assert(osdmap_bl
.length());
3925 if (epoch
== osdmap
.get_epoch()) {
3929 p
->decode(osdmap_bl
);
3932 auto sg
= make_scope_guard([&] {
3938 if (prefix
== "osd dump") {
3941 f
->open_object_section("osdmap");
3951 } else if (prefix
== "osd ls") {
3953 f
->open_array_section("osds");
3954 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3955 if (osdmap
.exists(i
)) {
3956 f
->dump_int("osd", i
);
3963 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3964 if (osdmap
.exists(i
)) {
3973 } else if (prefix
== "osd tree") {
3974 vector
<string
> states
;
3975 cmd_getval(g_ceph_context
, cmdmap
, "states", states
);
3976 unsigned filter
= 0;
3977 for (auto& s
: states
) {
3979 filter
|= OSDMap::DUMP_UP
;
3980 } else if (s
== "down") {
3981 filter
|= OSDMap::DUMP_DOWN
;
3982 } else if (s
== "in") {
3983 filter
|= OSDMap::DUMP_IN
;
3984 } else if (s
== "out") {
3985 filter
|= OSDMap::DUMP_OUT
;
3987 ss
<< "unrecognized state '" << s
<< "'";
3992 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
3993 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
) ||
3994 (filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
3995 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) {
3996 ss
<< "cannot specify both up and down or both in and out";
4001 f
->open_object_section("tree");
4002 p
->print_tree(f
.get(), NULL
, filter
);
4006 p
->print_tree(NULL
, &ds
, filter
);
4009 } else if (prefix
== "osd getmap") {
4010 rdata
.append(osdmap_bl
);
4011 ss
<< "got osdmap epoch " << p
->get_epoch();
4012 } else if (prefix
== "osd getcrushmap") {
4013 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
4014 ss
<< p
->get_crush_version();
4015 } else if (prefix
== "osd ls-tree") {
4017 cmd_getval(g_ceph_context
, cmdmap
, "name", bucket_name
);
4019 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
4021 ss
<< "\"" << bucket_name
<< "\" does not exist";
4024 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
4029 f
->open_array_section("osds");
4030 for (auto &i
: osds
) {
4031 if (osdmap
.exists(i
)) {
4032 f
->dump_int("osd", i
);
4039 for (auto &i
: osds
) {
4040 if (osdmap
.exists(i
)) {
4051 } else if (prefix
== "osd df") {
4053 cmd_getval(g_ceph_context
, cmdmap
, "output_method", method
);
4054 print_osd_utilization(osdmap
, mon
->pgservice
, ds
,
4055 f
.get(), method
== "tree");
4057 } else if (prefix
== "osd getmaxosd") {
4059 f
->open_object_section("getmaxosd");
4060 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4061 f
->dump_int("max_osd", osdmap
.get_max_osd());
4065 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
4068 } else if (prefix
== "osd utilization") {
4070 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
4077 } else if (prefix
== "osd find") {
4079 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
4080 ss
<< "unable to parse osd id value '"
4081 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4085 if (!osdmap
.exists(osd
)) {
4086 ss
<< "osd." << osd
<< " does not exist";
4091 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4092 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4093 f
->open_object_section("osd_location");
4094 f
->dump_int("osd", osd
);
4095 f
->dump_stream("ip") << osdmap
.get_addr(osd
);
4096 f
->open_object_section("crush_location");
4097 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
4098 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
4099 f
->dump_string(p
->first
.c_str(), p
->second
);
4103 } else if (prefix
== "osd metadata") {
4105 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
4106 !cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
4107 ss
<< "unable to parse osd id value '"
4108 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4112 if (osd
>= 0 && !osdmap
.exists(osd
)) {
4113 ss
<< "osd." << osd
<< " does not exist";
4118 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4119 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4121 f
->open_object_section("osd_metadata");
4122 f
->dump_unsigned("id", osd
);
4123 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
4129 f
->open_array_section("osd_metadata");
4130 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4131 if (osdmap
.exists(i
)) {
4132 f
->open_object_section("osd");
4133 f
->dump_unsigned("id", i
);
4134 r
= dump_osd_metadata(i
, f
.get(), NULL
);
4135 if (r
== -EINVAL
|| r
== -ENOENT
) {
4136 // Drop error, continue to get other daemons' metadata
4137 dout(4) << "No metadata for osd." << i
<< dendl
;
4149 } else if (prefix
== "osd versions") {
4151 f
.reset(Formatter::create("json-pretty"));
4152 count_metadata("ceph_version", f
.get());
4155 } else if (prefix
== "osd count-metadata") {
4157 f
.reset(Formatter::create("json-pretty"));
4159 cmd_getval(g_ceph_context
, cmdmap
, "property", field
);
4160 count_metadata(field
, f
.get());
4163 } else if (prefix
== "osd map") {
4164 string poolstr
, objstr
, namespacestr
;
4165 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4166 cmd_getval(g_ceph_context
, cmdmap
, "object", objstr
);
4167 cmd_getval(g_ceph_context
, cmdmap
, "nspace", namespacestr
);
4169 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4171 ss
<< "pool " << poolstr
<< " does not exist";
4175 object_locator_t
oloc(pool
, namespacestr
);
4176 object_t
oid(objstr
);
4177 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
4178 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4179 vector
<int> up
, acting
;
4181 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
4184 if (!namespacestr
.empty())
4185 fullobjname
= namespacestr
+ string("/") + oid
.name
;
4187 fullobjname
= oid
.name
;
4189 f
->open_object_section("osd_map");
4190 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4191 f
->dump_string("pool", poolstr
);
4192 f
->dump_int("pool_id", pool
);
4193 f
->dump_stream("objname") << fullobjname
;
4194 f
->dump_stream("raw_pgid") << pgid
;
4195 f
->dump_stream("pgid") << mpgid
;
4196 f
->open_array_section("up");
4197 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
4198 f
->dump_int("osd", *p
);
4200 f
->dump_int("up_primary", up_p
);
4201 f
->open_array_section("acting");
4202 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
4203 f
->dump_int("osd", *p
);
4205 f
->dump_int("acting_primary", acting_p
);
4206 f
->close_section(); // osd_map
4209 ds
<< "osdmap e" << osdmap
.get_epoch()
4210 << " pool '" << poolstr
<< "' (" << pool
<< ")"
4211 << " object '" << fullobjname
<< "' ->"
4212 << " pg " << pgid
<< " (" << mpgid
<< ")"
4213 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
4214 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
4218 } else if (prefix
== "pg map") {
4221 cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
4222 if (!pgid
.parse(pgidstr
.c_str())) {
4223 ss
<< "invalid pgid '" << pgidstr
<< "'";
4227 vector
<int> up
, acting
;
4228 if (!osdmap
.have_pg_pool(pgid
.pool())) {
4229 ss
<< "pg '" << pgidstr
<< "' does not exist";
4233 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4234 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
4236 f
->open_object_section("pg_map");
4237 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4238 f
->dump_stream("raw_pgid") << pgid
;
4239 f
->dump_stream("pgid") << mpgid
;
4240 f
->open_array_section("up");
4241 for (auto osd
: up
) {
4242 f
->dump_int("up_osd", osd
);
4245 f
->open_array_section("acting");
4246 for (auto osd
: acting
) {
4247 f
->dump_int("acting_osd", osd
);
4253 ds
<< "osdmap e" << osdmap
.get_epoch()
4254 << " pg " << pgid
<< " (" << mpgid
<< ")"
4255 << " -> up " << up
<< " acting " << acting
;
4260 } else if (prefix
== "osd scrub" ||
4261 prefix
== "osd deep-scrub" ||
4262 prefix
== "osd repair") {
4264 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
4265 vector
<string
> pvec
;
4266 get_str_vec(prefix
, pvec
);
4268 if (whostr
== "*" || whostr
== "all" || whostr
== "any") {
4271 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++)
4272 if (osdmap
.is_up(i
)) {
4273 ss
<< (c
++ ? "," : "") << i
;
4274 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4275 pvec
.back() == "repair",
4276 pvec
.back() == "deep-scrub"),
4277 osdmap
.get_inst(i
));
4280 ss
<< " instructed to " << pvec
.back();
4282 long osd
= parse_osd_id(whostr
.c_str(), &ss
);
4285 } else if (osdmap
.is_up(osd
)) {
4286 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4287 pvec
.back() == "repair",
4288 pvec
.back() == "deep-scrub"),
4289 osdmap
.get_inst(osd
));
4290 ss
<< "osd." << osd
<< " instructed to " << pvec
.back();
4292 ss
<< "osd." << osd
<< " is not up";
4296 } else if (prefix
== "osd lspools") {
4298 cmd_getval(g_ceph_context
, cmdmap
, "auid", auid
, int64_t(0));
4300 f
->open_array_section("pools");
4301 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
4302 p
!= osdmap
.pools
.end();
4304 if (!auid
|| p
->second
.auid
== (uint64_t)auid
) {
4306 f
->open_object_section("pool");
4307 f
->dump_int("poolnum", p
->first
);
4308 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
4311 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
] << ',';
4320 } else if (prefix
== "osd blacklist ls") {
4322 f
->open_array_section("blacklist");
4324 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
4325 p
!= osdmap
.blacklist
.end();
4328 f
->open_object_section("entry");
4329 f
->dump_stream("addr") << p
->first
;
4330 f
->dump_stream("until") << p
->second
;
4335 ss
<< p
->first
<< " " << p
->second
;
4345 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
4347 } else if (prefix
== "osd pool ls") {
4349 cmd_getval(g_ceph_context
, cmdmap
, "detail", detail
);
4350 if (!f
&& detail
== "detail") {
4352 osdmap
.print_pools(ss
);
4353 rdata
.append(ss
.str());
4356 f
->open_array_section("pools");
4357 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
4358 it
!= osdmap
.get_pools().end();
4361 if (detail
== "detail") {
4362 f
->open_object_section("pool");
4363 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4364 it
->second
.dump(f
.get());
4367 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4370 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
4379 } else if (prefix
== "osd crush get-tunable") {
4381 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
4384 f
->open_object_section("tunable");
4385 if (tunable
== "straw_calc_version") {
4387 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
4389 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
4398 rdata
.append(rss
.str());
4402 } else if (prefix
== "osd pool get") {
4404 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4405 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4407 ss
<< "unrecognized pool '" << poolstr
<< "'";
4412 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
4414 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
4416 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
4417 const choices_map_t ALL_CHOICES
= {
4419 {"min_size", MIN_SIZE
},
4420 {"crash_replay_interval", CRASH_REPLAY_INTERVAL
},
4421 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
4422 {"crush_rule", CRUSH_RULE
},
4423 {"hashpspool", HASHPSPOOL
}, {"nodelete", NODELETE
},
4424 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
4425 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
4426 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
4427 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
4428 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
4429 {"use_gmt_hitset", USE_GMT_HITSET
},
4430 {"auid", AUID
}, {"target_max_objects", TARGET_MAX_OBJECTS
},
4431 {"target_max_bytes", TARGET_MAX_BYTES
},
4432 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
4433 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
4434 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
4435 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
4436 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
4437 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
4438 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
4439 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
4440 {"fast_read", FAST_READ
},
4441 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
4442 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
4443 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
4444 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
4445 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
4446 {"recovery_priority", RECOVERY_PRIORITY
},
4447 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
4448 {"scrub_priority", SCRUB_PRIORITY
},
4449 {"compression_mode", COMPRESSION_MODE
},
4450 {"compression_algorithm", COMPRESSION_ALGORITHM
},
4451 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
4452 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
4453 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
4454 {"csum_type", CSUM_TYPE
},
4455 {"csum_max_block", CSUM_MAX_BLOCK
},
4456 {"csum_min_block", CSUM_MIN_BLOCK
},
4459 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
4461 const choices_set_t ONLY_TIER_CHOICES
= {
4462 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
4463 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
4464 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
4465 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
4466 MIN_READ_RECENCY_FOR_PROMOTE
,
4467 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
4469 const choices_set_t ONLY_ERASURE_CHOICES
= {
4470 ERASURE_CODE_PROFILE
4473 choices_set_t selected_choices
;
4475 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
4476 it
!= ALL_CHOICES
.end(); ++it
) {
4477 selected_choices
.insert(it
->second
);
4481 selected_choices
= subtract_second_from_first(selected_choices
,
4485 if(!p
->is_erasure()) {
4486 selected_choices
= subtract_second_from_first(selected_choices
,
4487 ONLY_ERASURE_CHOICES
);
4489 } else /* var != "all" */ {
4490 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
4491 osd_pool_get_choices selected
= found
->second
;
4493 if (!p
->is_tier() &&
4494 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
4495 ss
<< "pool '" << poolstr
4496 << "' is not a tier pool: variable not applicable";
4501 if (!p
->is_erasure() &&
4502 ONLY_ERASURE_CHOICES
.find(selected
)
4503 != ONLY_ERASURE_CHOICES
.end()) {
4504 ss
<< "pool '" << poolstr
4505 << "' is not a erasure pool: variable not applicable";
4510 selected_choices
.insert(selected
);
4514 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4515 it
!= selected_choices
.end(); ++it
) {
4516 choices_map_t::const_iterator i
;
4517 f
->open_object_section("pool");
4518 f
->dump_string("pool", poolstr
);
4519 f
->dump_int("pool_id", pool
);
4522 f
->dump_int("pg_num", p
->get_pg_num());
4525 f
->dump_int("pgp_num", p
->get_pgp_num());
4528 f
->dump_int("auid", p
->get_auid());
4531 f
->dump_int("size", p
->get_size());
4534 f
->dump_int("min_size", p
->get_min_size());
4536 case CRASH_REPLAY_INTERVAL
:
4537 f
->dump_int("crash_replay_interval",
4538 p
->get_crash_replay_interval());
4541 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
4542 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
4543 p
->get_crush_rule()));
4545 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
4552 case WRITE_FADVISE_DONTNEED
:
4555 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4556 if (i
->second
== *it
)
4559 assert(i
!= ALL_CHOICES
.end());
4560 f
->dump_string(i
->first
.c_str(),
4561 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
4564 case HIT_SET_PERIOD
:
4565 f
->dump_int("hit_set_period", p
->hit_set_period
);
4568 f
->dump_int("hit_set_count", p
->hit_set_count
);
4571 f
->dump_string("hit_set_type",
4572 HitSet::get_type_name(p
->hit_set_params
.get_type()));
4576 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4577 BloomHitSet::Params
*bloomp
=
4578 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4579 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
4580 } else if(var
!= "all") {
4582 ss
<< "hit set is not of type Bloom; " <<
4583 "invalid to get a false positive rate!";
4589 case USE_GMT_HITSET
:
4590 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
4592 case TARGET_MAX_OBJECTS
:
4593 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
4595 case TARGET_MAX_BYTES
:
4596 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
4598 case CACHE_TARGET_DIRTY_RATIO
:
4599 f
->dump_unsigned("cache_target_dirty_ratio_micro",
4600 p
->cache_target_dirty_ratio_micro
);
4601 f
->dump_float("cache_target_dirty_ratio",
4602 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
4604 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4605 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
4606 p
->cache_target_dirty_high_ratio_micro
);
4607 f
->dump_float("cache_target_dirty_high_ratio",
4608 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
4610 case CACHE_TARGET_FULL_RATIO
:
4611 f
->dump_unsigned("cache_target_full_ratio_micro",
4612 p
->cache_target_full_ratio_micro
);
4613 f
->dump_float("cache_target_full_ratio",
4614 ((float)p
->cache_target_full_ratio_micro
/1000000));
4616 case CACHE_MIN_FLUSH_AGE
:
4617 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
4619 case CACHE_MIN_EVICT_AGE
:
4620 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
4622 case ERASURE_CODE_PROFILE
:
4623 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
4625 case MIN_READ_RECENCY_FOR_PROMOTE
:
4626 f
->dump_int("min_read_recency_for_promote",
4627 p
->min_read_recency_for_promote
);
4629 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
4630 f
->dump_int("min_write_recency_for_promote",
4631 p
->min_write_recency_for_promote
);
4634 f
->dump_int("fast_read", p
->fast_read
);
4636 case HIT_SET_GRADE_DECAY_RATE
:
4637 f
->dump_int("hit_set_grade_decay_rate",
4638 p
->hit_set_grade_decay_rate
);
4640 case HIT_SET_SEARCH_LAST_N
:
4641 f
->dump_int("hit_set_search_last_n",
4642 p
->hit_set_search_last_n
);
4644 case SCRUB_MIN_INTERVAL
:
4645 case SCRUB_MAX_INTERVAL
:
4646 case DEEP_SCRUB_INTERVAL
:
4647 case RECOVERY_PRIORITY
:
4648 case RECOVERY_OP_PRIORITY
:
4649 case SCRUB_PRIORITY
:
4650 case COMPRESSION_MODE
:
4651 case COMPRESSION_ALGORITHM
:
4652 case COMPRESSION_REQUIRED_RATIO
:
4653 case COMPRESSION_MAX_BLOB_SIZE
:
4654 case COMPRESSION_MIN_BLOB_SIZE
:
4656 case CSUM_MAX_BLOCK
:
4657 case CSUM_MIN_BLOCK
:
4658 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4659 if (i
->second
== *it
)
4662 assert(i
!= ALL_CHOICES
.end());
4663 if(*it
== CSUM_TYPE
) {
4665 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
4666 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
4669 p
->opts
.dump(i
->first
, f
.get());
4678 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4679 it
!= selected_choices
.end(); ++it
) {
4680 choices_map_t::const_iterator i
;
4683 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
4686 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
4689 ss
<< "auid: " << p
->get_auid() << "\n";
4692 ss
<< "size: " << p
->get_size() << "\n";
4695 ss
<< "min_size: " << p
->get_min_size() << "\n";
4697 case CRASH_REPLAY_INTERVAL
:
4698 ss
<< "crash_replay_interval: " <<
4699 p
->get_crash_replay_interval() << "\n";
4702 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
4703 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
4704 p
->get_crush_rule()) << "\n";
4706 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
4709 case HIT_SET_PERIOD
:
4710 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
4713 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
4716 ss
<< "hit_set_type: " <<
4717 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
4721 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4722 BloomHitSet::Params
*bloomp
=
4723 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4724 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
4725 } else if(var
!= "all") {
4726 ss
<< "hit set is not of type Bloom; " <<
4727 "invalid to get a false positive rate!";
4733 case USE_GMT_HITSET
:
4734 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
4736 case TARGET_MAX_OBJECTS
:
4737 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
4739 case TARGET_MAX_BYTES
:
4740 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
4742 case CACHE_TARGET_DIRTY_RATIO
:
4743 ss
<< "cache_target_dirty_ratio: "
4744 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
4746 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4747 ss
<< "cache_target_dirty_high_ratio: "
4748 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
4750 case CACHE_TARGET_FULL_RATIO
:
4751 ss
<< "cache_target_full_ratio: "
4752 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
4754 case CACHE_MIN_FLUSH_AGE
:
4755 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
4757 case CACHE_MIN_EVICT_AGE
:
4758 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
4760 case ERASURE_CODE_PROFILE
:
4761 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
4763 case MIN_READ_RECENCY_FOR_PROMOTE
:
4764 ss
<< "min_read_recency_for_promote: " <<
4765 p
->min_read_recency_for_promote
<< "\n";
4767 case HIT_SET_GRADE_DECAY_RATE
:
4768 ss
<< "hit_set_grade_decay_rate: " <<
4769 p
->hit_set_grade_decay_rate
<< "\n";
4771 case HIT_SET_SEARCH_LAST_N
:
4772 ss
<< "hit_set_search_last_n: " <<
4773 p
->hit_set_search_last_n
<< "\n";
4779 case WRITE_FADVISE_DONTNEED
:
4782 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4783 if (i
->second
== *it
)
4786 assert(i
!= ALL_CHOICES
.end());
4787 ss
<< i
->first
<< ": " <<
4788 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
4789 "true" : "false") << "\n";
4791 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
4792 ss
<< "min_write_recency_for_promote: " <<
4793 p
->min_write_recency_for_promote
<< "\n";
4796 ss
<< "fast_read: " << p
->fast_read
<< "\n";
4798 case SCRUB_MIN_INTERVAL
:
4799 case SCRUB_MAX_INTERVAL
:
4800 case DEEP_SCRUB_INTERVAL
:
4801 case RECOVERY_PRIORITY
:
4802 case RECOVERY_OP_PRIORITY
:
4803 case SCRUB_PRIORITY
:
4804 case COMPRESSION_MODE
:
4805 case COMPRESSION_ALGORITHM
:
4806 case COMPRESSION_REQUIRED_RATIO
:
4807 case COMPRESSION_MAX_BLOB_SIZE
:
4808 case COMPRESSION_MIN_BLOB_SIZE
:
4810 case CSUM_MAX_BLOCK
:
4811 case CSUM_MIN_BLOCK
:
4812 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4813 if (i
->second
== *it
)
4816 assert(i
!= ALL_CHOICES
.end());
4818 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
4819 if (p
->opts
.is_set(key
)) {
4820 if(key
== pool_opts_t::CSUM_TYPE
) {
4822 p
->opts
.get(key
, &val
);
4823 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
4825 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
4831 rdata
.append(ss
.str());
4836 } else if (prefix
== "osd pool stats") {
4837 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
4838 osdmap
, f
.get(), &ss
, &rdata
);
4839 } else if (prefix
== "osd pool get-quota") {
4841 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
4843 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
4845 assert(poolid
== -ENOENT
);
4846 ss
<< "unrecognized pool '" << pool_name
<< "'";
4850 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
4853 f
->open_object_section("pool_quotas");
4854 f
->dump_string("pool_name", pool_name
);
4855 f
->dump_unsigned("pool_id", poolid
);
4856 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
4857 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
4862 rs
<< "quotas for pool '" << pool_name
<< "':\n"
4863 << " max objects: ";
4864 if (p
->quota_max_objects
== 0)
4867 rs
<< si_t(p
->quota_max_objects
) << " objects";
4870 if (p
->quota_max_bytes
== 0)
4873 rs
<< si_t(p
->quota_max_bytes
) << "B";
4874 rdata
.append(rs
.str());
4878 } else if (prefix
== "osd crush rule list" ||
4879 prefix
== "osd crush rule ls") {
4881 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4882 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4883 f
->open_array_section("rules");
4884 osdmap
.crush
->list_rules(f
.get());
4889 rdata
.append(rs
.str());
4890 } else if (prefix
== "osd crush rule dump") {
4892 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
4894 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4895 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4897 f
->open_array_section("rules");
4898 osdmap
.crush
->dump_rules(f
.get());
4901 int ruleno
= osdmap
.crush
->get_rule_id(name
);
4903 ss
<< "unknown crush rule '" << name
<< "'";
4907 osdmap
.crush
->dump_rule(ruleno
, f
.get());
4912 rdata
.append(rs
.str());
4913 } else if (prefix
== "osd crush dump") {
4915 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4916 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4917 f
->open_object_section("crush_map");
4918 osdmap
.crush
->dump(f
.get());
4923 rdata
.append(rs
.str());
4924 } else if (prefix
== "osd crush show-tunables") {
4926 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4927 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4928 f
->open_object_section("crush_map_tunables");
4929 osdmap
.crush
->dump_tunables(f
.get());
4934 rdata
.append(rs
.str());
4935 } else if (prefix
== "osd crush tree") {
4936 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4937 f
->open_array_section("crush_map_roots");
4938 osdmap
.crush
->dump_tree(f
.get());
4941 } else if (prefix
== "osd crush class ls") {
4942 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4943 f
->open_array_section("crush_classes");
4944 for (auto i
: osdmap
.crush
->class_name
)
4945 f
->dump_string("class", i
.second
);
4948 } else if (prefix
== "osd crush class ls-osd") {
4950 cmd_getval(g_ceph_context
, cmdmap
, "class", name
);
4951 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4953 osdmap
.crush
->get_devices_by_class(name
, &osds
);
4954 f
->open_array_section("osds");
4955 for (auto& osd
: osds
)
4956 f
->dump_int("osd", osd
);
4959 } else if (prefix
== "osd erasure-code-profile ls") {
4960 const auto &profiles
= osdmap
.get_erasure_code_profiles();
4962 f
->open_array_section("erasure-code-profiles");
4963 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
4965 f
->dump_string("profile", i
->first
.c_str());
4967 rdata
.append(i
->first
+ "\n");
4974 rdata
.append(rs
.str());
4976 } else if (prefix
== "osd erasure-code-profile get") {
4978 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
4979 if (!osdmap
.has_erasure_code_profile(name
)) {
4980 ss
<< "unknown erasure code profile '" << name
<< "'";
4984 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
4986 f
->open_object_section("profile");
4987 for (map
<string
,string
>::const_iterator i
= profile
.begin();
4991 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
4993 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
5000 rdata
.append(rs
.str());
5003 // try prepare update
5010 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
5014 void OSDMonitor::update_pool_flags(int64_t pool_id
, uint64_t flags
)
5016 const pg_pool_t
*pool
= osdmap
.get_pg_pool(pool_id
);
5017 pending_inc
.get_new_pool(pool_id
, pool
)->flags
= flags
;
5020 bool OSDMonitor::update_pools_status()
5022 if (!mon
->pgservice
->is_readable())
5027 auto& pools
= osdmap
.get_pools();
5028 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
5029 const pool_stat_t
*pstat
= mon
->pgservice
->get_pool_stat(it
->first
);
5032 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
5033 const pg_pool_t
&pool
= it
->second
;
5034 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
5037 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
5038 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
5040 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
5044 mon
->clog
->info() << "pool '" << pool_name
5045 << "' no longer full; removing FULL flag";
5047 update_pool_flags(it
->first
, pool
.get_flags() & ~pg_pool_t::FLAG_FULL
);
5053 if (pool
.quota_max_bytes
> 0 &&
5054 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
5055 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5056 << " (reached quota's max_bytes: "
5057 << si_t(pool
.quota_max_bytes
) << ")";
5059 if (pool
.quota_max_objects
> 0 &&
5060 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
5061 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5062 << " (reached quota's max_objects: "
5063 << pool
.quota_max_objects
<< ")";
5065 update_pool_flags(it
->first
, pool
.get_flags() | pg_pool_t::FLAG_FULL
);
5072 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
5074 op
->mark_osdmon_event(__func__
);
5075 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
5076 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
5077 MonSession
*session
= m
->get_session();
5080 string erasure_code_profile
;
5084 return prepare_new_pool(m
->name
, m
->auid
, m
->crush_rule
, rule_name
,
5086 erasure_code_profile
,
5087 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5089 return prepare_new_pool(m
->name
, session
->auid
, m
->crush_rule
, rule_name
,
5091 erasure_code_profile
,
5092 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5095 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
5096 const string
& dstname
,
5101 // Avoid creating a pending crush if it does not already exists and
5102 // the rename would fail.
5104 if (!_have_pending_crush()) {
5105 ret
= _get_stable_crush().can_rename_bucket(srcname
,
5112 CrushWrapper newcrush
;
5113 _get_pending_crush(newcrush
);
5115 ret
= newcrush
.rename_bucket(srcname
,
5121 pending_inc
.crush
.clear();
5122 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5123 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
5127 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
5129 string replacement
= "";
5131 if (plugin
== "jerasure_generic" ||
5132 plugin
== "jerasure_sse3" ||
5133 plugin
== "jerasure_sse4" ||
5134 plugin
== "jerasure_neon") {
5135 replacement
= "jerasure";
5136 } else if (plugin
== "shec_generic" ||
5137 plugin
== "shec_sse3" ||
5138 plugin
== "shec_sse4" ||
5139 plugin
== "shec_neon") {
5140 replacement
= "shec";
5143 if (replacement
!= "") {
5144 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
5145 << plugin
<< " that has been deprecated. Please use "
5146 << replacement
<< " instead." << dendl
;
5150 int OSDMonitor::normalize_profile(const string
& profilename
,
5151 ErasureCodeProfile
&profile
,
5155 ErasureCodeInterfaceRef erasure_code
;
5156 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5157 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
5158 check_legacy_ec_plugin(plugin
->second
, profilename
);
5159 int err
= instance
.factory(plugin
->second
,
5160 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5161 profile
, &erasure_code
, ss
);
5166 err
= erasure_code
->init(profile
, ss
);
5171 auto it
= profile
.find("stripe_unit");
5172 if (it
!= profile
.end()) {
5174 uint32_t stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5175 if (!err_str
.empty()) {
5176 *ss
<< "could not parse stripe_unit '" << it
->second
5177 << "': " << err_str
<< std::endl
;
5180 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5181 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5182 if (chunk_size
!= stripe_unit
) {
5183 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
5184 << "alignment. Would be padded to " << chunk_size
5188 if ((stripe_unit
% 4096) != 0 && !force
) {
5189 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
5190 << "use --force to override this check" << std::endl
;
5197 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
5198 const string
&profile
,
5202 int ruleid
= osdmap
.crush
->get_rule_id(name
);
5203 if (ruleid
!= -ENOENT
) {
5204 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
5208 CrushWrapper newcrush
;
5209 _get_pending_crush(newcrush
);
5211 ruleid
= newcrush
.get_rule_id(name
);
5212 if (ruleid
!= -ENOENT
) {
5213 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
5216 ErasureCodeInterfaceRef erasure_code
;
5217 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
5219 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
5223 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
5224 erasure_code
.reset();
5228 pending_inc
.crush
.clear();
5229 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5234 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
5235 ErasureCodeInterfaceRef
*erasure_code
,
5238 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
5240 ErasureCodeProfile profile
=
5241 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5242 ErasureCodeProfile::const_iterator plugin
=
5243 profile
.find("plugin");
5244 if (plugin
== profile
.end()) {
5245 *ss
<< "cannot determine the erasure code plugin"
5246 << " because there is no 'plugin' entry in the erasure_code_profile "
5247 << profile
<< std::endl
;
5250 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
5251 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5252 return instance
.factory(plugin
->second
,
5253 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5254 profile
, erasure_code
, ss
);
5257 int OSDMonitor::check_cluster_features(uint64_t features
,
5260 stringstream unsupported_ss
;
5261 int unsupported_count
= 0;
5262 if ((mon
->get_quorum_con_features() & features
) != features
) {
5263 unsupported_ss
<< "the monitor cluster";
5264 ++unsupported_count
;
5267 set
<int32_t> up_osds
;
5268 osdmap
.get_up_osds(up_osds
);
5269 for (set
<int32_t>::iterator it
= up_osds
.begin();
5270 it
!= up_osds
.end(); ++it
) {
5271 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
5272 if ((xi
.features
& features
) != features
) {
5273 if (unsupported_count
> 0)
5274 unsupported_ss
<< ", ";
5275 unsupported_ss
<< "osd." << *it
;
5276 unsupported_count
++;
5280 if (unsupported_count
> 0) {
5281 ss
<< "features " << features
<< " unsupported by: "
5282 << unsupported_ss
.str();
5286 // check pending osd state, too!
5287 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
5288 pending_inc
.new_xinfo
.begin();
5289 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
5290 const osd_xinfo_t
&xi
= p
->second
;
5291 if ((xi
.features
& features
) != features
) {
5292 dout(10) << __func__
<< " pending osd." << p
->first
5293 << " features are insufficient; retry" << dendl
;
5301 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
5304 OSDMap::Incremental new_pending
= pending_inc
;
5305 ::encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
5307 newmap
.deepish_copy_from(osdmap
);
5308 newmap
.apply_incremental(new_pending
);
5311 if (newmap
.require_min_compat_client
> 0) {
5312 auto mv
= newmap
.get_min_compat_client();
5313 if (mv
> newmap
.require_min_compat_client
) {
5314 ss
<< "new crush map requires client version " << ceph_release_name(mv
)
5315 << " but require_min_compat_client is "
5316 << ceph_release_name(newmap
.require_min_compat_client
);
5323 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
5324 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
5325 stringstream features_ss
;
5326 int r
= check_cluster_features(features
, features_ss
);
5328 ss
<< "Could not change CRUSH: " << features_ss
.str();
5335 bool OSDMonitor::erasure_code_profile_in_use(
5336 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
5337 const string
&profile
,
5341 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
5344 if (p
->second
.erasure_code_profile
== profile
) {
5345 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
5350 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
5355 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
5356 map
<string
,string
> *erasure_code_profile_map
,
5359 int r
= get_json_str_map(g_conf
->osd_pool_default_erasure_code_profile
,
5361 erasure_code_profile_map
);
5364 assert((*erasure_code_profile_map
).count("plugin"));
5365 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
5366 map
<string
,string
> user_map
;
5367 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
5368 i
!= erasure_code_profile
.end();
5370 size_t equal
= i
->find('=');
5371 if (equal
== string::npos
) {
5372 user_map
[*i
] = string();
5373 (*erasure_code_profile_map
)[*i
] = string();
5375 const string key
= i
->substr(0, equal
);
5377 const string value
= i
->substr(equal
);
5378 user_map
[key
] = value
;
5379 (*erasure_code_profile_map
)[key
] = value
;
5383 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
5384 (*erasure_code_profile_map
) = user_map
;
5389 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
5390 const string
&erasure_code_profile
,
5391 unsigned *size
, unsigned *min_size
,
5395 switch (pool_type
) {
5396 case pg_pool_t::TYPE_REPLICATED
:
5397 *size
= g_conf
->osd_pool_default_size
;
5398 *min_size
= g_conf
->get_osd_pool_default_min_size();
5400 case pg_pool_t::TYPE_ERASURE
:
5402 ErasureCodeInterfaceRef erasure_code
;
5403 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5405 *size
= erasure_code
->get_chunk_count();
5406 *min_size
= MIN(erasure_code
->get_data_chunk_count() + 1, *size
);
5411 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
5418 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
5419 const string
&erasure_code_profile
,
5420 uint32_t *stripe_width
,
5424 switch (pool_type
) {
5425 case pg_pool_t::TYPE_REPLICATED
:
5428 case pg_pool_t::TYPE_ERASURE
:
5430 ErasureCodeProfile profile
=
5431 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5432 ErasureCodeInterfaceRef erasure_code
;
5433 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5436 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5437 uint32_t stripe_unit
= g_conf
->osd_pool_erasure_code_stripe_unit
;
5438 auto it
= profile
.find("stripe_unit");
5439 if (it
!= profile
.end()) {
5441 stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5442 assert(err_str
.empty());
5444 *stripe_width
= data_chunks
*
5445 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5449 *ss
<< "prepare_pool_stripe_width: "
5450 << pool_type
<< " is not a known pool type";
5457 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
5458 const string
&erasure_code_profile
,
5459 const string
&rule_name
,
5464 if (*crush_rule
< 0) {
5465 switch (pool_type
) {
5466 case pg_pool_t::TYPE_REPLICATED
:
5468 if (rule_name
== "") {
5470 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context
);
5471 if (*crush_rule
< 0) {
5472 // Errors may happen e.g. if no valid rule is available
5473 *ss
<< "No suitable CRUSH rule exists, check "
5474 << "'osd pool default crush *' config options";
5478 return get_crush_rule(rule_name
, crush_rule
, ss
);
5482 case pg_pool_t::TYPE_ERASURE
:
5484 int err
= crush_rule_create_erasure(rule_name
,
5485 erasure_code_profile
,
5489 dout(20) << "prepare_pool_crush_rule: rule "
5490 << rule_name
<< " try again" << dendl
;
5493 // need to wait for the crush rule to be proposed before proceeding
5504 *ss
<< "prepare_pool_crush_rule: " << pool_type
5505 << " is not a known pool type";
5510 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
5511 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
5519 int OSDMonitor::get_crush_rule(const string
&rule_name
,
5524 ret
= osdmap
.crush
->get_rule_id(rule_name
);
5525 if (ret
!= -ENOENT
) {
5529 CrushWrapper newcrush
;
5530 _get_pending_crush(newcrush
);
5532 ret
= newcrush
.get_rule_id(rule_name
);
5533 if (ret
!= -ENOENT
) {
5534 // found it, wait for it to be proposed
5535 dout(20) << __func__
<< ": rule " << rule_name
5536 << " try again" << dendl
;
5539 // Cannot find it , return error
5540 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
5548 * @param name The name of the new pool
5549 * @param auid The auid of the pool owner. Can be -1
5550 * @param crush_rule The crush rule to use. If <0, will use the system default
5551 * @param crush_rule_name The crush rule to use, if crush_rulset <0
5552 * @param pg_num The pg_num to use. If set to 0, will use the system default
5553 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
5554 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
5555 * @param pool_type TYPE_ERASURE, or TYPE_REP
5556 * @param expected_num_objects expected number of objects on the pool
5557 * @param fast_read fast read type.
5558 * @param ss human readable error message, if any.
5560 * @return 0 on success, negative errno on failure.
5562 int OSDMonitor::prepare_new_pool(string
& name
, uint64_t auid
,
5564 const string
&crush_rule_name
,
5565 unsigned pg_num
, unsigned pgp_num
,
5566 const string
&erasure_code_profile
,
5567 const unsigned pool_type
,
5568 const uint64_t expected_num_objects
,
5569 FastReadType fast_read
,
5572 if (name
.length() == 0)
5575 pg_num
= g_conf
->osd_pool_default_pg_num
;
5577 pgp_num
= g_conf
->osd_pool_default_pgp_num
;
5578 if (pg_num
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
5579 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
5580 << g_conf
->mon_max_pool_pg_num
5581 << " (you may adjust 'mon max pool pg num' for higher values)";
5584 if (pgp_num
> pg_num
) {
5585 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
5586 << ", which in this case is " << pg_num
;
5589 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
5590 *ss
<< "'fast_read' can only apply to erasure coding pool";
5594 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
5595 crush_rule_name
, &crush_rule
, ss
);
5597 dout(10) << " prepare_pool_crush_rule returns " << r
<< dendl
;
5600 if (g_conf
->mon_osd_crush_smoke_test
) {
5601 CrushWrapper newcrush
;
5602 _get_pending_crush(newcrush
);
5604 CrushTester
tester(newcrush
, err
);
5605 tester
.set_max_x(50);
5606 tester
.set_rule(crush_rule
);
5607 r
= tester
.test_with_fork(g_conf
->mon_lease
);
5609 dout(10) << " tester.test_with_fork returns " << r
5610 << ": " << err
.str() << dendl
;
5611 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
5615 unsigned size
, min_size
;
5616 r
= prepare_pool_size(pool_type
, erasure_code_profile
, &size
, &min_size
, ss
);
5618 dout(10) << " prepare_pool_size returns " << r
<< dendl
;
5622 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
5626 uint32_t stripe_width
= 0;
5627 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
5629 dout(10) << " prepare_pool_stripe_width returns " << r
<< dendl
;
5634 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
5635 switch (fast_read
) {
5642 case FAST_READ_DEFAULT
:
5643 fread
= g_conf
->mon_osd_pool_ec_fast_read
;
5646 *ss
<< "invalid fast_read setting: " << fast_read
;
5651 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
5652 p
!= pending_inc
.new_pool_names
.end();
5654 if (p
->second
== name
)
5658 if (-1 == pending_inc
.new_pool_max
)
5659 pending_inc
.new_pool_max
= osdmap
.pool_max
;
5660 int64_t pool
= ++pending_inc
.new_pool_max
;
5662 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
5663 pi
->type
= pool_type
;
5664 pi
->fast_read
= fread
;
5665 pi
->flags
= g_conf
->osd_pool_default_flags
;
5666 if (g_conf
->osd_pool_default_flag_hashpspool
)
5667 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
5668 if (g_conf
->osd_pool_default_flag_nodelete
)
5669 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
5670 if (g_conf
->osd_pool_default_flag_nopgchange
)
5671 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
5672 if (g_conf
->osd_pool_default_flag_nosizechange
)
5673 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
5674 if (g_conf
->osd_pool_use_gmt_hitset
&&
5675 (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
))
5676 pi
->use_gmt_hitset
= true;
5678 pi
->use_gmt_hitset
= false;
5681 pi
->min_size
= min_size
;
5682 pi
->crush_rule
= crush_rule
;
5683 pi
->expected_num_objects
= expected_num_objects
;
5684 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
5685 pi
->set_pg_num(pg_num
);
5686 pi
->set_pgp_num(pgp_num
);
5687 pi
->last_change
= pending_inc
.epoch
;
5689 pi
->erasure_code_profile
= erasure_code_profile
;
5690 pi
->stripe_width
= stripe_width
;
5691 pi
->cache_target_dirty_ratio_micro
=
5692 g_conf
->osd_pool_default_cache_target_dirty_ratio
* 1000000;
5693 pi
->cache_target_dirty_high_ratio_micro
=
5694 g_conf
->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
5695 pi
->cache_target_full_ratio_micro
=
5696 g_conf
->osd_pool_default_cache_target_full_ratio
* 1000000;
5697 pi
->cache_min_flush_age
= g_conf
->osd_pool_default_cache_min_flush_age
;
5698 pi
->cache_min_evict_age
= g_conf
->osd_pool_default_cache_min_evict_age
;
5699 pending_inc
.new_pool_names
[pool
] = name
;
5703 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
5705 op
->mark_osdmon_event(__func__
);
5707 if (pending_inc
.new_flags
< 0)
5708 pending_inc
.new_flags
= osdmap
.get_flags();
5709 pending_inc
.new_flags
|= flag
;
5710 ss
<< OSDMap::get_flag_string(flag
) << " is set";
5711 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
5712 get_last_committed() + 1));
5716 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
5718 op
->mark_osdmon_event(__func__
);
5720 if (pending_inc
.new_flags
< 0)
5721 pending_inc
.new_flags
= osdmap
.get_flags();
5722 pending_inc
.new_flags
&= ~flag
;
5723 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
5724 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
5725 get_last_committed() + 1));
5729 int OSDMonitor::prepare_command_pool_set(map
<string
,cmd_vartype
> &cmdmap
,
5733 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
5734 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5736 ss
<< "unrecognized pool '" << poolstr
<< "'";
5740 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
5742 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
5743 if (pending_inc
.new_pools
.count(pool
))
5744 p
= pending_inc
.new_pools
[pool
];
5746 // accept val as a json string in the normal case (current
5747 // generation monitor). parse out int or float values from the
5748 // string as needed. however, if it is not a string, try to pull
5749 // out an int, in case an older monitor with an older json schema is
5750 // forwarding a request.
5752 string interr
, floaterr
;
5755 int64_t uf
= 0; // micro-f
5756 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", val
)) {
5757 // wasn't a string; maybe an older mon forwarded json with an int?
5758 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", n
))
5759 return -EINVAL
; // no value!
5761 // we got a string. see if it contains an int.
5762 n
= strict_strtoll(val
.c_str(), 10, &interr
);
5764 f
= strict_strtod(val
.c_str(), &floaterr
);
5765 uf
= llrintl(f
* (double)1000000.0);
5769 (var
== "hit_set_type" || var
== "hit_set_period" ||
5770 var
== "hit_set_count" || var
== "hit_set_fpp" ||
5771 var
== "target_max_objects" || var
== "target_max_bytes" ||
5772 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
5773 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
5774 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
5775 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
5776 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
5780 if (var
== "size") {
5781 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
5782 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
5785 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
5786 ss
<< "can not change the size of an erasure-coded pool";
5789 if (interr
.length()) {
5790 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5793 if (n
<= 0 || n
> 10) {
5794 ss
<< "pool size must be between 1 and 10";
5800 } else if (var
== "min_size") {
5801 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
5802 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
5805 if (interr
.length()) {
5806 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5810 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
5811 if (n
< 1 || n
> p
.size
) {
5812 ss
<< "pool min_size must be between 1 and " << (int)p
.size
;
5816 ErasureCodeInterfaceRef erasure_code
;
5819 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
5821 k
= erasure_code
->get_data_chunk_count();
5823 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.rdbuf();
5827 if (n
< k
|| n
> p
.size
) {
5828 ss
<< "pool min_size must be between " << k
<< " and " << (int)p
.size
;
5833 } else if (var
== "auid") {
5834 if (interr
.length()) {
5835 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5839 } else if (var
== "crash_replay_interval") {
5840 if (interr
.length()) {
5841 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5844 p
.crash_replay_interval
= n
;
5845 } else if (var
== "pg_num") {
5846 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
5847 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
5850 if (interr
.length()) {
5851 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5854 if (n
<= (int)p
.get_pg_num()) {
5855 ss
<< "specified pg_num " << n
<< " <= current " << p
.get_pg_num();
5856 if (n
< (int)p
.get_pg_num())
5861 cmd_getval(g_ceph_context
,cmdmap
, "force", force
);
5862 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&&
5863 force
!= "--yes-i-really-mean-it") {
5864 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
5867 int expected_osds
= MIN(p
.get_pg_num(), osdmap
.get_num_osds());
5868 int64_t new_pgs
= n
- p
.get_pg_num();
5869 if (new_pgs
> g_conf
->mon_osd_max_split_count
* expected_osds
) {
5870 ss
<< "specified pg_num " << n
<< " is too large (creating "
5871 << new_pgs
<< " new PGs on ~" << expected_osds
5872 << " OSDs exceeds per-OSD max of " << g_conf
->mon_osd_max_split_count
5877 // force pre-luminous clients to resend their ops, since they
5878 // don't understand that split PGs now form a new interval.
5879 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
5880 } else if (var
== "pgp_num") {
5881 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
5882 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
5885 if (interr
.length()) {
5886 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5890 ss
<< "specified pgp_num must > 0, but you set to " << n
;
5893 if (n
> (int)p
.get_pg_num()) {
5894 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
5898 } else if (var
== "crush_rule") {
5899 int id
= osdmap
.crush
->get_rule_id(val
);
5900 if (id
== -ENOENT
) {
5901 ss
<< "crush rule " << val
<< " does not exist";
5905 ss
<< cpp_strerror(id
);
5908 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
5912 } else if (var
== "nodelete" || var
== "nopgchange" ||
5913 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
5914 var
== "noscrub" || var
== "nodeep-scrub") {
5915 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
5916 // make sure we only compare against 'n' if we didn't receive a string
5917 if (val
== "true" || (interr
.empty() && n
== 1)) {
5919 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
5922 ss
<< "expecting value 'true', 'false', '0', or '1'";
5925 } else if (var
== "hashpspool") {
5926 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
5928 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
5929 if (force
!= "--yes-i-really-mean-it") {
5930 ss
<< "are you SURE? this will remap all placement groups in this pool,"
5931 " this triggers large data movement,"
5932 " pass --yes-i-really-mean-it if you really do.";
5935 // make sure we only compare against 'n' if we didn't receive a string
5936 if (val
== "true" || (interr
.empty() && n
== 1)) {
5938 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
5941 ss
<< "expecting value 'true', 'false', '0', or '1'";
5944 } else if (var
== "hit_set_type") {
5946 p
.hit_set_params
= HitSet::Params();
5948 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
5951 if (val
== "bloom") {
5952 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
5953 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
5954 p
.hit_set_params
= HitSet::Params(bsp
);
5955 } else if (val
== "explicit_hash")
5956 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
5957 else if (val
== "explicit_object")
5958 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
5960 ss
<< "unrecognized hit_set type '" << val
<< "'";
5964 } else if (var
== "hit_set_period") {
5965 if (interr
.length()) {
5966 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5969 p
.hit_set_period
= n
;
5970 } else if (var
== "hit_set_count") {
5971 if (interr
.length()) {
5972 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5975 p
.hit_set_count
= n
;
5976 } else if (var
== "hit_set_fpp") {
5977 if (floaterr
.length()) {
5978 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
5981 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
5982 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
5985 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
5987 } else if (var
== "use_gmt_hitset") {
5988 if (val
== "true" || (interr
.empty() && n
== 1)) {
5989 if (!(osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
)) {
5990 ss
<< "not all OSDs support GMT hit set.";
5993 p
.use_gmt_hitset
= true;
5995 ss
<< "expecting value 'true' or '1'";
5998 } else if (var
== "allow_ec_overwrites") {
5999 if (!p
.is_erasure()) {
6000 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
6004 if (!g_conf
->mon_debug_no_require_bluestore_for_ec_overwrites
&&
6005 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
6006 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
6009 if (val
== "true" || (interr
.empty() && n
== 1)) {
6010 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
6011 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6012 ss
<< "ec overwrites cannot be disabled once enabled";
6015 ss
<< "expecting value 'true', 'false', '0', or '1'";
6018 } else if (var
== "target_max_objects") {
6019 if (interr
.length()) {
6020 ss
<< "error parsing int '" << val
<< "': " << interr
;
6023 p
.target_max_objects
= n
;
6024 } else if (var
== "target_max_bytes") {
6025 if (interr
.length()) {
6026 ss
<< "error parsing int '" << val
<< "': " << interr
;
6029 p
.target_max_bytes
= n
;
6030 } else if (var
== "cache_target_dirty_ratio") {
6031 if (floaterr
.length()) {
6032 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6035 if (f
< 0 || f
> 1.0) {
6036 ss
<< "value must be in the range 0..1";
6039 p
.cache_target_dirty_ratio_micro
= uf
;
6040 } else if (var
== "cache_target_dirty_high_ratio") {
6041 if (floaterr
.length()) {
6042 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6045 if (f
< 0 || f
> 1.0) {
6046 ss
<< "value must be in the range 0..1";
6049 p
.cache_target_dirty_high_ratio_micro
= uf
;
6050 } else if (var
== "cache_target_full_ratio") {
6051 if (floaterr
.length()) {
6052 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6055 if (f
< 0 || f
> 1.0) {
6056 ss
<< "value must be in the range 0..1";
6059 p
.cache_target_full_ratio_micro
= uf
;
6060 } else if (var
== "cache_min_flush_age") {
6061 if (interr
.length()) {
6062 ss
<< "error parsing int '" << val
<< "': " << interr
;
6065 p
.cache_min_flush_age
= n
;
6066 } else if (var
== "cache_min_evict_age") {
6067 if (interr
.length()) {
6068 ss
<< "error parsing int '" << val
<< "': " << interr
;
6071 p
.cache_min_evict_age
= n
;
6072 } else if (var
== "min_read_recency_for_promote") {
6073 if (interr
.length()) {
6074 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6077 p
.min_read_recency_for_promote
= n
;
6078 } else if (var
== "hit_set_grade_decay_rate") {
6079 if (interr
.length()) {
6080 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6083 if (n
> 100 || n
< 0) {
6084 ss
<< "value out of range,valid range is 0 - 100";
6087 p
.hit_set_grade_decay_rate
= n
;
6088 } else if (var
== "hit_set_search_last_n") {
6089 if (interr
.length()) {
6090 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6093 if (n
> p
.hit_set_count
|| n
< 0) {
6094 ss
<< "value out of range,valid range is 0 - hit_set_count";
6097 p
.hit_set_search_last_n
= n
;
6098 } else if (var
== "min_write_recency_for_promote") {
6099 if (interr
.length()) {
6100 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6103 p
.min_write_recency_for_promote
= n
;
6104 } else if (var
== "fast_read") {
6105 if (p
.is_replicated()) {
6106 ss
<< "fast read is not supported in replication pool";
6109 if (val
== "true" || (interr
.empty() && n
== 1)) {
6111 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6112 p
.fast_read
= false;
6114 ss
<< "expecting value 'true', 'false', '0', or '1'";
6117 } else if (pool_opts_t::is_opt_name(var
)) {
6118 bool unset
= val
== "unset";
6119 if (var
== "compression_mode") {
6121 auto cmode
= Compressor::get_comp_mode_type(val
);
6123 ss
<< "unrecognized compression mode '" << val
<< "'";
6127 } else if (var
== "compression_algorithm") {
6129 auto alg
= Compressor::get_comp_alg_type(val
);
6131 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
6135 } else if (var
== "compression_required_ratio") {
6136 if (floaterr
.length()) {
6137 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
6140 if (f
< 0 || f
> 1) {
6141 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
6144 } else if (var
== "csum_type") {
6145 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
6147 ss
<< "unrecognized csum_type '" << val
<< "'";
6150 //preserve csum_type numeric value
6153 } else if (var
== "compression_max_blob_size" ||
6154 var
== "compression_min_blob_size" ||
6155 var
== "csum_max_block" ||
6156 var
== "csum_min_block") {
6157 if (interr
.length()) {
6158 ss
<< "error parsing int value '" << val
<< "': " << interr
;
6163 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
6164 switch (desc
.type
) {
6165 case pool_opts_t::STR
:
6167 p
.opts
.unset(desc
.key
);
6169 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
6172 case pool_opts_t::INT
:
6173 if (interr
.length()) {
6174 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6178 p
.opts
.unset(desc
.key
);
6180 p
.opts
.set(desc
.key
, static_cast<int>(n
));
6183 case pool_opts_t::DOUBLE
:
6184 if (floaterr
.length()) {
6185 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6189 p
.opts
.unset(desc
.key
);
6191 p
.opts
.set(desc
.key
, static_cast<double>(f
));
6195 assert(!"unknown type");
6198 ss
<< "unrecognized variable '" << var
<< "'";
6201 if (val
!= "unset") {
6202 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
6204 ss
<< "unset pool " << pool
<< " " << var
;
6206 p
.last_change
= pending_inc
.epoch
;
6207 pending_inc
.new_pools
[pool
] = p
;
6211 int OSDMonitor::_prepare_command_osd_crush_remove(
6212 CrushWrapper
&newcrush
,
6221 err
= newcrush
.remove_item_under(g_ceph_context
, id
, ancestor
,
6224 err
= newcrush
.remove_item(g_ceph_context
, id
, unlink_only
);
6229 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
6231 pending_inc
.crush
.clear();
6232 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6235 int OSDMonitor::prepare_command_osd_crush_remove(
6236 CrushWrapper
&newcrush
,
6242 int err
= _prepare_command_osd_crush_remove(
6243 newcrush
, id
, ancestor
,
6244 has_ancestor
, unlink_only
);
6250 do_osd_crush_remove(newcrush
);
6255 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
6257 if (osdmap
.is_up(id
)) {
6261 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
6262 pending_inc
.new_uuid
[id
] = uuid_d();
6263 pending_metadata_rm
.insert(id
);
6264 pending_metadata
.erase(id
);
6269 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
6271 assert(existing_id
);
6274 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
6275 if (!osdmap
.exists(i
) &&
6276 pending_inc
.new_up_client
.count(i
) == 0 &&
6277 (pending_inc
.new_state
.count(i
) == 0 ||
6278 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
6284 if (pending_inc
.new_max_osd
< 0) {
6285 return osdmap
.get_max_osd();
6287 return pending_inc
.new_max_osd
;
6290 void OSDMonitor::do_osd_create(
6295 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
6298 // We presume validation has been performed prior to calling this
6299 // function. We assert with prejudice.
6301 int32_t allocated_id
= -1; // declare here so we can jump
6302 int32_t existing_id
= -1;
6303 if (!uuid
.is_zero()) {
6304 existing_id
= osdmap
.identify_osd(uuid
);
6305 if (existing_id
>= 0) {
6306 assert(id
< 0 || id
== existing_id
);
6307 *new_id
= existing_id
;
6309 } else if (id
>= 0) {
6310 // uuid does not exist, and id has been provided, so just create
6317 // allocate a new id
6318 allocated_id
= _allocate_osd_id(&existing_id
);
6319 dout(10) << __func__
<< " allocated id " << allocated_id
6320 << " existing id " << existing_id
<< dendl
;
6321 if (existing_id
>= 0) {
6322 assert(existing_id
< osdmap
.get_max_osd());
6323 assert(allocated_id
< 0);
6324 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
6325 *new_id
= existing_id
;
6327 } else if (allocated_id
>= 0) {
6328 assert(existing_id
< 0);
6330 if (pending_inc
.new_max_osd
< 0) {
6331 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
6333 ++pending_inc
.new_max_osd
;
6335 *new_id
= pending_inc
.new_max_osd
- 1;
6336 assert(*new_id
== allocated_id
);
6338 assert(0 == "unexpected condition");
6342 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
6343 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
6344 pending_inc
.new_max_osd
= *new_id
+ 1;
6347 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
6348 if (!uuid
.is_zero())
6349 pending_inc
.new_uuid
[*new_id
] = uuid
;
6352 int OSDMonitor::validate_osd_create(
6355 const bool check_osd_exists
,
6356 int32_t* existing_id
,
6360 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
6361 << " check_osd_exists " << check_osd_exists
<< dendl
;
6363 assert(existing_id
);
6365 if (id
< 0 && uuid
.is_zero()) {
6366 // we have nothing to validate
6369 } else if (uuid
.is_zero()) {
6370 // we have an id but we will ignore it - because that's what
6371 // `osd create` does.
6376 * This function will be used to validate whether we are able to
6377 * create a new osd when the `uuid` is specified.
6379 * It will be used by both `osd create` and `osd new`, as the checks
6380 * are basically the same when it pertains to osd id and uuid validation.
6381 * However, `osd create` presumes an `uuid` is optional, for legacy
6382 * reasons, while `osd new` requires the `uuid` to be provided. This
6383 * means that `osd create` will not be idempotent if an `uuid` is not
6384 * provided, but we will always guarantee the idempotency of `osd new`.
6387 assert(!uuid
.is_zero());
6388 if (pending_inc
.identify_osd(uuid
) >= 0) {
6389 // osd is about to exist
6393 int32_t i
= osdmap
.identify_osd(uuid
);
6395 // osd already exists
6396 if (id
>= 0 && i
!= id
) {
6397 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
6400 // return a positive errno to distinguish between a blocking error
6401 // and an error we consider to not be a problem (i.e., this would be
6402 // an idempotent operation).
6408 if (pending_inc
.new_state
.count(id
)) {
6409 // osd is about to exist
6412 // we may not care if an osd exists if we are recreating a previously
6414 if (check_osd_exists
&& osdmap
.exists(id
)) {
6415 ss
<< "id " << id
<< " already in use and does not match uuid "
6423 int OSDMonitor::prepare_command_osd_create(
6426 int32_t* existing_id
,
6429 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
6430 assert(existing_id
);
6432 if (uuid
.is_zero()) {
6433 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
6436 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
6439 int OSDMonitor::prepare_command_osd_new(
6441 const map
<string
,cmd_vartype
>& cmdmap
,
6442 const map
<string
,string
>& secrets
,
6450 assert(paxos
->is_plugged());
6452 dout(10) << __func__
<< " " << op
<< dendl
;
6454 /* validate command. abort now if something's wrong. */
6456 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
6458 * If `id` is not specified, we will identify any existing osd based
6459 * on `uuid`. Operation will be idempotent iff secrets match.
6461 * If `id` is specified, we will identify any existing osd based on
6462 * `uuid` and match against `id`. If they match, operation will be
6463 * idempotent iff secrets match.
6465 * `-i secrets.json` will be optional. If supplied, will be used
6466 * to check for idempotency when `id` and `uuid` match.
6468 * If `id` is not specified, and `uuid` does not exist, an id will
6469 * be found or allocated for the osd.
6471 * If `id` is specified, and the osd has been previously marked
6472 * as destroyed, then the `id` will be reused.
6474 if (!cmd_getval(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
6475 ss
<< "requires the OSD's UUID to be specified.";
6477 } else if (!uuid
.parse(uuidstr
.c_str())) {
6478 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
6482 if (cmd_getval(g_ceph_context
, cmdmap
, "id", id
) &&
6484 ss
<< "invalid OSD id; must be greater or equal than zero.";
6488 // are we running an `osd create`-like command, or recreating
6489 // a previously destroyed osd?
6491 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
6493 // we will care about `id` to assess whether osd is `destroyed`, or
6494 // to create a new osd.
6495 // we will need an `id` by the time we reach auth.
6497 int32_t existing_id
= -1;
6498 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
6501 bool may_be_idempotent
= false;
6502 if (err
== EEXIST
) {
6503 // this is idempotent from the osdmon's point-of-view
6504 may_be_idempotent
= true;
6505 assert(existing_id
>= 0);
6507 } else if (err
< 0) {
6511 if (!may_be_idempotent
) {
6512 // idempotency is out of the window. We are either creating a new
6513 // osd or recreating a destroyed osd.
6515 // We now need to figure out if we have an `id` (and if it's valid),
6516 // of find an `id` if we don't have one.
6518 // NOTE: we need to consider the case where the `id` is specified for
6519 // `osd create`, and we must honor it. So this means checking if
6520 // the `id` is destroyed, and if so assume the destroy; otherwise,
6521 // check if it `exists` - in which case we complain about not being
6522 // `destroyed`. In the end, if nothing fails, we must allow the
6523 // creation, so that we are compatible with `create`.
6524 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
6525 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
6526 ss
<< "OSD " << id
<< " has not yet been destroyed";
6528 } else if (id
< 0) {
6530 id
= _allocate_osd_id(&existing_id
);
6532 assert(existing_id
>= 0);
6535 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
6536 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
6537 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
6539 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
6543 assert(osdmap
.exists(id
));
6546 // we are now able to either create a brand new osd or reuse an existing
6547 // osd that has been previously destroyed.
6549 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
6551 if (may_be_idempotent
&& secrets
.empty()) {
6552 // nothing to do, really.
6553 dout(10) << __func__
<< " idempotent and no secrets -- no op." << dendl
;
6556 f
->open_object_section("created_osd");
6557 f
->dump_int("osdid", id
);
6565 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
6566 bool has_lockbox
= false;
6567 bool has_secrets
= (!secrets
.empty());
6569 ConfigKeyService
*svc
= nullptr;
6570 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
6573 if (secrets
.count("cephx_secret") == 0) {
6574 ss
<< "requires a cephx secret.";
6577 cephx_secret
= secrets
.at("cephx_secret");
6579 bool has_lockbox_secret
= (secrets
.count("cephx_lockbox_secret") > 0);
6580 bool has_dmcrypt_key
= (secrets
.count("dmcrypt_key") > 0);
6582 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
6583 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
6585 if (has_lockbox_secret
&& has_dmcrypt_key
) {
6587 lockbox_secret
= secrets
.at("cephx_lockbox_secret");
6588 dmcrypt_key
= secrets
.at("dmcrypt_key");
6589 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
6590 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
6594 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
6596 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
6604 } else if (may_be_idempotent
&& err
!= EEXIST
) {
6605 // for this to be idempotent, `id` should already be >= 0; no need
6606 // to use validate_id.
6608 ss
<< "osd." << id
<< " exists but secrets do not match";
6613 svc
= (ConfigKeyService
*)mon
->config_key_service
;
6614 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
6617 } else if (may_be_idempotent
&& err
!= EEXIST
) {
6619 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
6624 assert(!has_secrets
|| !cephx_secret
.empty());
6625 assert(!has_lockbox
|| !lockbox_secret
.empty());
6627 if (may_be_idempotent
) {
6628 // we have nothing to do for either the osdmon or the authmon,
6629 // and we have no lockbox - so the config key service will not be
6630 // touched. This is therefore an idempotent operation, and we can
6631 // just return right away.
6632 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
6635 f
->open_object_section("created_osd");
6636 f
->dump_int("osdid", id
);
6643 assert(!may_be_idempotent
);
6647 assert(!cephx_secret
.empty());
6648 assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
6649 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
6651 err
= mon
->authmon()->do_osd_new(cephx_entity
,
6657 assert(nullptr != svc
);
6658 svc
->do_osd_new(uuid
, dmcrypt_key
);
6662 if (is_recreate_destroyed
) {
6664 assert(osdmap
.is_destroyed(id
));
6665 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
6666 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
| CEPH_OSD_NEW
;
6667 pending_inc
.new_uuid
[id
] = uuid
;
6670 int32_t new_id
= -1;
6671 do_osd_create(id
, uuid
, &new_id
);
6672 assert(new_id
>= 0);
6673 assert(id
== new_id
);
6677 f
->open_object_section("created_osd");
6678 f
->dump_int("osdid", id
);
6687 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
6689 op
->mark_osdmon_event(__func__
);
6690 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
6692 map
<string
, cmd_vartype
> cmdmap
;
6693 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
6694 string rs
= ss
.str();
6695 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
6699 MonSession
*session
= m
->get_session();
6701 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
6705 return prepare_command_impl(op
, cmdmap
);
6708 static int parse_reweights(CephContext
*cct
,
6709 const map
<string
,cmd_vartype
> &cmdmap
,
6710 const OSDMap
& osdmap
,
6711 map
<int32_t, uint32_t>* weights
)
6714 if (!cmd_getval(g_ceph_context
, cmdmap
, "weights", weights_str
)) {
6717 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
6718 json_spirit::mValue json_value
;
6719 if (!json_spirit::read(weights_str
, json_value
)) {
6722 if (json_value
.type() != json_spirit::obj_type
) {
6725 const auto obj
= json_value
.get_obj();
6727 for (auto& osd_weight
: obj
) {
6728 auto osd_id
= std::stoi(osd_weight
.first
);
6729 if (!osdmap
.exists(osd_id
)) {
6732 if (osd_weight
.second
.type() != json_spirit::str_type
) {
6735 auto weight
= std::stoul(osd_weight
.second
.get_str());
6736 weights
->insert({osd_id
, weight
});
6738 } catch (const std::logic_error
& e
) {
6744 int OSDMonitor::prepare_command_osd_destroy(
6748 assert(paxos
->is_plugged());
6750 // we check if the osd exists for the benefit of `osd purge`, which may
6751 // have previously removed the osd. If the osd does not exist, return
6752 // -ENOENT to convey this, and let the caller deal with it.
6754 // we presume that all auth secrets and config keys were removed prior
6755 // to this command being called. if they exist by now, we also assume
6756 // they must have been created by some other command and do not pertain
6757 // to this non-existent osd.
6758 if (!osdmap
.exists(id
)) {
6759 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
6763 uuid_d uuid
= osdmap
.get_uuid(id
);
6764 dout(10) << __func__
<< " destroying osd." << id
6765 << " uuid " << uuid
<< dendl
;
6767 // if it has been destroyed, we assume our work here is done.
6768 if (osdmap
.is_destroyed(id
)) {
6769 ss
<< "destroyed osd." << id
;
6773 EntityName cephx_entity
, lockbox_entity
;
6774 bool idempotent_auth
= false, idempotent_cks
= false;
6776 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
6781 if (err
== -ENOENT
) {
6782 idempotent_auth
= true;
6788 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
6789 err
= svc
->validate_osd_destroy(id
, uuid
);
6791 assert(err
== -ENOENT
);
6793 idempotent_cks
= true;
6796 if (!idempotent_auth
) {
6797 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
6801 if (!idempotent_cks
) {
6802 svc
->do_osd_destroy(id
, uuid
);
6805 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
6806 pending_inc
.new_uuid
[id
] = uuid_d();
6808 // we can only propose_pending() once per service, otherwise we'll be
6809 // defying PaxosService and all laws of nature. Therefore, as we may
6810 // be used during 'osd purge', let's keep the caller responsible for
// NOTE(review): this view of prepare_command_osd_purge is a lossy extraction:
// the parameter list (original lines 6817-6819), several early-return and
// closing-brace lines, and the /* */ delimiters of the block comment below
// are missing here. All code tokens are left byte-identical; comments only.
6816 int OSDMonitor::prepare_command_osd_purge(
// Purge must run while paxos is plugged (batched with other updates);
// presumably the caller is responsible for propose_pending() — TODO confirm.
6820 assert(paxos
->is_plugged());
6821 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
// Precondition: the osd must already be down; the later remove step relies
// on this (see the "we should not be busy" note below).
6823 assert(!osdmap
.is_up(id
));
6826 * This may look a bit weird, but this is what's going to happen:
6828 * 1. we make sure that removing from crush works
6829 * 2. we call `prepare_command_osd_destroy()`. If it returns an
6830 * error, then we abort the whole operation, as no updates
6831 * have been made. However, this function will have
6832 * side-effects, thus we need to make sure that all operations
6833 * performed henceforth will *always* succeed.
6834 * 3. we call `prepare_command_osd_remove()`. Although this
6835 * function can return an error, it currently only checks if the
6836 * osd is up - and we have made sure that it is not so, so there
6837 * is no conflict, and it is effectively an update.
6838 * 4. finally, we call `do_osd_crush_remove()`, which will perform
6839 * the crush update we delayed from before.
// Work on a private copy of the pending crush map; the actual crush removal
// is deferred until every fallible step above it has succeeded.
6842 CrushWrapper newcrush
;
6843 _get_pending_crush(newcrush
);
6845 bool may_be_idempotent
= false;
// Step 1: validate the crush removal on the copy. -ENOENT means the item
// was already gone, which may make the whole purge a no-op (idempotent).
6847 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
6848 if (err
== -ENOENT
) {
6850 may_be_idempotent
= true;
6851 } else if (err
< 0) {
6852 ss
<< "error removing osd." << id
<< " from crush";
6856 // no point destroying the osd again if it has already been marked destroyed
6857 if (!osdmap
.is_destroyed(id
)) {
// Step 2: destroy the osd (auth/config-key cleanup and DESTROYED state).
6858 err
= prepare_command_osd_destroy(id
, ss
);
6860 if (err
== -ENOENT
) {
// Destroy performed real work, so the purge as a whole is not idempotent.
6866 may_be_idempotent
= false;
// If nothing above changed anything and the osd does not exist, the purge is
// a no-op. NOTE(review): the early return for this case (originals 6874-6876)
// is missing from this view.
6871 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
6872 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
6873 << "we are idempotent." << dendl
;
// Step 3: remove the osd from the osdmap.
6877 err
= prepare_command_osd_remove(id
);
6878 // we should not be busy, as we should have made sure this id is not up.
// Step 4: apply the crush removal that was prepared (and validated) in
// step 1 against our private copy.
6881 do_osd_crush_remove(newcrush
);
6885 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
6886 map
<string
,cmd_vartype
> &cmdmap
)
6888 op
->mark_osdmon_event(__func__
);
6889 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
6897 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
6898 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6901 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
6905 bool osdid_present
= cmd_getval(g_ceph_context
, cmdmap
, "id", osdid
);
6906 if (osdid_present
) {
6908 oss
<< "osd." << osdid
;
6912 // Even if there's a pending state with changes that could affect
6913 // a command, considering that said state isn't yet committed, we
6914 // just don't care about those changes if the command currently being
6915 // handled acts as a no-op against the current committed state.
6916 // In a nutshell, we assume this command happens *before*.
6918 // Let me make this clearer:
6920 // - If we have only one client, and that client issues some
6921 // operation that would conflict with this operation but is
6922 // still on the pending state, then we would be sure that said
6923 // operation wouldn't have returned yet, so the client wouldn't
6924 // issue this operation (unless the client didn't wait for the
6925 // operation to finish, and that would be the client's own fault).
6927 // - If we have more than one client, each client will observe
6928 // whatever is the state at the moment of the commit. So, if we
6929 // have two clients, one issuing an unlink and another issuing a
6930 // link, and if the link happens while the unlink is still on the
6931 // pending state, from the link's point-of-view this is a no-op.
6932 // If different clients are issuing conflicting operations and
6933 // they care about that, then the clients should make sure they
6934 // enforce some kind of concurrency mechanism -- from our
6935 // perspective that's what Douglas Adams would call an SEP.
6937 // This should be used as a general guideline for most commands handled
6938 // in this function. Adapt as you see fit, but please bear in mind that
6939 // this is the expected behavior.
6942 if (prefix
== "osd setcrushmap" ||
6943 (prefix
== "osd crush set" && !osdid_present
)) {
6944 if (pending_inc
.crush
.length()) {
6945 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
6946 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
6949 dout(10) << "prepare_command setting new crush map" << dendl
;
6950 bufferlist
data(m
->get_data());
6953 bufferlist::iterator
bl(data
.begin());
6956 catch (const std::exception
&e
) {
6958 ss
<< "Failed to parse crushmap: " << e
.what();
6962 int64_t prior_version
= 0;
6963 if (cmd_getval(g_ceph_context
, cmdmap
, "prior_version", prior_version
)) {
6964 if (prior_version
== osdmap
.get_crush_version() - 1) {
6965 // see if we are a resend of the last update. this is imperfect
6966 // (multiple racing updaters may not both get reliable success)
6967 // but we expect crush updaters (via this interface) to be rare-ish.
6968 bufferlist current
, proposed
;
6969 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
6970 crush
.encode(proposed
, mon
->get_quorum_con_features());
6971 if (current
.contents_equal(proposed
)) {
6972 dout(10) << __func__
6973 << " proposed matches current and version equals previous"
6976 ss
<< osdmap
.get_crush_version();
6980 if (prior_version
!= osdmap
.get_crush_version()) {
6982 ss
<< "prior_version " << prior_version
<< " != crush version "
6983 << osdmap
.get_crush_version();
6988 if (crush
.has_legacy_rulesets()) {
6990 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
6993 if (!validate_crush_against_features(&crush
, ss
)) {
6998 const auto& osdmap_pools
= osdmap
.get_pools();
6999 for (auto pit
= osdmap_pools
.begin(); pit
!= osdmap_pools
.end(); ++pit
) {
7000 const int64_t pool_id
= pit
->first
;
7001 const pg_pool_t
&pool
= pit
->second
;
7002 int ruleno
= pool
.get_crush_rule();
7003 if (!crush
.rule_exists(ruleno
)) {
7004 ss
<< " the crush rule no "<< ruleno
<< " for pool id " << pool_id
<< " is in use";
7010 if (g_conf
->mon_osd_crush_smoke_test
) {
7011 // sanity check: test some inputs to make sure this map isn't
7013 dout(10) << " testing map" << dendl
;
7015 CrushTester
tester(crush
, ess
);
7016 tester
.set_max_x(50);
7017 int r
= tester
.test_with_fork(g_conf
->mon_lease
);
7019 dout(10) << " tester.test_with_fork returns " << r
7020 << ": " << ess
.str() << dendl
;
7021 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
7025 dout(10) << " crush test result " << ess
.str() << dendl
;
7028 pending_inc
.crush
= data
;
7029 ss
<< osdmap
.get_crush_version() + 1;
7032 } else if (prefix
== "osd crush set-device-class") {
7033 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7034 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7035 << "luminous' before using crush device classes";
7040 string device_class
;
7041 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
7042 err
= -EINVAL
; // no value!
7047 vector
<string
> idvec
;
7048 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
7049 CrushWrapper newcrush
;
7050 _get_pending_crush(newcrush
);
7052 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
7056 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
7057 osdmap
.get_all_osds(osds
);
7060 // try traditional single osd way
7061 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
7063 // ss has reason for failure
7064 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
7071 for (auto &osd
: osds
) {
7072 if (!osdmap
.exists(osd
)) {
7073 ss
<< "osd." << osd
<< " does not exist. ";
7078 oss
<< "osd." << osd
;
7079 string name
= oss
.str();
7082 if (newcrush
.item_exists(osd
)) {
7083 action
= "updating";
7085 action
= "creating";
7086 newcrush
.set_item_name(osd
, name
);
7089 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
7090 << "' device_class '" << device_class
<< "'"
7092 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
7096 if (err
== 0 && !_have_pending_crush()) {
7098 // for single osd only, wildcard makes too much noise
7099 ss
<< "set-device-class item id " << osd
<< " name '" << name
7100 << "' device_class '" << device_class
<< "': no change";
7103 updated
.insert(osd
);
7108 if (!updated
.empty()) {
7109 pending_inc
.crush
.clear();
7110 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7111 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
7113 wait_for_finished_proposal(op
,
7114 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
7118 } else if (prefix
== "osd crush add-bucket") {
7119 // osd crush add-bucket <name> <type>
7120 string name
, typestr
;
7121 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7122 cmd_getval(g_ceph_context
, cmdmap
, "type", typestr
);
7124 if (!_have_pending_crush() &&
7125 _get_stable_crush().name_exists(name
)) {
7126 ss
<< "bucket '" << name
<< "' already exists";
7130 CrushWrapper newcrush
;
7131 _get_pending_crush(newcrush
);
7133 if (newcrush
.name_exists(name
)) {
7134 ss
<< "bucket '" << name
<< "' already exists";
7137 int type
= newcrush
.get_type_id(typestr
);
7139 ss
<< "type '" << typestr
<< "' does not exist";
7144 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
7149 err
= newcrush
.add_bucket(0, 0,
7150 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
7153 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
7156 err
= newcrush
.set_item_name(bucketno
, name
);
7158 ss
<< "error setting bucket name to '" << name
<< "'";
7162 pending_inc
.crush
.clear();
7163 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7164 ss
<< "added bucket " << name
<< " type " << typestr
7167 } else if (prefix
== "osd crush rename-bucket") {
7168 string srcname
, dstname
;
7169 cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
);
7170 cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
);
7172 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
7173 if (err
== -EALREADY
) // equivalent to success for idempotency
7179 } else if (prefix
== "osd crush class create") {
7180 string device_class
;
7181 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
7182 err
= -EINVAL
; // no value!
7185 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7186 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7187 << "luminous' before using crush device classes";
7191 if (!_have_pending_crush() &&
7192 _get_stable_crush().class_exists(device_class
)) {
7193 ss
<< "class '" << device_class
<< "' already exists";
7197 CrushWrapper newcrush
;
7198 _get_pending_crush(newcrush
);
7200 if (newcrush
.class_exists(name
)) {
7201 ss
<< "class '" << device_class
<< "' already exists";
7205 int class_id
= newcrush
.get_or_create_class_id(device_class
);
7207 pending_inc
.crush
.clear();
7208 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7209 ss
<< "created class " << device_class
<< " with id " << class_id
7213 } else if (prefix
== "osd crush class rm") {
7214 string device_class
;
7215 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
7216 err
= -EINVAL
; // no value!
7219 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7220 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7221 << "luminous' before using crush device classes";
7226 CrushWrapper newcrush
;
7227 _get_pending_crush(newcrush
);
7229 if (!newcrush
.class_exists(device_class
)) {
7231 ss
<< "class '" << device_class
<< "' does not exist";
7235 int class_id
= newcrush
.get_class_id(device_class
);
7237 if (newcrush
.class_is_in_use(class_id
)) {
7239 ss
<< "class '" << device_class
<< "' is in use";
7243 err
= newcrush
.remove_class_name(device_class
);
7245 ss
<< "class '" << device_class
<< "' cannot be removed '"
7246 << cpp_strerror(err
) << "'";
7250 pending_inc
.crush
.clear();
7251 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7252 ss
<< "removed class " << device_class
<< " with id " << class_id
7253 << " from crush map";
7256 } else if (prefix
== "osd crush class rename") {
7257 string srcname
, dstname
;
7258 if (!cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
)) {
7262 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7263 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7264 << "luminous' before using crush device classes";
7269 if (!cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
)) {
7274 CrushWrapper newcrush
;
7275 _get_pending_crush(newcrush
);
7277 if (!newcrush
.class_exists(srcname
)) {
7279 ss
<< "class '" << srcname
<< "' does not exist";
7283 if (newcrush
.class_exists(dstname
)) {
7285 ss
<< "class '" << dstname
<< "' already exists";
7289 int class_id
= newcrush
.get_class_id(srcname
);
7291 if (newcrush
.class_is_in_use(class_id
)) {
7293 ss
<< "class '" << srcname
<< "' is in use";
7297 err
= newcrush
.rename_class(srcname
, dstname
);
7299 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "':"
7300 << cpp_strerror(err
);
7304 pending_inc
.crush
.clear();
7305 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7306 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
7309 } else if (osdid_present
&&
7310 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
7311 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
7312 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
7313 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
7315 if (!osdmap
.exists(osdid
)) {
7317 ss
<< name
<< " does not exist. create it before updating the crush map";
7322 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
7323 ss
<< "unable to parse weight value '"
7324 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7330 vector
<string
> argvec
;
7331 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
7332 map
<string
,string
> loc
;
7333 CrushWrapper::parse_loc_map(argvec
, &loc
);
7335 if (prefix
== "osd crush set"
7336 && !_get_stable_crush().item_exists(osdid
)) {
7338 ss
<< "unable to set item id " << osdid
<< " name '" << name
7339 << "' weight " << weight
<< " at location " << loc
7340 << ": does not exist";
7344 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
7345 << name
<< "' weight " << weight
<< " at location "
7347 CrushWrapper newcrush
;
7348 _get_pending_crush(newcrush
);
7351 if (prefix
== "osd crush set" ||
7352 newcrush
.check_item_loc(g_ceph_context
, osdid
, loc
, (int *)NULL
)) {
7354 err
= newcrush
.update_item(g_ceph_context
, osdid
, weight
, name
, loc
);
7357 err
= newcrush
.insert_item(g_ceph_context
, osdid
, weight
, name
, loc
);
7365 if (err
== 0 && !_have_pending_crush()) {
7366 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
7367 << weight
<< " at location " << loc
<< ": no change";
7371 pending_inc
.crush
.clear();
7372 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7373 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
7374 << weight
<< " at location " << loc
<< " to crush map";
7376 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7377 get_last_committed() + 1));
7380 } else if (prefix
== "osd crush create-or-move") {
7382 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
7383 if (!osdmap
.exists(osdid
)) {
7385 ss
<< name
<< " does not exist. create it before updating the crush map";
7390 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
7391 ss
<< "unable to parse weight value '"
7392 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7398 vector
<string
> argvec
;
7399 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
7400 map
<string
,string
> loc
;
7401 CrushWrapper::parse_loc_map(argvec
, &loc
);
7403 dout(0) << "create-or-move crush item name '" << name
<< "' initial_weight " << weight
7404 << " at location " << loc
<< dendl
;
7406 CrushWrapper newcrush
;
7407 _get_pending_crush(newcrush
);
7409 err
= newcrush
.create_or_move_item(g_ceph_context
, osdid
, weight
, name
, loc
);
7411 ss
<< "create-or-move updated item name '" << name
<< "' weight " << weight
7412 << " at location " << loc
<< " to crush map";
7416 pending_inc
.crush
.clear();
7417 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7418 ss
<< "create-or-move updating item name '" << name
<< "' weight " << weight
7419 << " at location " << loc
<< " to crush map";
7421 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7422 get_last_committed() + 1));
7427 } else if (prefix
== "osd crush move") {
7429 // osd crush move <name> <loc1> [<loc2> ...]
7432 vector
<string
> argvec
;
7433 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7434 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
7435 map
<string
,string
> loc
;
7436 CrushWrapper::parse_loc_map(argvec
, &loc
);
7438 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
7439 CrushWrapper newcrush
;
7440 _get_pending_crush(newcrush
);
7442 if (!newcrush
.name_exists(name
)) {
7444 ss
<< "item " << name
<< " does not exist";
7447 int id
= newcrush
.get_item_id(name
);
7449 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
7451 err
= newcrush
.create_or_move_item(g_ceph_context
, id
, 0, name
, loc
);
7453 err
= newcrush
.move_bucket(g_ceph_context
, id
, loc
);
7456 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
7457 pending_inc
.crush
.clear();
7458 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7460 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7461 get_last_committed() + 1));
7465 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
7469 } else if (prefix
== "osd crush swap-bucket") {
7470 string source
, dest
, force
;
7471 cmd_getval(g_ceph_context
, cmdmap
, "source", source
);
7472 cmd_getval(g_ceph_context
, cmdmap
, "dest", dest
);
7473 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
7474 CrushWrapper newcrush
;
7475 _get_pending_crush(newcrush
);
7476 if (!newcrush
.name_exists(source
)) {
7477 ss
<< "source item " << source
<< " does not exist";
7481 if (!newcrush
.name_exists(dest
)) {
7482 ss
<< "dest item " << dest
<< " does not exist";
7486 int sid
= newcrush
.get_item_id(source
);
7487 int did
= newcrush
.get_item_id(dest
);
7489 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 &&
7490 force
!= "--yes-i-really-mean-it") {
7491 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
7495 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
7496 force
!= "--yes-i-really-mean-it") {
7497 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
7498 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
7499 << "; pass --yes-i-really-mean-it to proceed anyway";
7503 int r
= newcrush
.swap_bucket(g_ceph_context
, sid
, did
);
7505 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
7509 ss
<< "swapped bucket of " << source
<< " to " << dest
;
7510 pending_inc
.crush
.clear();
7511 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7512 wait_for_finished_proposal(op
,
7513 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
7514 get_last_committed() + 1));
7516 } else if (prefix
== "osd crush link") {
7517 // osd crush link <name> <loc1> [<loc2> ...]
7519 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7520 vector
<string
> argvec
;
7521 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
7522 map
<string
,string
> loc
;
7523 CrushWrapper::parse_loc_map(argvec
, &loc
);
7525 // Need an explicit check for name_exists because get_item_id returns
7527 int id
= osdmap
.crush
->get_item_id(name
);
7528 if (!osdmap
.crush
->name_exists(name
)) {
7530 ss
<< "item " << name
<< " does not exist";
7533 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
7535 if (osdmap
.crush
->check_item_loc(g_ceph_context
, id
, loc
, (int*) NULL
)) {
7536 ss
<< "no need to move item id " << id
<< " name '" << name
7537 << "' to location " << loc
<< " in crush map";
7542 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
7543 CrushWrapper newcrush
;
7544 _get_pending_crush(newcrush
);
7546 if (!newcrush
.name_exists(name
)) {
7548 ss
<< "item " << name
<< " does not exist";
7551 int id
= newcrush
.get_item_id(name
);
7552 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
7553 err
= newcrush
.link_bucket(g_ceph_context
, id
, loc
);
7555 ss
<< "linked item id " << id
<< " name '" << name
7556 << "' to location " << loc
<< " in crush map";
7557 pending_inc
.crush
.clear();
7558 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7560 ss
<< "cannot link item id " << id
<< " name '" << name
7561 << "' to location " << loc
;
7565 ss
<< "no need to move item id " << id
<< " name '" << name
7566 << "' to location " << loc
<< " in crush map";
7570 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
7571 get_last_committed() + 1));
7573 } else if (prefix
== "osd crush rm" ||
7574 prefix
== "osd crush remove" ||
7575 prefix
== "osd crush unlink") {
7577 // osd crush rm <id> [ancestor]
7578 CrushWrapper newcrush
;
7579 _get_pending_crush(newcrush
);
7582 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7584 if (!osdmap
.crush
->name_exists(name
)) {
7586 ss
<< "device '" << name
<< "' does not appear in the crush map";
7589 if (!newcrush
.name_exists(name
)) {
7591 ss
<< "device '" << name
<< "' does not appear in the crush map";
7593 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7594 get_last_committed() + 1));
7597 int id
= newcrush
.get_item_id(name
);
7600 bool unlink_only
= prefix
== "osd crush unlink";
7601 string ancestor_str
;
7602 if (cmd_getval(g_ceph_context
, cmdmap
, "ancestor", ancestor_str
)) {
7603 if (!newcrush
.name_exists(ancestor_str
)) {
7605 ss
<< "ancestor item '" << ancestor_str
7606 << "' does not appear in the crush map";
7609 ancestor
= newcrush
.get_item_id(ancestor_str
);
7612 err
= prepare_command_osd_crush_remove(
7615 (ancestor
< 0), unlink_only
);
7617 if (err
== -ENOENT
) {
7618 ss
<< "item " << id
<< " does not appear in that position";
7623 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
7625 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7626 get_last_committed() + 1));
7631 } else if (prefix
== "osd crush reweight-all") {
7632 CrushWrapper newcrush
;
7633 _get_pending_crush(newcrush
);
7635 newcrush
.reweight(g_ceph_context
);
7636 pending_inc
.crush
.clear();
7637 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7638 ss
<< "reweighted crush hierarchy";
7640 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7641 get_last_committed() + 1));
7643 } else if (prefix
== "osd crush reweight") {
7644 // osd crush reweight <name> <weight>
7645 CrushWrapper newcrush
;
7646 _get_pending_crush(newcrush
);
7649 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7650 if (!newcrush
.name_exists(name
)) {
7652 ss
<< "device '" << name
<< "' does not appear in the crush map";
7656 int id
= newcrush
.get_item_id(name
);
7658 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
7663 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
7664 ss
<< "unable to parse weight value '"
7665 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7670 err
= newcrush
.adjust_item_weightf(g_ceph_context
, id
, w
);
7673 pending_inc
.crush
.clear();
7674 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7675 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
7678 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7679 get_last_committed() + 1));
7681 } else if (prefix
== "osd crush reweight-subtree") {
7682 // osd crush reweight <name> <weight>
7683 CrushWrapper newcrush
;
7684 _get_pending_crush(newcrush
);
7687 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7688 if (!newcrush
.name_exists(name
)) {
7690 ss
<< "device '" << name
<< "' does not appear in the crush map";
7694 int id
= newcrush
.get_item_id(name
);
7696 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
7701 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
7702 ss
<< "unable to parse weight value '"
7703 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7708 err
= newcrush
.adjust_subtree_weightf(g_ceph_context
, id
, w
);
7711 pending_inc
.crush
.clear();
7712 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7713 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
7716 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7717 get_last_committed() + 1));
7719 } else if (prefix
== "osd crush tunables") {
7720 CrushWrapper newcrush
;
7721 _get_pending_crush(newcrush
);
7725 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
7726 if (profile
== "legacy" || profile
== "argonaut") {
7727 newcrush
.set_tunables_legacy();
7728 } else if (profile
== "bobtail") {
7729 newcrush
.set_tunables_bobtail();
7730 } else if (profile
== "firefly") {
7731 newcrush
.set_tunables_firefly();
7732 } else if (profile
== "hammer") {
7733 newcrush
.set_tunables_hammer();
7734 } else if (profile
== "jewel") {
7735 newcrush
.set_tunables_jewel();
7736 } else if (profile
== "optimal") {
7737 newcrush
.set_tunables_optimal();
7738 } else if (profile
== "default") {
7739 newcrush
.set_tunables_default();
7741 ss
<< "unrecognized profile '" << profile
<< "'";
7746 if (!validate_crush_against_features(&newcrush
, ss
)) {
7751 pending_inc
.crush
.clear();
7752 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7753 ss
<< "adjusted tunables profile to " << profile
;
7755 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7756 get_last_committed() + 1));
7758 } else if (prefix
== "osd crush set-tunable") {
7759 CrushWrapper newcrush
;
7760 _get_pending_crush(newcrush
);
7764 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
7767 if (!cmd_getval(g_ceph_context
, cmdmap
, "value", value
)) {
7769 ss
<< "failed to parse integer value " << cmd_vartype_stringify(cmdmap
["value"]);
7773 if (tunable
== "straw_calc_version") {
7774 if (value
!= 0 && value
!= 1) {
7775 ss
<< "value must be 0 or 1; got " << value
;
7779 newcrush
.set_straw_calc_version(value
);
7781 ss
<< "unrecognized tunable '" << tunable
<< "'";
7786 if (!validate_crush_against_features(&newcrush
, ss
)) {
7791 pending_inc
.crush
.clear();
7792 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7793 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
7795 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7796 get_last_committed() + 1));
7799 } else if (prefix
== "osd crush rule create-simple") {
7800 string name
, root
, type
, mode
;
7801 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7802 cmd_getval(g_ceph_context
, cmdmap
, "root", root
);
7803 cmd_getval(g_ceph_context
, cmdmap
, "type", type
);
7804 cmd_getval(g_ceph_context
, cmdmap
, "mode", mode
);
7808 if (osdmap
.crush
->rule_exists(name
)) {
7809 // The name is uniquely associated to a ruleid and the rule it contains
7810 // From the user point of view, the rule is more meaningful.
7811 ss
<< "rule " << name
<< " already exists";
7816 CrushWrapper newcrush
;
7817 _get_pending_crush(newcrush
);
7819 if (newcrush
.rule_exists(name
)) {
7820 // The name is uniquely associated to a ruleid and the rule it contains
7821 // From the user point of view, the rule is more meaningful.
7822 ss
<< "rule " << name
<< " already exists";
7825 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
7826 pg_pool_t::TYPE_REPLICATED
, &ss
);
7832 pending_inc
.crush
.clear();
7833 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7836 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7837 get_last_committed() + 1));
7840 } else if (prefix
== "osd crush rule create-replicated") {
7841 string name
, root
, type
, device_class
;
7842 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7843 cmd_getval(g_ceph_context
, cmdmap
, "root", root
);
7844 cmd_getval(g_ceph_context
, cmdmap
, "type", type
);
7845 cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
);
7847 if (!device_class
.empty()) {
7848 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7849 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7850 << "luminous' before using crush device classes";
7856 if (osdmap
.crush
->rule_exists(name
)) {
7857 // The name is uniquely associated to a ruleid and the rule it contains
7858 // From the user point of view, the rule is more meaningful.
7859 ss
<< "rule " << name
<< " already exists";
7864 CrushWrapper newcrush
;
7865 _get_pending_crush(newcrush
);
7867 if (newcrush
.rule_exists(name
)) {
7868 // The name is uniquely associated to a ruleid and the rule it contains
7869 // From the user point of view, the rule is more meaningful.
7870 ss
<< "rule " << name
<< " already exists";
7873 int ruleno
= newcrush
.add_simple_rule(
7874 name
, root
, type
, device_class
,
7875 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
7881 pending_inc
.crush
.clear();
7882 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7885 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7886 get_last_committed() + 1));
7889 } else if (prefix
== "osd erasure-code-profile rm") {
7891 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7893 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
7896 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
7901 if (osdmap
.has_erasure_code_profile(name
) ||
7902 pending_inc
.new_erasure_code_profiles
.count(name
)) {
7903 if (osdmap
.has_erasure_code_profile(name
)) {
7904 pending_inc
.old_erasure_code_profiles
.push_back(name
);
7906 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
7907 pending_inc
.new_erasure_code_profiles
.erase(name
);
7911 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7912 get_last_committed() + 1));
7915 ss
<< "erasure-code-profile " << name
<< " does not exist";
7920 } else if (prefix
== "osd erasure-code-profile set") {
7922 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7923 vector
<string
> profile
;
7924 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
7926 if (profile
.size() > 0 && profile
.back() == "--force") {
7932 map
<string
,string
> profile_map
;
7933 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
7936 if (profile_map
.find("plugin") == profile_map
.end()) {
7937 ss
<< "erasure-code-profile " << profile_map
7938 << " must contain a plugin entry" << std::endl
;
7942 string plugin
= profile_map
["plugin"];
7944 if (pending_inc
.has_erasure_code_profile(name
)) {
7945 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
7948 if (plugin
== "isa" || plugin
== "lrc") {
7949 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
, ss
);
7954 } else if (plugin
== "shec") {
7955 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
, ss
);
7961 err
= normalize_profile(name
, profile_map
, force
, &ss
);
7965 if (osdmap
.has_erasure_code_profile(name
)) {
7966 ErasureCodeProfile existing_profile_map
=
7967 osdmap
.get_erasure_code_profile(name
);
7968 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
7972 if (existing_profile_map
== profile_map
) {
7978 ss
<< "will not override erasure code profile " << name
7979 << " because the existing profile "
7980 << existing_profile_map
7981 << " is different from the proposed profile "
7987 dout(20) << "erasure code profile set " << name
<< "="
7988 << profile_map
<< dendl
;
7989 pending_inc
.set_erasure_code_profile(name
, profile_map
);
7993 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7994 get_last_committed() + 1));
7997 } else if (prefix
== "osd crush rule create-erasure") {
7998 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
8003 string name
, poolstr
;
8004 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8006 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
8008 profile
= "default";
8009 if (profile
== "default") {
8010 if (!osdmap
.has_erasure_code_profile(profile
)) {
8011 if (pending_inc
.has_erasure_code_profile(profile
)) {
8012 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
8016 map
<string
,string
> profile_map
;
8017 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
8022 err
= normalize_profile(name
, profile_map
, true, &ss
);
8025 dout(20) << "erasure code profile set " << profile
<< "="
8026 << profile_map
<< dendl
;
8027 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
8033 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
8036 case -EEXIST
: // return immediately
8037 ss
<< "rule " << name
<< " already exists";
8041 case -EALREADY
: // wait for pending to be proposed
8042 ss
<< "rule " << name
<< " already exists";
8045 default: // non recoverable error
8050 ss
<< "created rule " << name
<< " at " << rule
;
8054 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8055 get_last_committed() + 1));
8058 } else if (prefix
== "osd crush rule rm") {
8060 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8062 if (!osdmap
.crush
->rule_exists(name
)) {
8063 ss
<< "rule " << name
<< " does not exist";
8068 CrushWrapper newcrush
;
8069 _get_pending_crush(newcrush
);
8071 if (!newcrush
.rule_exists(name
)) {
8072 ss
<< "rule " << name
<< " does not exist";
8075 int ruleno
= newcrush
.get_rule_id(name
);
8076 assert(ruleno
>= 0);
8078 // make sure it is not in use.
8079 // FIXME: this is ok in some situations, but let's not bother with that
8081 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
8082 if (osdmap
.crush_ruleset_in_use(ruleset
)) {
8083 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
8088 err
= newcrush
.remove_rule(ruleno
);
8093 pending_inc
.crush
.clear();
8094 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8097 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8098 get_last_committed() + 1));
8101 } else if (prefix
== "osd setmaxosd") {
8103 if (!cmd_getval(g_ceph_context
, cmdmap
, "newmax", newmax
)) {
8104 ss
<< "unable to parse 'newmax' value '"
8105 << cmd_vartype_stringify(cmdmap
["newmax"]) << "'";
8110 if (newmax
> g_conf
->mon_max_osd
) {
8112 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
8113 << g_conf
->mon_max_osd
<< ")";
8117 // Don't allow shrinking OSD number as this will cause data loss
8118 // and may cause kernel crashes.
8119 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
8120 if (newmax
< osdmap
.get_max_osd()) {
8121 // Check if the OSDs exist between current max and new value.
8122 // If there are any OSDs exist, then don't allow shrinking number
8124 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
8125 if (osdmap
.exists(i
)) {
8127 ss
<< "cannot shrink max_osd to " << newmax
8128 << " because osd." << i
<< " (and possibly others) still in use";
8134 pending_inc
.new_max_osd
= newmax
;
8135 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
8137 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8138 get_last_committed() + 1));
8141 } else if (prefix
== "osd set-full-ratio" ||
8142 prefix
== "osd set-backfillfull-ratio" ||
8143 prefix
== "osd set-nearfull-ratio") {
8144 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8145 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8146 << "luminous' before using the new interface";
8151 if (!cmd_getval(g_ceph_context
, cmdmap
, "ratio", n
)) {
8152 ss
<< "unable to parse 'ratio' value '"
8153 << cmd_vartype_stringify(cmdmap
["ratio"]) << "'";
8157 if (prefix
== "osd set-full-ratio")
8158 pending_inc
.new_full_ratio
= n
;
8159 else if (prefix
== "osd set-backfillfull-ratio")
8160 pending_inc
.new_backfillfull_ratio
= n
;
8161 else if (prefix
== "osd set-nearfull-ratio")
8162 pending_inc
.new_nearfull_ratio
= n
;
8163 ss
<< prefix
<< " " << n
;
8165 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8166 get_last_committed() + 1));
8168 } else if (prefix
== "osd set-require-min-compat-client") {
8169 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8170 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8171 << "luminous' before using the new interface";
8176 cmd_getval(g_ceph_context
, cmdmap
, "version", v
);
8177 int vno
= ceph_release_from_name(v
.c_str());
8179 ss
<< "version " << v
<< " is not recognized";
8184 newmap
.deepish_copy_from(osdmap
);
8185 newmap
.apply_incremental(pending_inc
);
8186 newmap
.require_min_compat_client
= vno
;
8187 auto mvno
= newmap
.get_min_compat_client();
8189 ss
<< "osdmap current utilizes features that require "
8190 << ceph_release_name(mvno
)
8191 << "; cannot set require_min_compat_client below that to "
8192 << ceph_release_name(vno
);
8197 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
8198 if (sure
!= "--yes-i-really-mean-it") {
8200 mon
->get_combined_feature_map(&m
);
8201 uint64_t features
= ceph_release_features(vno
);
8205 CEPH_ENTITY_TYPE_CLIENT
,
8206 CEPH_ENTITY_TYPE_MDS
,
8207 CEPH_ENTITY_TYPE_MGR
}) {
8208 auto p
= m
.m
.find(type
);
8209 if (p
== m
.m
.end()) {
8212 for (auto& q
: p
->second
) {
8213 uint64_t missing
= ~q
.first
& features
;
8216 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
8221 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
8222 << "(s) look like " << ceph_release_name(
8223 ceph_release_from_features(q
.first
))
8224 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
8230 ss
<< "; add --yes-i-really-mean-it to do it anyway";
8235 ss
<< "set require_min_compat_client to " << ceph_release_name(vno
);
8236 pending_inc
.new_require_min_compat_client
= vno
;
8238 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8239 get_last_committed() + 1));
8241 } else if (prefix
== "osd pause") {
8242 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8244 } else if (prefix
== "osd unpause") {
8245 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8247 } else if (prefix
== "osd set") {
8249 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
8251 return prepare_set_flag(op
, CEPH_OSDMAP_FULL
);
8252 else if (key
== "pause")
8253 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8254 else if (key
== "noup")
8255 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
8256 else if (key
== "nodown")
8257 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
8258 else if (key
== "noout")
8259 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
8260 else if (key
== "noin")
8261 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
8262 else if (key
== "nobackfill")
8263 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
8264 else if (key
== "norebalance")
8265 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
8266 else if (key
== "norecover")
8267 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
8268 else if (key
== "noscrub")
8269 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
8270 else if (key
== "nodeep-scrub")
8271 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
8272 else if (key
== "notieragent")
8273 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
8274 else if (key
== "sortbitwise") {
8275 if (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
) {
8276 return prepare_set_flag(op
, CEPH_OSDMAP_SORTBITWISE
);
8278 ss
<< "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
8282 } else if (key
== "require_jewel_osds") {
8283 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
8284 ss
<< "the sortbitwise flag must be set before require_jewel_osds";
8287 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
) {
8288 ss
<< "require_osd_release is already >= jewel";
8291 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
)) {
8292 return prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_JEWEL
);
8294 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
8297 } else if (key
== "require_kraken_osds") {
8298 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
8299 ss
<< "the sortbitwise flag must be set before require_kraken_osds";
8302 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
8303 ss
<< "require_osd_release is already >= kraken";
8306 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
)) {
8307 bool r
= prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_KRAKEN
);
8308 // ensure JEWEL is also set
8309 pending_inc
.new_flags
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
8312 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
8316 ss
<< "unrecognized flag '" << key
<< "'";
8320 } else if (prefix
== "osd unset") {
8322 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
8324 return prepare_unset_flag(op
, CEPH_OSDMAP_FULL
);
8325 else if (key
== "pause")
8326 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8327 else if (key
== "noup")
8328 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
8329 else if (key
== "nodown")
8330 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
8331 else if (key
== "noout")
8332 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
8333 else if (key
== "noin")
8334 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
8335 else if (key
== "nobackfill")
8336 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
8337 else if (key
== "norebalance")
8338 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
8339 else if (key
== "norecover")
8340 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
8341 else if (key
== "noscrub")
8342 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
8343 else if (key
== "nodeep-scrub")
8344 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
8345 else if (key
== "notieragent")
8346 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
8348 ss
<< "unrecognized flag '" << key
<< "'";
8352 } else if (prefix
== "osd require-osd-release") {
8354 cmd_getval(g_ceph_context
, cmdmap
, "release", release
);
8355 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
8356 ss
<< "the sortbitwise flag must be set first";
8360 int rel
= ceph_release_from_name(release
.c_str());
8362 ss
<< "unrecognized release " << release
;
8366 if (rel
< CEPH_RELEASE_LUMINOUS
) {
8367 ss
<< "use this command only for luminous and later";
8371 if (rel
== CEPH_RELEASE_LUMINOUS
) {
8372 if (!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
)) {
8373 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
8378 ss
<< "not supported for this release yet";
8382 if (rel
< osdmap
.require_osd_release
) {
8383 ss
<< "require_osd_release cannot be lowered once it has been set";
8387 pending_inc
.new_require_osd_release
= rel
;
8389 } else if (prefix
== "osd cluster_snap") {
8390 // ** DISABLE THIS FOR NOW **
8391 ss
<< "cluster snapshot currently disabled (broken implementation)";
8392 // ** DISABLE THIS FOR NOW **
8394 } else if (prefix
== "osd down" ||
8395 prefix
== "osd out" ||
8396 prefix
== "osd in" ||
8397 prefix
== "osd rm") {
8401 bool verbose
= true;
8403 vector
<string
> idvec
;
8404 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
8405 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8410 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8411 if (prefix
== "osd in") {
8412 // touch out osds only
8413 osdmap
.get_out_osds(osds
);
8415 osdmap
.get_all_osds(osds
);
8418 verbose
= false; // so the output is less noisy.
8420 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8422 ss
<< "invalid osd id" << osd
;
8425 } else if (!osdmap
.exists(osd
)) {
8426 ss
<< "osd." << osd
<< " does not exist. ";
8433 for (auto &osd
: osds
) {
8434 if (prefix
== "osd down") {
8435 if (osdmap
.is_down(osd
)) {
8437 ss
<< "osd." << osd
<< " is already down. ";
8439 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
8440 ss
<< "marked down osd." << osd
<< ". ";
8443 } else if (prefix
== "osd out") {
8444 if (osdmap
.is_out(osd
)) {
8446 ss
<< "osd." << osd
<< " is already out. ";
8448 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
8449 if (osdmap
.osd_weight
[osd
]) {
8450 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
8451 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
8453 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
8455 ss
<< "marked out osd." << osd
<< ". ";
8456 std::ostringstream msg
;
8457 msg
<< "Client " << op
->get_session()->entity_name
8458 << " marked osd." << osd
<< " out";
8459 if (osdmap
.is_up(osd
)) {
8460 msg
<< ", while it was still marked up";
8462 msg
<< ", after it was down for " << int(down_pending_out
[osd
].sec())
8466 mon
->clog
->info() << msg
.str();
8469 } else if (prefix
== "osd in") {
8470 if (osdmap
.is_in(osd
)) {
8472 ss
<< "osd." << osd
<< " is already in. ";
8474 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
8475 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
8476 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
8477 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
8479 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
8481 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
8483 ss
<< "marked in osd." << osd
<< ". ";
8486 } else if (prefix
== "osd rm") {
8487 err
= prepare_command_osd_remove(osd
);
8489 if (err
== -EBUSY
) {
8492 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
8496 ss
<< ", osd." << osd
;
8498 ss
<< "removed osd." << osd
;
8507 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
8508 get_last_committed() + 1));
8511 } else if (prefix
== "osd add-noup" ||
8512 prefix
== "osd add-nodown" ||
8513 prefix
== "osd add-noin" ||
8514 prefix
== "osd add-noout") {
8523 if (prefix
== "osd add-noup") {
8525 } else if (prefix
== "osd add-nodown") {
8527 } else if (prefix
== "osd add-noin") {
8536 vector
<string
> idvec
;
8537 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
8538 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8544 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8545 osdmap
.get_all_osds(osds
);
8548 // try traditional single osd way
8550 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8552 // ss has reason for failure
8553 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8561 for (auto &osd
: osds
) {
8563 if (!osdmap
.exists(osd
)) {
8564 ss
<< "osd." << osd
<< " does not exist. ";
8570 if (osdmap
.is_up(osd
)) {
8571 ss
<< "osd." << osd
<< " is already up. ";
8575 if (osdmap
.is_noup(osd
)) {
8576 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
))
8579 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
8586 if (osdmap
.is_down(osd
)) {
8587 ss
<< "osd." << osd
<< " is already down. ";
8591 if (osdmap
.is_nodown(osd
)) {
8592 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
))
8595 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
8602 if (osdmap
.is_in(osd
)) {
8603 ss
<< "osd." << osd
<< " is already in. ";
8607 if (osdmap
.is_noin(osd
)) {
8608 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
))
8611 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
8618 if (osdmap
.is_out(osd
)) {
8619 ss
<< "osd." << osd
<< " is already out. ";
8623 if (osdmap
.is_noout(osd
)) {
8624 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
))
8627 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
8634 assert(0 == "invalid option");
8641 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
8642 get_last_committed() + 1));
8645 } else if (prefix
== "osd rm-noup" ||
8646 prefix
== "osd rm-nodown" ||
8647 prefix
== "osd rm-noin" ||
8648 prefix
== "osd rm-noout") {
8657 if (prefix
== "osd rm-noup") {
8659 } else if (prefix
== "osd rm-nodown") {
8661 } else if (prefix
== "osd rm-noin") {
8670 vector
<string
> idvec
;
8671 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
8673 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8679 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8681 // touch previous noup/nodown/noin/noout osds only
8684 osdmap
.get_noup_osds(&osds
);
8687 osdmap
.get_nodown_osds(&osds
);
8690 osdmap
.get_noin_osds(&osds
);
8693 osdmap
.get_noout_osds(&osds
);
8696 assert(0 == "invalid option");
8699 // cancel any pending noup/nodown/noin/noout requests too
8700 vector
<int> pending_state_osds
;
8701 (void) pending_inc
.get_pending_state_osds(&pending_state_osds
);
8702 for (auto &p
: pending_state_osds
) {
8706 if (!osdmap
.is_noup(p
) &&
8707 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOUP
)) {
8713 if (!osdmap
.is_nodown(p
) &&
8714 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NODOWN
)) {
8720 if (!osdmap
.is_noin(p
) &&
8721 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOIN
)) {
8727 if (!osdmap
.is_noout(p
) &&
8728 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOOUT
)) {
8734 assert(0 == "invalid option");
8740 // try traditional single osd way
8742 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8744 // ss has reason for failure
8745 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
8750 osds
.push_back(osd
);
8753 for (auto &osd
: osds
) {
8755 if (!osdmap
.exists(osd
)) {
8756 ss
<< "osd." << osd
<< " does not exist. ";
8762 if (osdmap
.is_noup(osd
)) {
8763 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
8765 } else if (pending_inc
.pending_osd_state_clear(
8766 osd
, CEPH_OSD_NOUP
)) {
8772 if (osdmap
.is_nodown(osd
)) {
8773 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
8775 } else if (pending_inc
.pending_osd_state_clear(
8776 osd
, CEPH_OSD_NODOWN
)) {
8782 if (osdmap
.is_noin(osd
)) {
8783 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
8785 } else if (pending_inc
.pending_osd_state_clear(
8786 osd
, CEPH_OSD_NOIN
)) {
8792 if (osdmap
.is_noout(osd
)) {
8793 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
8795 } else if (pending_inc
.pending_osd_state_clear(
8796 osd
, CEPH_OSD_NOOUT
)) {
8802 assert(0 == "invalid option");
8809 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
8810 get_last_committed() + 1));
8813 } else if (prefix
== "osd pg-temp") {
8815 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
8816 ss
<< "unable to parse 'pgid' value '"
8817 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
8822 if (!pgid
.parse(pgidstr
.c_str())) {
8823 ss
<< "invalid pgid '" << pgidstr
<< "'";
8827 if (!osdmap
.pg_exists(pgid
)) {
8828 ss
<< "pg " << pgid
<< " does not exist";
8832 if (pending_inc
.new_pg_temp
.count(pgid
)) {
8833 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
8834 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8838 vector
<int64_t> id_vec
;
8839 vector
<int32_t> new_pg_temp
;
8840 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
8841 ss
<< "unable to parse 'id' value(s) '"
8842 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
8846 for (auto osd
: id_vec
) {
8847 if (!osdmap
.exists(osd
)) {
8848 ss
<< "osd." << osd
<< " does not exist";
8852 new_pg_temp
.push_back(osd
);
8855 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
8856 if ((int)new_pg_temp
.size() < pool_min_size
) {
8857 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
8858 << pool_min_size
<< ")";
8863 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
8864 if ((int)new_pg_temp
.size() > pool_size
) {
8865 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
8866 << pool_size
<< ")";
8871 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
8872 new_pg_temp
.begin(), new_pg_temp
.end());
8873 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
8875 } else if (prefix
== "osd primary-temp") {
8877 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
8878 ss
<< "unable to parse 'pgid' value '"
8879 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
8884 if (!pgid
.parse(pgidstr
.c_str())) {
8885 ss
<< "invalid pgid '" << pgidstr
<< "'";
8889 if (!osdmap
.pg_exists(pgid
)) {
8890 ss
<< "pg " << pgid
<< " does not exist";
8896 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
8897 ss
<< "unable to parse 'id' value '"
8898 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
8902 if (osd
!= -1 && !osdmap
.exists(osd
)) {
8903 ss
<< "osd." << osd
<< " does not exist";
8908 if (osdmap
.require_min_compat_client
> 0 &&
8909 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
8910 ss
<< "require_min_compat_client "
8911 << ceph_release_name(osdmap
.require_min_compat_client
)
8912 << " < firefly, which is required for primary-temp";
8915 } else if (!g_conf
->mon_osd_allow_primary_temp
) {
8916 ss
<< "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
8921 pending_inc
.new_primary_temp
[pgid
] = osd
;
8922 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
8924 } else if (prefix
== "osd pg-upmap" ||
8925 prefix
== "osd rm-pg-upmap" ||
8926 prefix
== "osd pg-upmap-items" ||
8927 prefix
== "osd rm-pg-upmap-items") {
8928 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8929 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8930 << "luminous' before using the new interface";
8934 if (osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
8935 ss
<< "min_compat_client "
8936 << ceph_release_name(osdmap
.require_min_compat_client
)
8937 << " < luminous, which is required for pg-upmap. "
8938 << "Try 'ceph osd set-require-min-compat-client luminous' "
8939 << "before using the new interface";
8943 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
8949 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
8950 ss
<< "unable to parse 'pgid' value '"
8951 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
8956 if (!pgid
.parse(pgidstr
.c_str())) {
8957 ss
<< "invalid pgid '" << pgidstr
<< "'";
8961 if (!osdmap
.pg_exists(pgid
)) {
8962 ss
<< "pg " << pgid
<< " does not exist";
8971 OP_RM_PG_UPMAP_ITEMS
,
8974 if (prefix
== "osd pg-upmap") {
8975 option
= OP_PG_UPMAP
;
8976 } else if (prefix
== "osd rm-pg-upmap") {
8977 option
= OP_RM_PG_UPMAP
;
8978 } else if (prefix
== "osd pg-upmap-items") {
8979 option
= OP_PG_UPMAP_ITEMS
;
8981 option
= OP_RM_PG_UPMAP_ITEMS
;
8984 // check pending upmap changes
8986 case OP_PG_UPMAP
: // fall through
8987 case OP_RM_PG_UPMAP
:
8988 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
8989 pending_inc
.old_pg_upmap
.count(pgid
)) {
8990 dout(10) << __func__
<< " waiting for pending update on "
8992 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8997 case OP_PG_UPMAP_ITEMS
: // fall through
8998 case OP_RM_PG_UPMAP_ITEMS
:
8999 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
9000 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
9001 dout(10) << __func__
<< " waiting for pending update on "
9003 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9009 assert(0 == "invalid option");
9015 vector
<int64_t> id_vec
;
9016 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9017 ss
<< "unable to parse 'id' value(s) '"
9018 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9023 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
9024 if ((int)id_vec
.size() < pool_min_size
) {
9025 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
9026 << pool_min_size
<< ")";
9031 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
9032 if ((int)id_vec
.size() > pool_size
) {
9033 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
9034 << pool_size
<< ")";
9039 vector
<int32_t> new_pg_upmap
;
9040 for (auto osd
: id_vec
) {
9041 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
9042 ss
<< "osd." << osd
<< " does not exist";
9046 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
9047 if (it
!= new_pg_upmap
.end()) {
9048 ss
<< "osd." << osd
<< " already exists, ";
9051 new_pg_upmap
.push_back(osd
);
9054 if (new_pg_upmap
.empty()) {
9055 ss
<< "no valid upmap items(pairs) is specified";
9060 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
9061 new_pg_upmap
.begin(), new_pg_upmap
.end());
9062 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
9066 case OP_RM_PG_UPMAP
:
9068 pending_inc
.old_pg_upmap
.insert(pgid
);
9069 ss
<< "clear " << pgid
<< " pg_upmap mapping";
9073 case OP_PG_UPMAP_ITEMS
:
9075 vector
<int64_t> id_vec
;
9076 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9077 ss
<< "unable to parse 'id' value(s) '"
9078 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9083 if (id_vec
.size() % 2) {
9084 ss
<< "you must specify pairs of osd ids to be remapped";
9089 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
9090 if ((int)(id_vec
.size() / 2) > pool_size
) {
9091 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
9092 << pool_size
<< ")";
9097 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
9098 ostringstream items
;
9100 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
9104 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
9107 if (!osdmap
.exists(from
)) {
9108 ss
<< "osd." << from
<< " does not exist";
9112 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
9113 ss
<< "osd." << to
<< " does not exist";
9117 new_pg_upmap_items
.push_back(make_pair(from
, to
));
9118 items
<< from
<< "->" << to
<< ",";
9120 string
out(items
.str());
9121 out
.resize(out
.size() - 1); // drop last ','
9124 if (new_pg_upmap_items
.empty()) {
9125 ss
<< "no valid upmap items(pairs) is specified";
9130 pending_inc
.new_pg_upmap_items
[pgid
] =
9131 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
9132 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
9133 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
9137 case OP_RM_PG_UPMAP_ITEMS
:
9139 pending_inc
.old_pg_upmap_items
.insert(pgid
);
9140 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
9145 assert(0 == "invalid option");
9149 } else if (prefix
== "osd primary-affinity") {
9151 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9152 ss
<< "invalid osd id value '"
9153 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9158 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
9159 ss
<< "unable to parse 'weight' value '"
9160 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
9164 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
9166 ss
<< "weight must be >= 0";
9170 if (osdmap
.require_min_compat_client
> 0 &&
9171 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
9172 ss
<< "require_min_compat_client "
9173 << ceph_release_name(osdmap
.require_min_compat_client
)
9174 << " < firefly, which is required for primary-affinity";
9177 } else if (!g_conf
->mon_osd_allow_primary_affinity
) {
9178 ss
<< "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
9182 err
= check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY
, ss
);
9187 if (osdmap
.exists(id
)) {
9188 pending_inc
.new_primary_affinity
[id
] = ww
;
9189 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
9191 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9192 get_last_committed() + 1));
9195 ss
<< "osd." << id
<< " does not exist";
9199 } else if (prefix
== "osd reweight") {
9201 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9202 ss
<< "unable to parse osd id value '"
9203 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9208 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
9209 ss
<< "unable to parse weight value '"
9210 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
9214 long ww
= (int)((double)CEPH_OSD_IN
*w
);
9216 ss
<< "weight must be >= 0";
9220 if (osdmap
.exists(id
)) {
9221 pending_inc
.new_weight
[id
] = ww
;
9222 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
9224 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9225 get_last_committed() + 1));
9228 ss
<< "osd." << id
<< " does not exist";
9232 } else if (prefix
== "osd reweightn") {
9233 map
<int32_t, uint32_t> weights
;
9234 err
= parse_reweights(g_ceph_context
, cmdmap
, osdmap
, &weights
);
9236 ss
<< "unable to parse 'weights' value '"
9237 << cmd_vartype_stringify(cmdmap
["weights"]) << "'";
9240 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
9241 wait_for_finished_proposal(
9243 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
9245 } else if (prefix
== "osd lost") {
9247 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9248 ss
<< "unable to parse osd id value '"
9249 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9254 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) || sure
!= "--yes-i-really-mean-it") {
9255 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
9256 "--yes-i-really-mean-it if you really do.";
9259 } else if (!osdmap
.exists(id
)) {
9260 ss
<< "osd." << id
<< " does not exist";
9263 } else if (!osdmap
.is_down(id
)) {
9264 ss
<< "osd." << id
<< " is not down";
9268 epoch_t e
= osdmap
.get_info(id
).down_at
;
9269 pending_inc
.new_lost
[id
] = e
;
9270 ss
<< "marked osd lost in epoch " << e
;
9272 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9273 get_last_committed() + 1));
9277 } else if (prefix
== "osd destroy" || prefix
== "osd purge") {
9278 /* Destroying an OSD means that we don't expect to further make use of
9279 * the OSDs data (which may even become unreadable after this operation),
9280 * and that we are okay with scrubbing all its cephx keys and config-key
9281 * data (which may include lockbox keys, thus rendering the osd's data
9284 * The OSD will not be removed. Instead, we will mark it as destroyed,
9285 * such that a subsequent call to `create` will not reuse the osd id.
9286 * This will play into being able to recreate the OSD, at the same
9287 * crush location, with minimal data movement.
9290 // make sure authmon is writeable.
9291 if (!mon
->authmon()->is_writeable()) {
9292 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
9293 << "osd destroy" << dendl
;
9294 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
9299 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9300 ss
<< "unable to parse osd id value '"
9301 << cmd_vartype_stringify(cmdmap
["id"]) << "";
9306 bool is_destroy
= (prefix
== "osd destroy");
9308 assert("osd purge" == prefix
);
9312 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) ||
9313 sure
!= "--yes-i-really-mean-it") {
9314 ss
<< "Are you SURE? This will mean real, permanent data loss, as well "
9315 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
9319 } else if (is_destroy
&& !osdmap
.exists(id
)) {
9320 ss
<< "osd." << id
<< " does not exist";
9323 } else if (osdmap
.is_up(id
)) {
9324 ss
<< "osd." << id
<< " is not `down`.";
9327 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
9328 ss
<< "destroyed osd." << id
;
9333 bool goto_reply
= false;
9337 err
= prepare_command_osd_destroy(id
, ss
);
9338 // we checked above that it should exist.
9339 assert(err
!= -ENOENT
);
9341 err
= prepare_command_osd_purge(id
, ss
);
9342 if (err
== -ENOENT
) {
9344 ss
<< "osd." << id
<< " does not exist.";
9350 if (err
< 0 || goto_reply
) {
9355 ss
<< "destroyed osd." << id
;
9357 ss
<< "purged osd." << id
;
9361 wait_for_finished_proposal(op
,
9362 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
9363 force_immediate_propose();
9366 } else if (prefix
== "osd new") {
9368 // make sure authmon is writeable.
9369 if (!mon
->authmon()->is_writeable()) {
9370 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
9371 << "osd new" << dendl
;
9372 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
9376 map
<string
,string
> secrets_map
;
9378 bufferlist bl
= m
->get_data();
9379 string secrets_json
= bl
.to_str();
9380 dout(20) << __func__
<< " osd new json = " << secrets_json
<< dendl
;
9382 err
= get_json_str_map(secrets_json
, ss
, &secrets_map
);
9386 dout(20) << __func__
<< " osd new secrets " << secrets_map
<< dendl
;
9389 err
= prepare_command_osd_new(op
, cmdmap
, secrets_map
, ss
, f
.get());
9402 if (err
== EEXIST
) {
9403 // idempotent operation
9408 wait_for_finished_proposal(op
,
9409 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
9410 get_last_committed() + 1));
9411 force_immediate_propose();
9414 } else if (prefix
== "osd create") {
9416 // optional id provided?
9417 int64_t id
= -1, cmd_id
= -1;
9418 if (cmd_getval(g_ceph_context
, cmdmap
, "id", cmd_id
)) {
9420 ss
<< "invalid osd id value '" << cmd_id
<< "'";
9424 dout(10) << " osd create got id " << cmd_id
<< dendl
;
9429 if (cmd_getval(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
9430 if (!uuid
.parse(uuidstr
.c_str())) {
9431 ss
<< "invalid uuid value '" << uuidstr
<< "'";
9435 // we only care about the id if we also have the uuid, to
9436 // ensure the operation's idempotency.
9440 int32_t new_id
= -1;
9441 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
9443 if (err
== -EAGAIN
) {
9444 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9447 // a check has failed; reply to the user.
9450 } else if (err
== EEXIST
) {
9451 // this is an idempotent operation; we can go ahead and reply.
9453 f
->open_object_section("created_osd");
9454 f
->dump_int("osdid", new_id
);
9465 do_osd_create(id
, uuid
, &new_id
);
9468 f
->open_object_section("created_osd");
9469 f
->dump_int("osdid", new_id
);
9476 wait_for_finished_proposal(op
,
9477 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
9478 get_last_committed() + 1));
9481 } else if (prefix
== "osd blacklist clear") {
9482 pending_inc
.new_blacklist
.clear();
9483 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
9484 osdmap
.get_blacklist(&blacklist
);
9485 for (const auto &entry
: blacklist
) {
9486 pending_inc
.old_blacklist
.push_back(entry
.first
);
9488 ss
<< " removed all blacklist entries";
9490 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9491 get_last_committed() + 1));
9493 } else if (prefix
== "osd blacklist") {
9495 cmd_getval(g_ceph_context
, cmdmap
, "addr", addrstr
);
9497 if (!addr
.parse(addrstr
.c_str(), 0)) {
9498 ss
<< "unable to parse address " << addrstr
;
9504 cmd_getval(g_ceph_context
, cmdmap
, "blacklistop", blacklistop
);
9505 if (blacklistop
== "add") {
9506 utime_t expires
= ceph_clock_now();
9509 cmd_getval(g_ceph_context
, cmdmap
, "expire", d
,
9510 g_conf
->mon_osd_blacklist_default_expire
);
9513 pending_inc
.new_blacklist
[addr
] = expires
;
9516 // cancel any pending un-blacklisting request too
9517 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
9518 pending_inc
.old_blacklist
.end(), addr
);
9519 if (it
!= pending_inc
.old_blacklist
.end()) {
9520 pending_inc
.old_blacklist
.erase(it
);
9524 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
9526 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9527 get_last_committed() + 1));
9529 } else if (blacklistop
== "rm") {
9530 if (osdmap
.is_blacklisted(addr
) ||
9531 pending_inc
.new_blacklist
.count(addr
)) {
9532 if (osdmap
.is_blacklisted(addr
))
9533 pending_inc
.old_blacklist
.push_back(addr
);
9535 pending_inc
.new_blacklist
.erase(addr
);
9536 ss
<< "un-blacklisting " << addr
;
9538 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9539 get_last_committed() + 1));
9542 ss
<< addr
<< " isn't blacklisted";
9547 } else if (prefix
== "osd pool mksnap") {
9549 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
9550 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
9552 ss
<< "unrecognized pool '" << poolstr
<< "'";
9557 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
9558 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
9559 if (p
->is_unmanaged_snaps_mode()) {
9560 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
9563 } else if (p
->snap_exists(snapname
.c_str())) {
9564 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
9567 } else if (p
->is_tier()) {
9568 ss
<< "pool " << poolstr
<< " is a cache tier";
9573 if (pending_inc
.new_pools
.count(pool
))
9574 pp
= &pending_inc
.new_pools
[pool
];
9576 pp
= &pending_inc
.new_pools
[pool
];
9579 if (pp
->snap_exists(snapname
.c_str())) {
9580 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
9582 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
9583 pp
->set_snap_epoch(pending_inc
.epoch
);
9584 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
9587 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9588 get_last_committed() + 1));
9590 } else if (prefix
== "osd pool rmsnap") {
9592 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
9593 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
9595 ss
<< "unrecognized pool '" << poolstr
<< "'";
9600 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
9601 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
9602 if (p
->is_unmanaged_snaps_mode()) {
9603 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
9606 } else if (!p
->snap_exists(snapname
.c_str())) {
9607 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
9612 if (pending_inc
.new_pools
.count(pool
))
9613 pp
= &pending_inc
.new_pools
[pool
];
9615 pp
= &pending_inc
.new_pools
[pool
];
9618 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
9620 pp
->remove_snap(sn
);
9621 pp
->set_snap_epoch(pending_inc
.epoch
);
9622 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
9624 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
9627 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9628 get_last_committed() + 1));
9630 } else if (prefix
== "osd pool create") {
9633 cmd_getval(g_ceph_context
, cmdmap
, "pg_num", pg_num
, int64_t(0));
9634 cmd_getval(g_ceph_context
, cmdmap
, "pgp_num", pgp_num
, pg_num
);
9636 string pool_type_str
;
9637 cmd_getval(g_ceph_context
, cmdmap
, "pool_type", pool_type_str
);
9638 if (pool_type_str
.empty())
9639 pool_type_str
= g_conf
->osd_pool_default_type
;
9642 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
9643 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
9645 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
9646 if (pool_type_str
!= p
->get_type_name()) {
9647 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
9650 ss
<< "pool '" << poolstr
<< "' already exists";
9657 if (pool_type_str
== "replicated") {
9658 pool_type
= pg_pool_t::TYPE_REPLICATED
;
9659 } else if (pool_type_str
== "erasure") {
9660 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
|
9661 CEPH_FEATURE_OSD_ERASURE_CODES
,
9667 pool_type
= pg_pool_t::TYPE_ERASURE
;
9669 ss
<< "unknown pool type '" << pool_type_str
<< "'";
9674 bool implicit_rule_creation
= false;
9676 cmd_getval(g_ceph_context
, cmdmap
, "rule", rule_name
);
9677 string erasure_code_profile
;
9678 cmd_getval(g_ceph_context
, cmdmap
, "erasure_code_profile", erasure_code_profile
);
9680 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
9681 if (erasure_code_profile
== "")
9682 erasure_code_profile
= "default";
9683 //handle the erasure code profile
9684 if (erasure_code_profile
== "default") {
9685 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
9686 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
9687 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
9691 map
<string
,string
> profile_map
;
9692 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
9697 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
9698 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
9702 if (rule_name
== "") {
9703 implicit_rule_creation
= true;
9704 if (erasure_code_profile
== "default") {
9705 rule_name
= "erasure-code";
9707 dout(1) << "implicitly use rule named after the pool: "
9708 << poolstr
<< dendl
;
9709 rule_name
= poolstr
;
9713 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
9714 rule_name
= erasure_code_profile
;
9717 if (!implicit_rule_creation
&& rule_name
!= "") {
9719 err
= get_crush_rule(rule_name
, &rule
, &ss
);
9720 if (err
== -EAGAIN
) {
9721 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9728 int64_t expected_num_objects
;
9729 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects", expected_num_objects
, int64_t(0));
9730 if (expected_num_objects
< 0) {
9731 ss
<< "'expected_num_objects' must be non-negative";
9736 int64_t fast_read_param
;
9737 cmd_getval(g_ceph_context
, cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
9738 FastReadType fast_read
= FAST_READ_DEFAULT
;
9739 if (fast_read_param
== 0)
9740 fast_read
= FAST_READ_OFF
;
9741 else if (fast_read_param
> 0)
9742 fast_read
= FAST_READ_ON
;
9744 err
= prepare_new_pool(poolstr
, 0, // auid=0 for admin created pool
9745 -1, // default crush rule
9748 erasure_code_profile
, pool_type
,
9749 (uint64_t)expected_num_objects
,
9755 ss
<< "pool '" << poolstr
<< "' already exists";
9758 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9767 ss
<< "pool '" << poolstr
<< "' created";
9770 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9771 get_last_committed() + 1));
9774 } else if (prefix
== "osd pool delete" ||
9775 prefix
== "osd pool rm") {
9776 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
9777 string poolstr
, poolstr2
, sure
;
9778 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
9779 cmd_getval(g_ceph_context
, cmdmap
, "pool2", poolstr2
);
9780 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
9781 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
9783 ss
<< "pool '" << poolstr
<< "' does not exist";
9788 bool force_no_fake
= sure
== "--yes-i-really-really-mean-it-not-faking";
9789 if (poolstr2
!= poolstr
||
9790 (sure
!= "--yes-i-really-really-mean-it" && !force_no_fake
)) {
9791 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
9792 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
9793 << "followed by --yes-i-really-really-mean-it.";
9797 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
9798 if (err
== -EAGAIN
) {
9799 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9805 } else if (prefix
== "osd pool rename") {
9806 string srcpoolstr
, destpoolstr
;
9807 cmd_getval(g_ceph_context
, cmdmap
, "srcpool", srcpoolstr
);
9808 cmd_getval(g_ceph_context
, cmdmap
, "destpool", destpoolstr
);
9809 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
9810 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
9813 if (pool_dst
>= 0) {
9814 // src pool doesn't exist, dst pool does exist: to ensure idempotency
9815 // of operations, assume this rename succeeded, as it is not changing
9816 // the current state. Make sure we output something understandable
9817 // for whoever is issuing the command, if they are paying attention,
9818 // in case it was not intentional; or to avoid a "wtf?" and a bug
9819 // report in case it was intentional, while expecting a failure.
9820 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
9821 << destpoolstr
<< "' does -- assuming successful rename";
9824 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
9828 } else if (pool_dst
>= 0) {
9829 // source pool exists and so does the destination pool
9830 ss
<< "pool '" << destpoolstr
<< "' already exists";
9835 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
9837 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
9839 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
9840 << cpp_strerror(ret
);
9843 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
9844 get_last_committed() + 1));
9847 } else if (prefix
== "osd pool set") {
9848 err
= prepare_command_pool_set(cmdmap
, ss
);
9855 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9856 get_last_committed() + 1));
9858 } else if (prefix
== "osd tier add") {
9859 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
9865 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
9866 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
9868 ss
<< "unrecognized pool '" << poolstr
<< "'";
9873 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
9874 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
9875 if (tierpool_id
< 0) {
9876 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
9880 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
9882 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
9885 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
9889 // make sure new tier is empty
9890 string force_nonempty
;
9891 cmd_getval(g_ceph_context
, cmdmap
, "force_nonempty", force_nonempty
);
9892 const pool_stat_t
*pstats
= mon
->pgservice
->get_pool_stat(tierpool_id
);
9893 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
9894 force_nonempty
!= "--force-nonempty") {
9895 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
9899 if (tp
->ec_pool()) {
9900 ss
<< "tier pool '" << tierpoolstr
9901 << "' is an ec pool, which cannot be a tier";
9905 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
9906 ((force_nonempty
!= "--force-nonempty") ||
9907 (!g_conf
->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
9908 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
9913 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
9914 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
9915 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
9916 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9919 np
->tiers
.insert(tierpool_id
);
9920 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
9921 ntp
->tier_of
= pool_id
;
9922 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
9923 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
9924 get_last_committed() + 1));
9926 } else if (prefix
== "osd tier remove" ||
9927 prefix
== "osd tier rm") {
9929 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
9930 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
9932 ss
<< "unrecognized pool '" << poolstr
<< "'";
9937 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
9938 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
9939 if (tierpool_id
< 0) {
9940 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
9944 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
9946 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
9949 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
9953 if (p
->tiers
.count(tierpool_id
) == 0) {
9954 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
9958 if (tp
->tier_of
!= pool_id
) {
9959 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
9960 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
9961 // be scary about it; this is an inconsistency and bells must go off
9962 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
9966 if (p
->read_tier
== tierpool_id
) {
9967 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
9972 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
9973 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
9974 if (np
->tiers
.count(tierpool_id
) == 0 ||
9975 ntp
->tier_of
!= pool_id
||
9976 np
->read_tier
== tierpool_id
) {
9977 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9980 np
->tiers
.erase(tierpool_id
);
9982 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
9983 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
9984 get_last_committed() + 1));
9986 } else if (prefix
== "osd tier set-overlay") {
9987 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
9993 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
9994 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
9996 ss
<< "unrecognized pool '" << poolstr
<< "'";
10000 string overlaypoolstr
;
10001 cmd_getval(g_ceph_context
, cmdmap
, "overlaypool", overlaypoolstr
);
10002 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
10003 if (overlaypool_id
< 0) {
10004 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
10008 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10010 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
10012 if (p
->tiers
.count(overlaypool_id
) == 0) {
10013 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
10017 if (p
->read_tier
== overlaypool_id
) {
10019 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
10022 if (p
->has_read_tier()) {
10023 ss
<< "pool '" << poolstr
<< "' has overlay '"
10024 << osdmap
.get_pool_name(p
->read_tier
)
10025 << "'; please remove-overlay first";
10031 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10032 np
->read_tier
= overlaypool_id
;
10033 np
->write_tier
= overlaypool_id
;
10034 np
->set_last_force_op_resend(pending_inc
.epoch
);
10035 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
10036 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
10037 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
10038 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
10039 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
10040 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10041 get_last_committed() + 1));
10043 } else if (prefix
== "osd tier remove-overlay" ||
10044 prefix
== "osd tier rm-overlay") {
10046 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10047 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10049 ss
<< "unrecognized pool '" << poolstr
<< "'";
10053 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10055 if (!p
->has_read_tier()) {
10057 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
10061 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
10066 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10067 if (np
->has_read_tier()) {
10068 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
10069 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
10070 nop
->set_last_force_op_resend(pending_inc
.epoch
);
10072 if (np
->has_write_tier()) {
10073 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
10074 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
10075 nop
->set_last_force_op_resend(pending_inc
.epoch
);
10077 np
->clear_read_tier();
10078 np
->clear_write_tier();
10079 np
->set_last_force_op_resend(pending_inc
.epoch
);
10080 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
10081 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10082 get_last_committed() + 1));
10084 } else if (prefix
== "osd tier cache-mode") {
10085 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10086 if (err
== -EAGAIN
)
10091 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10092 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10094 ss
<< "unrecognized pool '" << poolstr
<< "'";
10098 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10100 if (!p
->is_tier()) {
10101 ss
<< "pool '" << poolstr
<< "' is not a tier";
10106 cmd_getval(g_ceph_context
, cmdmap
, "mode", modestr
);
10107 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
10109 ss
<< "'" << modestr
<< "' is not a valid cache mode";
10115 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
10116 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10117 mode
!= pg_pool_t::CACHEMODE_NONE
&&
10118 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10119 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
10120 sure
!= "--yes-i-really-mean-it") {
10121 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
10122 << "corrupt your data. pass --yes-i-really-mean-it to force.";
10127 // pool already has this cache-mode set and there are no pending changes
10128 if (p
->cache_mode
== mode
&&
10129 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
10130 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
10131 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
10132 << " to " << pg_pool_t::get_cache_mode_name(mode
);
10137 /* Mode description:
10139 * none: No cache-mode defined
10140 * forward: Forward all reads and writes to base pool
10141 * writeback: Cache writes, promote reads from base pool
10142 * readonly: Forward writes to base pool
10143 * readforward: Writes are in writeback mode, Reads are in forward mode
10144 * proxy: Proxy all reads and writes to base pool
10145 * readproxy: Writes are in writeback mode, Reads are in proxy mode
10147 * Hence, these are the allowed transitions:
10150 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10151 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10152 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
10153 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
10154 * writeback -> readforward || readproxy || forward || proxy
10158 // We check if the transition is valid against the current pool mode, as
10159 // it is the only committed state thus far. We will blantly squash
10160 // whatever mode is on the pending state.
10162 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
10163 (mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10164 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10165 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10166 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
10167 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
10168 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
10169 << "' pool; only '"
10170 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD
)
10172 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
10174 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD
)
10176 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
10181 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
10182 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10183 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10184 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10185 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
10187 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
10188 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10189 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10190 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10191 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
10193 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
10194 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10195 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10196 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10197 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
10199 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
10200 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10201 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10202 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10203 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
10205 const pool_stat_t
* pstats
=
10206 mon
->pgservice
->get_pool_stat(pool_id
);
10208 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
10209 ss
<< "unable to set cache-mode '"
10210 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
10211 << "': dirty objects found";
10217 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10218 np
->cache_mode
= mode
;
10219 // set this both when moving to and from cache_mode NONE. this is to
10220 // capture legacy pools that were set up before this flag existed.
10221 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
10222 ss
<< "set cache-mode for pool '" << poolstr
10223 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
10224 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
10225 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
10227 if (base_pool
->read_tier
== pool_id
||
10228 base_pool
->write_tier
== pool_id
)
10229 ss
<<" (WARNING: pool is still configured as read or write tier)";
10231 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10232 get_last_committed() + 1));
10234 } else if (prefix
== "osd tier add-cache") {
10235 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10236 if (err
== -EAGAIN
)
10241 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10242 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10244 ss
<< "unrecognized pool '" << poolstr
<< "'";
10248 string tierpoolstr
;
10249 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
10250 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
10251 if (tierpool_id
< 0) {
10252 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
10256 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10258 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
10261 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
10266 if (!cmd_getval(g_ceph_context
, cmdmap
, "size", size
)) {
10267 ss
<< "unable to parse 'size' value '"
10268 << cmd_vartype_stringify(cmdmap
["size"]) << "'";
10272 // make sure new tier is empty
10273 const pool_stat_t
*pstats
=
10274 mon
->pgservice
->get_pool_stat(tierpool_id
);
10275 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
10276 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
10280 string modestr
= g_conf
->osd_tier_default_cache_mode
;
10281 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
10283 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
10287 HitSet::Params hsp
;
10288 if (g_conf
->osd_tier_default_cache_hit_set_type
== "bloom") {
10289 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
10290 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
10291 hsp
= HitSet::Params(bsp
);
10292 } else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_hash") {
10293 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
10295 else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_object") {
10296 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
10298 ss
<< "osd tier cache default hit set type '" <<
10299 g_conf
->osd_tier_default_cache_hit_set_type
<< "' is not a known type";
10304 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10305 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
10306 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
10307 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10310 np
->tiers
.insert(tierpool_id
);
10311 np
->read_tier
= np
->write_tier
= tierpool_id
;
10312 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
10313 np
->set_last_force_op_resend(pending_inc
.epoch
);
10314 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
10315 ntp
->tier_of
= pool_id
;
10316 ntp
->cache_mode
= mode
;
10317 ntp
->hit_set_count
= g_conf
->osd_tier_default_cache_hit_set_count
;
10318 ntp
->hit_set_period
= g_conf
->osd_tier_default_cache_hit_set_period
;
10319 ntp
->min_read_recency_for_promote
= g_conf
->osd_tier_default_cache_min_read_recency_for_promote
;
10320 ntp
->min_write_recency_for_promote
= g_conf
->osd_tier_default_cache_min_write_recency_for_promote
;
10321 ntp
->hit_set_grade_decay_rate
= g_conf
->osd_tier_default_cache_hit_set_grade_decay_rate
;
10322 ntp
->hit_set_search_last_n
= g_conf
->osd_tier_default_cache_hit_set_search_last_n
;
10323 ntp
->hit_set_params
= hsp
;
10324 ntp
->target_max_bytes
= size
;
10325 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
10326 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10327 get_last_committed() + 1));
10329 } else if (prefix
== "osd pool set-quota") {
10331 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10332 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10334 ss
<< "unrecognized pool '" << poolstr
<< "'";
10340 cmd_getval(g_ceph_context
, cmdmap
, "field", field
);
10341 if (field
!= "max_objects" && field
!= "max_bytes") {
10342 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
10347 // val could contain unit designations, so we treat as a string
10349 cmd_getval(g_ceph_context
, cmdmap
, "val", val
);
10351 int64_t value
= unit_to_bytesize(val
, &tss
);
10353 ss
<< "error parsing value '" << value
<< "': " << tss
.str();
10358 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
10359 if (field
== "max_objects") {
10360 pi
->quota_max_objects
= value
;
10361 } else if (field
== "max_bytes") {
10362 pi
->quota_max_bytes
= value
;
10364 assert(0 == "unrecognized option");
10366 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
10368 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10369 get_last_committed() + 1));
10372 } else if (prefix
== "osd reweight-by-pg" ||
10373 prefix
== "osd reweight-by-utilization" ||
10374 prefix
== "osd test-reweight-by-pg" ||
10375 prefix
== "osd test-reweight-by-utilization") {
10377 prefix
== "osd reweight-by-pg" || prefix
== "osd test-reweight-by-pg";
10379 prefix
== "osd test-reweight-by-pg" ||
10380 prefix
== "osd test-reweight-by-utilization";
10382 cmd_getval(g_ceph_context
, cmdmap
, "oload", oload
, int64_t(120));
10383 set
<int64_t> pools
;
10384 vector
<string
> poolnamevec
;
10385 cmd_getval(g_ceph_context
, cmdmap
, "pools", poolnamevec
);
10386 for (unsigned j
= 0; j
< poolnamevec
.size(); j
++) {
10387 int64_t pool
= osdmap
.lookup_pg_pool_name(poolnamevec
[j
]);
10389 ss
<< "pool '" << poolnamevec
[j
] << "' does not exist";
10393 pools
.insert(pool
);
10395 double max_change
= g_conf
->mon_reweight_max_change
;
10396 cmd_getval(g_ceph_context
, cmdmap
, "max_change", max_change
);
10397 if (max_change
<= 0.0) {
10398 ss
<< "max_change " << max_change
<< " must be positive";
10402 int64_t max_osds
= g_conf
->mon_reweight_max_osds
;
10403 cmd_getval(g_ceph_context
, cmdmap
, "max_osds", max_osds
);
10404 if (max_osds
<= 0) {
10405 ss
<< "max_osds " << max_osds
<< " must be positive";
10409 string no_increasing
;
10410 cmd_getval(g_ceph_context
, cmdmap
, "no_increasing", no_increasing
);
10412 mempool::osdmap::map
<int32_t, uint32_t> new_weights
;
10413 err
= mon
->pgservice
->reweight_by_utilization(osdmap
,
10418 pools
.empty() ? NULL
: &pools
,
10419 no_increasing
== "--no-increasing",
10421 &ss
, &out_str
, f
.get());
10423 dout(10) << "reweight::by_utilization: finished with " << out_str
<< dendl
;
10428 rdata
.append(out_str
);
10430 ss
<< "FAILED reweight-by-pg";
10431 } else if (err
== 0 || dry_run
) {
10434 ss
<< "SUCCESSFUL reweight-by-pg";
10435 pending_inc
.new_weight
= std::move(new_weights
);
10436 wait_for_finished_proposal(
10438 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
10447 if (err
< 0 && rs
.length() == 0)
10448 rs
= cpp_strerror(err
);
10449 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
10454 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10455 get_last_committed() + 1));
10459 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// Read-side (preprocess) handler for MPoolOp requests: snapshot
// create/delete (managed and unmanaged), pool delete, auid change.
// Replies immediately (via _pool_op_reply) for requests that are invalid
// or already satisfied by the current osdmap; otherwise falls through so
// prepare_pool_op() can mutate the pending map.
// NOTE(review): this chunk lost lines in extraction (gaps in the embedded
// original line numbers — the switch header, return statements and braces
// are missing); comments below describe only what the surviving lines show.
10463 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
10465 op
->mark_osdmon_event(__func__
);
10466 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
// Drop requests addressed to a different cluster (fsid mismatch) with EINVAL.
10468 if (m
->fsid
!= mon
->monmap
->fsid
) {
10469 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
10470 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
10471 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Pool creation has its own read-side validation path.
10475 if (m
->op
== POOL_OP_CREATE
)
10476 return preprocess_pool_op_create(op
);
// Operating on a pool id that does not exist is answered as a no-op
// success (ret 0) rather than an error.
10478 if (!osdmap
.get_pg_pool(m
->pool
)) {
10479 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
10480 _pool_op_reply(op
, 0, osdmap
.get_epoch());
10484 // check if the snap and snapname exist
10485 bool snap_exists
= false;
10486 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
10487 if (p
->snap_exists(m
->name
.c_str()))
10488 snap_exists
= true;
// Per-op validation against the committed map.  (NOTE(review): the
// enclosing switch statement header was dropped by extraction; each case
// below appears to reply early for invalid/idempotent requests.)
// Pool ("managed") snaps are rejected on pools in unmanaged-snaps mode
// and on tier pools.
10491 case POOL_OP_CREATE_SNAP
:
10492 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
10493 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// presumably: snap already exists -> idempotent success — confirm against
// the dropped surrounding lines.
10497 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snaps are mutually exclusive with pool-snaps mode.
10501 case POOL_OP_CREATE_UNMANAGED_SNAP
:
10502 if (p
->is_pool_snaps_mode()) {
10503 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
10507 case POOL_OP_DELETE_SNAP
:
10508 if (p
->is_unmanaged_snaps_mode()) {
10509 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Deleting a snap that is already gone is an idempotent success.
10512 if (!snap_exists
) {
10513 _pool_op_reply(op
, 0, osdmap
.get_epoch());
10517 case POOL_OP_DELETE_UNMANAGED_SNAP
:
10518 if (p
->is_pool_snaps_mode()) {
10519 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Already-removed unmanaged snap -> idempotent success.
10522 if (p
->is_removed_snap(m
->snapid
)) {
10523 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// NOTE(review): for DELETE, replying 0 when lookup_pg_pool_name(name) >= 0
// looks inverted at first glance, but the dropped surrounding lines likely
// make this the "name still resolves, proceed / already handled" branch —
// confirm against the full file.
10527 case POOL_OP_DELETE
:
10528 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
10529 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// auid change: no read-side short-circuit visible here.
10533 case POOL_OP_AUID_CHANGE
:
// Read-side validation for POOL_OP_CREATE: checks the requester's session
// and capabilities, and short-circuits if the pool name already exists.
// NOTE(review): extraction dropped some lines here (e.g. the null-session
// guard body around original line 10549); comments reflect visible code only.
10543 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
10545 op
->mark_osdmon_event(__func__
);
10546 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
10547 MonSession
*session
= m
->get_session();
// presumably inside an "if (!session)" guard (dropped line): no session
// means no authenticated caller, so refuse with EPERM.
10549 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Creating a pool requires write capability on the osd service.
10552 if (!session
->is_capable("osd", MON_CAP_W
)) {
10553 dout(5) << "attempt to create new pool without sufficient auid privileges!"
10554 << "message: " << *m
<< std::endl
10555 << "caps: " << session
->caps
<< dendl
;
10556 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// If a pool with this name already exists, creation is idempotent:
// reply success (0) without proposing anything.
10560 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
10562 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Write-side (prepare) handler for MPoolOp: dispatches CREATE/DELETE to
// dedicated helpers, re-validates snapshot ops against the committed map,
// then applies the change to a projected copy of the pool (pp) and stages
// it in pending_inc.new_pools.  The reply is deferred until the proposal
// commits (wait_for_finished_proposal + C_PoolOp).
// NOTE(review): extraction dropped many lines (both switch headers, the
// declaration of pp, ret/changed updates, break/return statements); the
// comments below only annotate the surviving lines.
10569 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
10571 op
->mark_osdmon_event(__func__
);
10572 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
10573 dout(10) << "prepare_pool_op " << *m
<< dendl
;
// CREATE and DELETE have their own prepare paths.
10574 if (m
->op
== POOL_OP_CREATE
) {
10575 return prepare_pool_op_create(op
);
10576 } else if (m
->op
== POOL_OP_DELETE
) {
10577 return prepare_pool_op_delete(op
);
// 'changed' presumably tracks whether pp was actually modified — confirm
// against the dropped lines that read it.
10581 bool changed
= false;
// The pool may have been deleted since preprocess ran: reply ENOENT.
10583 if (!osdmap
.have_pg_pool(m
->pool
)) {
10584 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
10588 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// First validation switch against the committed pool (header dropped).
10591 case POOL_OP_CREATE_SNAP
:
10592 if (pool
->is_tier()) {
10594 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
10596 } // else, fall through
10597 case POOL_OP_DELETE_SNAP
:
// For pools in managed-snaps mode, a duplicate create or a delete of a
// missing snap is answered immediately (idempotency).
10598 if (!pool
->is_unmanaged_snaps_mode()) {
10599 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
10600 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
10601 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
10609 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
10612 case POOL_OP_DELETE_UNMANAGED_SNAP
:
10613 // we won't allow removal of an unmanaged snapshot from a pool
10614 // not in unmanaged snaps mode.
10615 if (!pool
->is_unmanaged_snaps_mode()) {
10616 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
10620 case POOL_OP_CREATE_UNMANAGED_SNAP
:
10621 // but we will allow creating an unmanaged snapshot on any pool
10622 // as long as it is not in 'pool' snaps mode.
10623 if (pool
->is_pool_snaps_mode()) {
10624 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
10629 // projected pool info
// Start from any already-pending update to this pool so successive ops in
// one proposal window compose; otherwise copy the committed pool.
// (pp's declaration was dropped by extraction.)
10631 if (pending_inc
.new_pools
.count(m
->pool
))
10632 pp
= pending_inc
.new_pools
[m
->pool
];
10634 pp
= *osdmap
.get_pg_pool(m
->pool
);
// Payload returned to the client with the reply (e.g. the new snapid).
10636 bufferlist reply_data
;
10638 // pool snaps vs unmanaged snaps are mutually exclusive
// Re-check mode exclusivity against the PROJECTED pool, since a pending
// update may have flipped the mode since the committed-map checks above.
10640 case POOL_OP_CREATE_SNAP
:
10641 case POOL_OP_DELETE_SNAP
:
10642 if (pp
.is_unmanaged_snaps_mode()) {
10648 case POOL_OP_CREATE_UNMANAGED_SNAP
:
10649 case POOL_OP_DELETE_UNMANAGED_SNAP
:
10650 if (pp
.is_pool_snaps_mode()) {
// Mutation switch: apply the op to the projected pool pp (header dropped).
10657 case POOL_OP_CREATE_SNAP
:
10658 if (!pp
.snap_exists(m
->name
.c_str())) {
10659 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
10660 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
<< " seq " << pp
.get_snap_epoch() << dendl
;
10665 case POOL_OP_DELETE_SNAP
:
// snap_exists() returns the snapid_t of the named snap (0 if absent) —
// used here to locate the snap to remove.
10667 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
10675 case POOL_OP_CREATE_UNMANAGED_SNAP
:
// add_unmanaged_snap allocates the new snapid (declaration of 'snapid'
// was dropped); it is encoded into reply_data for the client.
10678 pp
.add_unmanaged_snap(snapid
);
10679 ::encode(snapid
, reply_data
);
10684 case POOL_OP_DELETE_UNMANAGED_SNAP
:
10685 if (!pp
.is_removed_snap(m
->snapid
)) {
10686 pp
.remove_unmanaged_snap(m
->snapid
);
10691 case POOL_OP_AUID_CHANGE
:
// Only stage an update when the auid actually differs (idempotency).
10692 if (pp
.auid
!= m
->auid
) {
// Stage the modified pool: bump its snap epoch to the pending epoch and
// record it in the pending incremental.
10704 pp
.set_snap_epoch(pending_inc
.epoch
);
10705 pending_inc
.new_pools
[m
->pool
] = pp
;
// Defer the reply (carrying reply_data) until the proposal commits.
10709 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// Write-side handler for POOL_OP_CREATE: stages the new pool in the
// pending incremental via prepare_new_pool() and defers the reply
// (with prepare_new_pool's result code) until the proposal commits.
10713 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
10715 op
->mark_osdmon_event(__func__
);
10716 int err
= prepare_new_pool(op
);
10717 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// Policy gate for pool deletion.  Writes a human-readable explanation to
// *ss and (per the visible final message) reports success when deletion is
// allowed.  Refusal reasons, in order: pool in use by CephFS; pool is a
// cache tier of another pool; pool has tiers of its own; the
// mon_allow_pool_delete option is off; the pool carries FLAG_NODELETE.
// NOTE(review): return statements and the ostream* parameter line were
// dropped by extraction; exact error codes per branch are not visible here.
10721 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
10724 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
10726 // If the Pool is in use by CephFS, refuse to delete it
// Checked against the PENDING fsmap so an in-flight mds change counts.
10727 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending();
10728 if (pending_fsmap
.pool_in_use(pool_id
)) {
10729 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// A pool that is a tier of another pool must be removed from the tier
// relationship first.
10733 if (pool
.tier_of
>= 0) {
10734 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
10735 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// Likewise a base pool with attached tiers cannot be removed; list them.
10738 if (!pool
.tiers
.empty()) {
10739 *ss
<< "pool '" << poolstr
<< "' has tiers";
10740 for(auto tier
: pool
.tiers
) {
10741 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Global safety switch: deletion must be explicitly enabled.
10746 if (!g_conf
->mon_allow_pool_delete
) {
10747 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
// Per-pool safety flag.
10751 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
10752 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// All checks passed: success message.
10756 *ss
<< "pool '" << poolstr
<< "' removed";
10761 * Check if it is safe to add a tier to a base pool
10764 * True if the operation should proceed, false if we should abort here
10765 * (abort doesn't necessarily mean error, could be idempotency)
// Rejection/short-circuit reasons visible below: tier pool in use by
// CephFS; relationship already established (idempotent); base pool is
// itself a tier; tier pool already has tiers; tier pool is already a tier
// of another pool.  Explanations go to *ss.
// NOTE(review): the trailing parameters (err/ss) and the return/err-setting
// lines were dropped by extraction.
10767 bool OSDMonitor::_check_become_tier(
10768 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
10769 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
10773 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
10774 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
// CephFS pools may not be repurposed as cache tiers (pending fsmap check).
10776 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
10777 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
10778 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
// Already a tier of this base: treat as idempotent (asserting the inverse
// link is consistent).
10783 if (base_pool
->tiers
.count(tier_pool_id
)) {
10784 assert(tier_pool
->tier_of
== base_pool_id
);
10786 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
10787 << base_pool_name
<< "'";
// Chained tiering (base is itself a tier) is unsupported.
10791 if (base_pool
->is_tier()) {
10792 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
10793 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
10794 << "multiple tiers are not yet supported.";
// A tier that already has its own tiers cannot be attached either.
10799 if (tier_pool
->has_tiers()) {
10800 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
10801 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
10802 it
!= tier_pool
->tiers
.end(); ++it
)
10803 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
10804 *ss
<< " multiple tiers are not yet supported.";
// Nor can a pool that is already serving as a tier of a different base.
10809 if (tier_pool
->is_tier()) {
10810 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
10811 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
10822 * Check if it is safe to remove a tier from this base pool
10825 * True if the operation should proceed, false if we should abort here
10826 * (abort doesn't necessarily mean error, could be idempotency)
// Only CephFS-related refusals are visible in this chunk: an erasure-coded
// base pool reached by CephFS through its replicated tier, and a tier
// still acting as a writeback cache for a CephFS base pool.  Explanations
// go to *ss; the error code is returned through *err (setting lines were
// dropped by extraction).
10828 bool OSDMonitor::_check_remove_tier(
10829 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
10830 const pg_pool_t
*tier_pool
,
10831 int *err
, ostream
*ss
) const
10833 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
10835 // Apply CephFS-specific checks
// Pending fsmap, so an in-flight mds change is respected.
10836 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
10837 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
10838 if (base_pool
->type
!= pg_pool_t::TYPE_REPLICATED
) {
10839 // If the underlying pool is erasure coded, we can't permit the
10840 // removal of the replicated tier that CephFS relies on to access it
10841 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS via its tier";
// A writeback cache tier may hold dirty data CephFS still needs; the
// operator must change the cache mode and flush before removal.
// tier_pool may be null, hence the guard.
10846 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
10847 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
10848 "tier is still in use as a writeback cache. Change the cache "
10849 "mode and flush the cache before removing it";
// Stages removal of a pool in pending_inc.  Validates via
// _check_remove_pool (against both the committed pool and any pending
// replacement), is idempotent if removal is already pending, optionally
// "fake-deletes" by renaming to '<name>.<id>.DELETED' when
// mon_fake_pool_delete is set (unless no_fake), and scrubs every
// pool-scoped mapping (pg_temp, primary_temp, pg_upmap, pg_upmap_items)
// from the pending incremental.
// NOTE(review): extraction dropped lines here (return statements, loop
// increments, closing braces); comments annotate the surviving lines only.
10859 int OSDMonitor::_prepare_remove_pool(
10860 int64_t pool
, ostream
*ss
, bool no_fake
)
10862 dout(10) << __func__
<< " " << pool
<< dendl
;
// Validate against the committed map first.
10863 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
10864 int r
= _check_remove_pool(pool
, *p
, ss
);
// If this pool already has a pending update, validate against that
// projected state too.
10868 auto new_pool
= pending_inc
.new_pools
.find(pool
);
10869 if (new_pool
!= pending_inc
.new_pools
.end()) {
10870 // if there is a problem with the pending info, wait and retry
10872 const auto& p
= new_pool
->second
;
10873 int r
= _check_remove_pool(pool
, p
, ss
);
// Removal already staged: nothing more to do (idempotent).
10878 if (pending_inc
.old_pools
.count(pool
)) {
10879 dout(10) << __func__
<< " " << pool
<< " already pending removal"
// Fake deletion: rename instead of destroying, so data is recoverable.
10884 if (g_conf
->mon_fake_pool_delete
&& !no_fake
) {
10885 string old_name
= osdmap
.get_pool_name(pool
);
10886 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
10887 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
10888 << old_name
<< " -> " << new_name
<< dendl
;
10889 pending_inc
.new_pool_names
[pool
] = new_name
;
// Real removal: mark the pool deleted in the pending incremental.
10894 pending_inc
.old_pools
.insert(pool
);
10896 // remove any pg_temp mappings for this pool
// An empty vector in new_pg_temp means "clear this pg_temp entry".
10897 for (auto p
= osdmap
.pg_temp
->begin();
10898 p
!= osdmap
.pg_temp
->end();
10900 if (p
->first
.pool() == (uint64_t)pool
) {
10901 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
10902 << p
->first
<< dendl
;
10903 pending_inc
.new_pg_temp
[p
->first
].clear();
10906 // remove any primary_temp mappings for this pool
// -1 in new_primary_temp means "clear the primary_temp override".
10907 for (auto p
= osdmap
.primary_temp
->begin();
10908 p
!= osdmap
.primary_temp
->end();
10910 if (p
->first
.pool() == (uint64_t)pool
) {
10911 dout(10) << __func__
<< " " << pool
10912 << " removing obsolete primary_temp" << p
->first
<< dendl
;
10913 pending_inc
.new_primary_temp
[p
->first
] = -1;
10916 // remove any pg_upmap mappings for this pool
10917 for (auto& p
: osdmap
.pg_upmap
) {
10918 if (p
.first
.pool() == (uint64_t)pool
) {
10919 dout(10) << __func__
<< " " << pool
10920 << " removing obsolete pg_upmap "
10921 << p
.first
<< dendl
;
10922 pending_inc
.old_pg_upmap
.insert(p
.first
);
10925 // remove any pg_upmap_items mappings for this pool
10926 for (auto& p
: osdmap
.pg_upmap_items
) {
10927 if (p
.first
.pool() == (uint64_t)pool
) {
10928 dout(10) << __func__
<< " " << pool
10929 << " removing obsolete pg_upmap_items " << p
.first
10931 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
// Stages a pool rename in pending_inc.new_pool_names.  Refuses if the
// pool is pending removal, and scans pending renames so two different
// pools cannot be renamed to the same name in one proposal window.
// NOTE(review): extraction dropped the return statements and the
// duplicate-name error handling body; error codes are not visible here.
10937 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
10939 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
// Cannot rename a pool whose removal is already staged.
10940 if (pending_inc
.old_pools
.count(pool
)) {
10941 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
// Conflict check: another pool pending a rename to the same new name.
10944 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
10945 p
!= pending_inc
.new_pool_names
.end();
10947 if (p
->second
== newname
&& p
->first
!= pool
) {
// Stage the rename.
10952 pending_inc
.new_pool_names
[pool
] = newname
;
// Write-side handler for POOL_OP_DELETE: stages the removal via
// _prepare_remove_pool (real deletion, no_fake=false so fake-delete may
// apply), retries the whole message after the current proposal if the
// pending state was inconsistent (-EAGAIN), and otherwise defers the
// reply until the proposal commits.
10956 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
10958 op
->mark_osdmon_event(__func__
);
10959 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
10961 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
// EAGAIN: pending pool info was problematic; requeue the op to be
// re-dispatched once the in-flight proposal lands.
10962 if (ret
== -EAGAIN
) {
10963 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10967 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
// Reply (with ret and the pending epoch) after the map change commits.
10968 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
10969 pending_inc
.epoch
));
// Sends an MPoolOpReply for the given pool-op request: echoes the
// request's fsid and tid, and carries the result code, the relevant map
// epoch, the last committed version, and an optional data payload (blp,
// e.g. a newly allocated snapid).  Ownership of 'reply' passes to
// mon->send_reply.
10973 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
10974 int ret
, epoch_t epoch
, bufferlist
*blp
)
10976 op
->mark_osdmon_event(__func__
);
10977 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
10978 dout(20) << "_pool_op_reply " << ret
<< dendl
;
10979 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
10980 ret
, epoch
, get_last_committed(), blp
);
10981 mon
->send_reply(op
, reply
);