1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
20 #include <boost/algorithm/string.hpp>
24 #include "mon/OSDMonitor.h"
25 #include "mon/Monitor.h"
26 #include "mon/MDSMonitor.h"
27 #include "mon/PGMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreated.h"
51 #include "messages/MOSDPGTemp.h"
52 #include "messages/MMonCommand.h"
53 #include "messages/MRemoveSnaps.h"
54 #include "messages/MOSDScrub.h"
55 #include "messages/MRoute.h"
57 #include "common/TextTable.h"
58 #include "common/Timer.h"
59 #include "common/ceph_argparse.h"
60 #include "common/perf_counters.h"
61 #include "common/strtol.h"
63 #include "common/config.h"
64 #include "common/errno.h"
66 #include "erasure-code/ErasureCodePlugin.h"
67 #include "compressor/Compressor.h"
68 #include "common/Checksummer.h"
70 #include "include/compat.h"
71 #include "include/assert.h"
72 #include "include/stringify.h"
73 #include "include/util.h"
74 #include "common/cmdparse.h"
75 #include "include/str_list.h"
76 #include "include/str_map.h"
77 #include "include/scope_guard.h"
79 #include "json_spirit/json_spirit_reader.h"
81 #include <boost/algorithm/string/predicate.hpp>
83 #define dout_subsys ceph_subsys_mon
84 static const string
OSD_PG_CREATING_PREFIX("osd_pg_creating");
85 static const string
OSD_METADATA_PREFIX("osd_metadata");
89 const uint32_t MAX_POOL_APPLICATIONS
= 4;
90 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
91 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
93 } // anonymous namespace
95 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
97 if (epoch_by_pg
.size() <= ps
) {
98 epoch_by_pg
.resize(ps
+ 1, 0);
100 const auto old_lec
= epoch_by_pg
[ps
];
101 if (old_lec
>= last_epoch_clean
) {
105 epoch_by_pg
[ps
] = last_epoch_clean
;
106 if (last_epoch_clean
< floor
) {
107 floor
= last_epoch_clean
;
108 } else if (last_epoch_clean
> floor
) {
109 if (old_lec
== floor
) {
110 // probably should increase floor?
111 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
112 std::end(epoch_by_pg
));
116 if (ps
!= next_missing
) {
119 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
120 if (epoch_by_pg
[next_missing
] == 0) {
126 void LastEpochClean::remove_pool(uint64_t pool
)
128 report_by_pool
.erase(pool
);
131 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
133 auto& lec
= report_by_pool
[pg
.pool()];
134 return lec
.report(pg
.ps(), last_epoch_clean
);
137 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
139 auto floor
= latest
.get_epoch();
140 for (auto& pool
: latest
.get_pools()) {
141 auto reported
= report_by_pool
.find(pool
.first
);
142 if (reported
== report_by_pool
.end()) {
145 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
148 if (reported
->second
.floor
< floor
) {
149 floor
= reported
->second
.floor
;
156 struct C_UpdateCreatingPGs
: public Context
{
160 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
161 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
162 void finish(int r
) override
{
164 utime_t end
= ceph_clock_now();
165 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
166 << (end
- start
) << " seconds" << dendl
;
167 osdmon
->update_creating_pgs();
168 osdmon
->check_pg_creates_subs();
174 #define dout_prefix _prefix(_dout, mon, osdmap)
175 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
176 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
177 << "(" << mon
->get_state_name()
178 << ").osd e" << osdmap
.get_epoch() << " ";
181 OSDMonitor::OSDMonitor(
185 const string
& service_name
)
186 : PaxosService(mn
, p
, service_name
),
188 inc_osd_cache(g_conf
->mon_osd_cache_size
),
189 full_osd_cache(g_conf
->mon_osd_cache_size
),
190 last_attempted_minwait_time(utime_t()),
191 mapper(mn
->cct
, &mn
->cpu_tp
),
192 op_tracker(cct
, true, 1)
195 bool OSDMonitor::_have_pending_crush()
197 return pending_inc
.crush
.length() > 0;
200 CrushWrapper
&OSDMonitor::_get_stable_crush()
202 return *osdmap
.crush
;
205 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
208 if (pending_inc
.crush
.length())
209 bl
= pending_inc
.crush
;
211 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
213 bufferlist::iterator p
= bl
.begin();
217 void OSDMonitor::create_initial()
219 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
224 mon
->store
->get("mkfs", "osdmap", bl
);
228 newmap
.set_fsid(mon
->monmap
->fsid
);
230 newmap
.build_simple(g_ceph_context
, 0, mon
->monmap
->fsid
, 0);
233 newmap
.created
= newmap
.modified
= ceph_clock_now();
235 // new clusters should sort bitwise by default.
236 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
238 // new cluster should require latest by default
239 if (g_conf
->mon_debug_no_require_luminous
) {
240 newmap
.require_osd_release
= CEPH_RELEASE_KRAKEN
;
241 derr
<< __func__
<< " mon_debug_no_require_luminous=true" << dendl
;
243 newmap
.require_osd_release
= CEPH_RELEASE_LUMINOUS
;
245 CEPH_OSDMAP_RECOVERY_DELETES
|
246 CEPH_OSDMAP_PURGED_SNAPDIRS
;
247 newmap
.full_ratio
= g_conf
->mon_osd_full_ratio
;
248 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
249 newmap
.backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
250 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
251 newmap
.nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
252 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
253 int r
= ceph_release_from_name(
254 g_conf
->mon_osd_initial_require_min_compat_client
.c_str());
256 assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
258 newmap
.require_min_compat_client
= r
;
261 // encode into pending incremental
262 newmap
.encode(pending_inc
.fullmap
,
263 mon
->get_quorum_con_features() | CEPH_FEATURE_RESERVED
);
264 pending_inc
.full_crc
= newmap
.get_crc();
265 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
268 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
)
270 s
.insert(service_name
);
271 s
.insert(OSD_PG_CREATING_PREFIX
);
272 s
.insert(OSD_METADATA_PREFIX
);
275 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
277 version_t version
= get_last_committed();
278 if (version
== osdmap
.epoch
)
280 assert(version
> osdmap
.epoch
);
282 dout(15) << "update_from_paxos paxos e " << version
283 << ", my e " << osdmap
.epoch
<< dendl
;
286 if (!mapping_job
->is_done()) {
287 dout(1) << __func__
<< " mapping job "
288 << mapping_job
.get() << " did not complete, "
289 << mapping_job
->shards
<< " left, canceling" << dendl
;
290 mapping_job
->abort();
298 * We will possibly have a stashed latest that *we* wrote, and we will
299 * always be sure to have the oldest full map in the first..last range
300 * due to encode_trim_extra(), which includes the oldest full map in the trim
303 * encode_trim_extra() does not however write the full map's
304 * version to 'full_latest'. This is only done when we are building the
305 * full maps from the incremental versions. But don't panic! We make sure
306 * that the following conditions find whichever full map version is newer.
308 version_t latest_full
= get_version_latest_full();
309 if (latest_full
== 0 && get_first_committed() > 1)
310 latest_full
= get_first_committed();
312 if (get_first_committed() > 1 &&
313 latest_full
< get_first_committed()) {
314 // the monitor could be just sync'ed with its peer, and the latest_full key
315 // is not encoded in the paxos commits in encode_pending(), so we need to
316 // make sure we get it pointing to a proper version.
317 version_t lc
= get_last_committed();
318 version_t fc
= get_first_committed();
320 dout(10) << __func__
<< " looking for valid full map in interval"
321 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
324 for (version_t v
= lc
; v
>= fc
; v
--) {
325 string full_key
= "full_" + stringify(v
);
326 if (mon
->store
->exists(get_service_name(), full_key
)) {
327 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
333 assert(latest_full
> 0);
334 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
335 put_version_latest_full(t
, latest_full
);
336 mon
->store
->apply_transaction(t
);
337 dout(10) << __func__
<< " updated the on-disk full map version to "
338 << latest_full
<< dendl
;
341 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
342 bufferlist latest_bl
;
343 get_version_full(latest_full
, latest_bl
);
344 assert(latest_bl
.length() != 0);
345 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
346 osdmap
.decode(latest_bl
);
349 if (mon
->monmap
->get_required_features().contains_all(
350 ceph::features::mon::FEATURE_LUMINOUS
)) {
352 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
354 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
355 creating_pgs
.decode(p
);
356 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
357 << creating_pgs
.last_scan_epoch
358 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
360 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
365 // make sure we're using the right pg service.. remove me post-luminous!
366 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
367 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
368 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
370 dout(10) << __func__
<< " pgservice is pg" << dendl
;
371 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
374 // walk through incrementals
375 MonitorDBStore::TransactionRef t
;
377 while (version
> osdmap
.epoch
) {
379 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
381 assert(inc_bl
.length());
383 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
385 OSDMap::Incremental
inc(inc_bl
);
386 err
= osdmap
.apply_incremental(inc
);
390 t
.reset(new MonitorDBStore::Transaction
);
392 // Write out the full map for all past epochs. Encode the full
393 // map with the same features as the incremental. If we don't
394 // know, use the quorum features. If we don't know those either,
395 // encode with all features.
396 uint64_t f
= inc
.encode_features
;
398 f
= mon
->get_quorum_con_features();
402 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
403 tx_size
+= full_bl
.length();
405 bufferlist orig_full_bl
;
406 get_version_full(osdmap
.epoch
, orig_full_bl
);
407 if (orig_full_bl
.length()) {
408 // the primary provided the full map
409 assert(inc
.have_crc
);
410 if (inc
.full_crc
!= osdmap
.crc
) {
411 // This will happen if the mons were running mixed versions in
412 // the past or some other circumstance made the full encoded
413 // maps divergent. Reloading here will bring us back into
414 // sync with the primary for this and all future maps. OSDs
415 // will also be brought back into sync when they discover the
416 // crc mismatch and request a full map from a mon.
417 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
420 osdmap
.decode(orig_full_bl
);
423 assert(!inc
.have_crc
);
424 put_version_full(t
, osdmap
.epoch
, full_bl
);
426 put_version_latest_full(t
, osdmap
.epoch
);
429 dout(1) << osdmap
<< dendl
;
431 if (osdmap
.epoch
== 1) {
432 t
->erase("mkfs", "osdmap");
435 // make sure we're using the right pg service.. remove me post-luminous!
436 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
437 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
438 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
440 dout(10) << __func__
<< " pgservice is pg" << dendl
;
441 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
444 if (tx_size
> g_conf
->mon_sync_max_payload_size
*2) {
445 mon
->store
->apply_transaction(t
);
446 t
= MonitorDBStore::TransactionRef();
449 if (mon
->monmap
->get_required_features().contains_all(
450 ceph::features::mon::FEATURE_LUMINOUS
)) {
451 for (const auto &osd_state
: inc
.new_state
) {
452 if (osd_state
.second
& CEPH_OSD_UP
) {
453 // could be marked up *or* down, but we're too lazy to check which
454 last_osd_report
.erase(osd_state
.first
);
456 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
457 // could be created *or* destroyed, but we can safely drop it
458 osd_epochs
.erase(osd_state
.first
);
465 mon
->store
->apply_transaction(t
);
468 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
469 if (osdmap
.is_out(o
))
471 auto found
= down_pending_out
.find(o
);
472 if (osdmap
.is_down(o
)) {
473 // populate down -> out map
474 if (found
== down_pending_out
.end()) {
475 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
476 down_pending_out
[o
] = ceph_clock_now();
479 if (found
!= down_pending_out
.end()) {
480 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
481 down_pending_out
.erase(found
);
485 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
487 if (mon
->is_leader()) {
488 // kick pgmon, make sure it's seen the latest map
489 mon
->pgmon()->check_osd_map(osdmap
.epoch
);
493 check_pg_creates_subs();
495 share_map_with_random_osd();
500 // make sure our feature bits reflect the latest map
501 update_msgr_features();
503 if (!mon
->is_leader()) {
504 // will be called by on_active() on the leader, avoid doing so twice
509 void OSDMonitor::start_mapping()
511 // initiate mapping job
513 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
515 mapping_job
->abort();
517 if (!osdmap
.get_pools().empty()) {
518 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
519 mapping_job
= mapping
.start_update(osdmap
, mapper
,
520 g_conf
->mon_osd_mapping_pgs_per_chunk
);
521 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
522 << " at " << fin
->start
<< dendl
;
523 mapping_job
->set_finish_event(fin
);
525 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
526 mapping_job
= nullptr;
530 void OSDMonitor::update_msgr_features()
533 types
.insert((int)entity_name_t::TYPE_OSD
);
534 types
.insert((int)entity_name_t::TYPE_CLIENT
);
535 types
.insert((int)entity_name_t::TYPE_MDS
);
536 types
.insert((int)entity_name_t::TYPE_MON
);
537 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
539 uint64_t features
= osdmap
.get_features(*q
, &mask
);
540 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
541 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
542 Messenger::Policy p
= mon
->messenger
->get_policy(*q
);
543 p
.features_required
= (p
.features_required
& ~mask
) | features
;
544 mon
->messenger
->set_policy(*q
, p
);
549 void OSDMonitor::on_active()
553 if (mon
->is_leader()) {
554 mon
->clog
->debug() << "osdmap " << osdmap
;
556 list
<MonOpRequestRef
> ls
;
557 take_all_failures(ls
);
558 while (!ls
.empty()) {
559 MonOpRequestRef op
= ls
.front();
560 op
->mark_osdmon_event(__func__
);
568 void OSDMonitor::on_restart()
570 last_osd_report
.clear();
573 void OSDMonitor::on_shutdown()
575 dout(10) << __func__
<< dendl
;
577 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
579 mapping_job
->abort();
582 // discard failure info, waiters
583 list
<MonOpRequestRef
> ls
;
584 take_all_failures(ls
);
588 void OSDMonitor::update_logger()
590 dout(10) << "update_logger" << dendl
;
592 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
593 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
594 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
595 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
598 void OSDMonitor::create_pending()
600 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
601 pending_inc
.fsid
= mon
->monmap
->fsid
;
603 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
605 // clean up pg_temp, primary_temp
606 OSDMap::clean_temps(g_ceph_context
, osdmap
, &pending_inc
);
607 dout(10) << "create_pending did clean_temps" << dendl
;
609 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
610 // instead of osd_backfill_full_ratio config
611 if (osdmap
.backfillfull_ratio
<= 0) {
612 pending_inc
.new_backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
613 if (pending_inc
.new_backfillfull_ratio
> 1.0)
614 pending_inc
.new_backfillfull_ratio
/= 100;
615 dout(1) << __func__
<< " setting backfillfull_ratio = "
616 << pending_inc
.new_backfillfull_ratio
<< dendl
;
618 if (osdmap
.get_epoch() > 0 &&
619 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
620 // transition full ratios from PGMap to OSDMap (on upgrade)
621 float full_ratio
= mon
->pgservice
->get_full_ratio();
622 float nearfull_ratio
= mon
->pgservice
->get_nearfull_ratio();
623 if (osdmap
.full_ratio
!= full_ratio
) {
624 dout(10) << __func__
<< " full_ratio " << osdmap
.full_ratio
625 << " -> " << full_ratio
<< " (from pgmap)" << dendl
;
626 pending_inc
.new_full_ratio
= full_ratio
;
628 if (osdmap
.nearfull_ratio
!= nearfull_ratio
) {
629 dout(10) << __func__
<< " nearfull_ratio " << osdmap
.nearfull_ratio
630 << " -> " << nearfull_ratio
<< " (from pgmap)" << dendl
;
631 pending_inc
.new_nearfull_ratio
= nearfull_ratio
;
634 // safety check (this shouldn't really happen)
635 if (osdmap
.full_ratio
<= 0) {
636 pending_inc
.new_full_ratio
= g_conf
->mon_osd_full_ratio
;
637 if (pending_inc
.new_full_ratio
> 1.0)
638 pending_inc
.new_full_ratio
/= 100;
639 dout(1) << __func__
<< " setting full_ratio = "
640 << pending_inc
.new_full_ratio
<< dendl
;
642 if (osdmap
.nearfull_ratio
<= 0) {
643 pending_inc
.new_nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
644 if (pending_inc
.new_nearfull_ratio
> 1.0)
645 pending_inc
.new_nearfull_ratio
/= 100;
646 dout(1) << __func__
<< " setting nearfull_ratio = "
647 << pending_inc
.new_nearfull_ratio
<< dendl
;
651 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
653 if (osdmap
.crush
->has_legacy_rule_ids()) {
654 CrushWrapper newcrush
;
655 _get_pending_crush(newcrush
);
657 // First, for all pools, work out which rule they really used
658 // by resolving ruleset to rule.
659 for (const auto &i
: osdmap
.get_pools()) {
660 const auto pool_id
= i
.first
;
661 const auto &pool
= i
.second
;
662 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
663 pool
.type
, pool
.size
);
665 dout(1) << __func__
<< " rewriting pool "
666 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
667 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
668 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
669 pending_inc
.new_pools
[pool_id
] = pool
;
671 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
674 // Now, go ahead and renumber all the rules so that their
675 // rule_id field corresponds to their position in the array
676 auto old_to_new
= newcrush
.renumber_rules();
677 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
678 for (const auto &i
: old_to_new
) {
679 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
681 pending_inc
.crush
.clear();
682 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
687 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
688 const OSDMap
& nextmap
)
690 dout(10) << __func__
<< dendl
;
691 creating_pgs_t pending_creatings
;
693 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
694 pending_creatings
= creating_pgs
;
696 // check for new or old pools
697 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
698 if (osdmap
.get_epoch() &&
699 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
701 mon
->pgservice
->maybe_add_creating_pgs(creating_pgs
.last_scan_epoch
,
704 dout(7) << __func__
<< " " << added
<< " pgs added from pgmap" << dendl
;
707 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
711 queued
+= scan_for_creating_pgs(inc
.new_pools
,
715 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
716 for (auto deleted_pool
: inc
.old_pools
) {
717 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
718 dout(10) << __func__
<< " " << removed
719 << " pg removed because containing pool deleted: "
720 << deleted_pool
<< dendl
;
721 last_epoch_clean
.remove_pool(deleted_pool
);
723 // pgmon updates its creating_pgs in check_osd_map() which is called by
724 // on_active() and check_osd_map() could be delayed if lease expires, so its
725 // creating_pgs could be stale in comparison with the one of osdmon. let's
726 // trim them here. otherwise, they will be added back after being erased.
727 unsigned removed
= 0;
728 for (auto& pg
: pending_created_pgs
) {
729 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
730 pending_creatings
.created_pools
.insert(pg
.pool());
731 removed
+= pending_creatings
.pgs
.erase(pg
);
733 pending_created_pgs
.clear();
734 dout(10) << __func__
<< " " << removed
735 << " pgs removed because they're created" << dendl
;
736 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
739 // filter out any pgs that shouldn't exist.
741 auto i
= pending_creatings
.pgs
.begin();
742 while (i
!= pending_creatings
.pgs
.end()) {
743 if (!nextmap
.pg_exists(i
->first
)) {
744 dout(10) << __func__
<< " removing pg " << i
->first
745 << " which should not exist" << dendl
;
746 i
= pending_creatings
.pgs
.erase(i
);
754 unsigned max
= MAX(1, g_conf
->mon_osd_max_creating_pgs
);
755 const auto total
= pending_creatings
.pgs
.size();
756 while (pending_creatings
.pgs
.size() < max
&&
757 !pending_creatings
.queue
.empty()) {
758 auto p
= pending_creatings
.queue
.begin();
759 int64_t poolid
= p
->first
;
760 dout(10) << __func__
<< " pool " << poolid
761 << " created " << p
->second
.created
762 << " modified " << p
->second
.modified
763 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
765 int n
= MIN(max
- pending_creatings
.pgs
.size(),
766 p
->second
.end
- p
->second
.start
);
767 ps_t first
= p
->second
.start
;
768 ps_t end
= first
+ n
;
769 for (ps_t ps
= first
; ps
< end
; ++ps
) {
770 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
771 // NOTE: use the *current* epoch as the PG creation epoch so that the
772 // OSD does not have to generate a long set of PastIntervals.
773 pending_creatings
.pgs
.emplace(pgid
, make_pair(inc
.epoch
,
774 p
->second
.modified
));
775 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
777 p
->second
.start
= end
;
778 if (p
->second
.done()) {
779 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
780 pending_creatings
.queue
.erase(p
);
782 dout(10) << __func__
<< " pool " << poolid
783 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
787 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
788 << " pools" << dendl
;
790 << " " << (pending_creatings
.pgs
.size() - total
)
791 << "/" << pending_creatings
.pgs
.size()
792 << " pgs added from queued pools" << dendl
;
793 return pending_creatings
;
796 void OSDMonitor::maybe_prime_pg_temp()
799 if (pending_inc
.crush
.length()) {
800 dout(10) << __func__
<< " new crush map, all" << dendl
;
804 if (!pending_inc
.new_up_client
.empty()) {
805 dout(10) << __func__
<< " new up osds, all" << dendl
;
809 // check for interesting OSDs
811 for (auto p
= pending_inc
.new_state
.begin();
812 !all
&& p
!= pending_inc
.new_state
.end();
814 if ((p
->second
& CEPH_OSD_UP
) &&
815 osdmap
.is_up(p
->first
)) {
816 osds
.insert(p
->first
);
819 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
820 !all
&& p
!= pending_inc
.new_weight
.end();
822 if (p
->second
< osdmap
.get_weight(p
->first
)) {
824 osds
.insert(p
->first
);
826 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
832 if (!all
&& osds
.empty())
837 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
838 if (estimate
> mapping
.get_num_pgs() *
839 g_conf
->mon_osd_prime_pg_temp_max_estimate
) {
840 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
841 << osds
.size() << " osds >= "
842 << g_conf
->mon_osd_prime_pg_temp_max_estimate
<< " of total "
843 << mapping
.get_num_pgs() << " pgs, all"
847 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
848 << osds
.size() << " osds" << dendl
;
853 next
.deepish_copy_from(osdmap
);
854 next
.apply_incremental(pending_inc
);
856 if (next
.get_pools().empty()) {
857 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
859 PrimeTempJob
job(next
, this);
860 mapper
.queue(&job
, g_conf
->mon_osd_mapping_pgs_per_chunk
);
861 if (job
.wait_for(g_conf
->mon_osd_prime_pg_temp_max_time
)) {
862 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
864 dout(10) << __func__
<< " did not finish in "
865 << g_conf
->mon_osd_prime_pg_temp_max_time
866 << ", stopping" << dendl
;
870 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
871 utime_t stop
= ceph_clock_now();
872 stop
+= g_conf
->mon_osd_prime_pg_temp_max_time
;
873 const int chunk
= 1000;
875 std::unordered_set
<pg_t
> did_pgs
;
876 for (auto osd
: osds
) {
877 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
878 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
879 for (auto pgid
: pgs
) {
880 if (!did_pgs
.insert(pgid
).second
) {
883 prime_pg_temp(next
, pgid
);
886 if (ceph_clock_now() > stop
) {
887 dout(10) << __func__
<< " consumed more than "
888 << g_conf
->mon_osd_prime_pg_temp_max_time
889 << " seconds, stopping"
899 void OSDMonitor::prime_pg_temp(
903 if (mon
->monmap
->get_required_features().contains_all(
904 ceph::features::mon::FEATURE_LUMINOUS
)) {
905 // TODO: remove this creating_pgs direct access?
906 if (creating_pgs
.pgs
.count(pgid
)) {
910 if (mon
->pgservice
->is_creating_pg(pgid
)) {
914 if (!osdmap
.pg_exists(pgid
)) {
918 vector
<int> up
, acting
;
919 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
921 vector
<int> next_up
, next_acting
;
922 int next_up_primary
, next_acting_primary
;
923 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
924 &next_acting
, &next_acting_primary
);
925 if (acting
== next_acting
&& next_up
!= next_acting
)
926 return; // no change since last epoch
929 return; // if previously empty now we can be no worse off
930 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
931 if (pool
&& acting
.size() < pool
->min_size
)
932 return; // can be no worse off than before
934 if (next_up
== next_acting
) {
936 dout(20) << __func__
<< "next_up === next_acting now, clear pg_temp"
940 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
941 << " -> " << next_up
<< "/" << next_acting
942 << ", priming " << acting
945 Mutex::Locker
l(prime_pg_temp_lock
);
946 // do not touch a mapping if a change is pending
947 pending_inc
.new_pg_temp
.emplace(
949 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
954 * @note receiving a transaction in this function gives a fair amount of
955 * freedom to the service implementation if it does need it. It shouldn't.
957 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
959 dout(10) << "encode_pending e " << pending_inc
.epoch
962 // finalize up pending_inc
963 pending_inc
.modified
= ceph_clock_now();
965 int r
= pending_inc
.propagate_snaps_to_tiers(g_ceph_context
, osdmap
);
969 if (!mapping_job
->is_done()) {
970 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
971 << mapping_job
.get() << " did not complete, "
972 << mapping_job
->shards
<< " left" << dendl
;
973 mapping_job
->abort();
974 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
975 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
976 << mapping_job
.get() << " is prior epoch "
977 << mapping
.get_epoch() << dendl
;
979 if (g_conf
->mon_osd_prime_pg_temp
) {
980 maybe_prime_pg_temp();
983 } else if (g_conf
->mon_osd_prime_pg_temp
) {
984 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
989 // ensure we don't have blank new_state updates. these are interrpeted as
990 // CEPH_OSD_UP (and almost certainly not what we want!).
991 auto p
= pending_inc
.new_state
.begin();
992 while (p
!= pending_inc
.new_state
.end()) {
993 if (p
->second
== 0) {
994 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
995 p
= pending_inc
.new_state
.erase(p
);
1005 tmp
.deepish_copy_from(osdmap
);
1006 tmp
.apply_incremental(pending_inc
);
1008 if (tmp
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1009 // remove any legacy osdmap nearfull/full flags
1011 if (tmp
.test_flag(CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
1012 dout(10) << __func__
<< " clearing legacy osdmap nearfull/full flag"
1014 remove_flag(CEPH_OSDMAP_NEARFULL
);
1015 remove_flag(CEPH_OSDMAP_FULL
);
1018 // collect which pools are currently affected by
1019 // the near/backfill/full osd(s),
1020 // and set per-pool near/backfill/full flag instead
1021 set
<int64_t> full_pool_ids
;
1022 set
<int64_t> backfillfull_pool_ids
;
1023 set
<int64_t> nearfull_pool_ids
;
1024 tmp
.get_full_pools(g_ceph_context
,
1026 &backfillfull_pool_ids
,
1027 &nearfull_pool_ids
);
1028 if (full_pool_ids
.empty() ||
1029 backfillfull_pool_ids
.empty() ||
1030 nearfull_pool_ids
.empty()) {
1031 // normal case - no nearfull, backfillfull or full osds
1032 // try cancel any improper nearfull/backfillfull/full pool
1034 for (auto &pool
: tmp
.get_pools()) {
1035 auto p
= pool
.first
;
1036 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1037 nearfull_pool_ids
.empty()) {
1038 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1039 << "'s nearfull flag" << dendl
;
1040 if (pending_inc
.new_pools
.count(p
) == 0) {
1041 // load original pool info first!
1042 pending_inc
.new_pools
[p
] = pool
.second
;
1044 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1046 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1047 backfillfull_pool_ids
.empty()) {
1048 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1049 << "'s backfillfull flag" << dendl
;
1050 if (pending_inc
.new_pools
.count(p
) == 0) {
1051 pending_inc
.new_pools
[p
] = pool
.second
;
1053 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1055 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1056 full_pool_ids
.empty()) {
1057 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1058 // set by EQUOTA, skipping
1061 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1062 << "'s full flag" << dendl
;
1063 if (pending_inc
.new_pools
.count(p
) == 0) {
1064 pending_inc
.new_pools
[p
] = pool
.second
;
1066 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1070 if (!full_pool_ids
.empty()) {
1071 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1072 << " as full" << dendl
;
1073 for (auto &p
: full_pool_ids
) {
1074 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1077 if (pending_inc
.new_pools
.count(p
) == 0) {
1078 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1080 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1081 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1082 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1084 // cancel FLAG_FULL for pools which are no longer full too
1085 for (auto &pool
: tmp
.get_pools()) {
1086 auto p
= pool
.first
;
1087 if (full_pool_ids
.count(p
)) {
1088 // skip pools we have just marked as full above
1091 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1092 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1093 // don't touch if currently is not full
1094 // or is running out of quota (and hence considered as full)
1097 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1098 << "'s full flag" << dendl
;
1099 if (pending_inc
.new_pools
.count(p
) == 0) {
1100 pending_inc
.new_pools
[p
] = pool
.second
;
1102 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1105 if (!backfillfull_pool_ids
.empty()) {
1106 for (auto &p
: backfillfull_pool_ids
) {
1107 if (full_pool_ids
.count(p
)) {
1108 // skip pools we have already considered as full above
1111 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1112 // make sure FLAG_FULL is truly set, so we are safe not
1113 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1114 assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1117 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1118 // don't bother if pool is already marked as backfillfull
1121 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1122 << "'s as backfillfull" << dendl
;
1123 if (pending_inc
.new_pools
.count(p
) == 0) {
1124 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1126 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1127 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1129 // cancel FLAG_BACKFILLFULL for pools
1130 // which are no longer backfillfull too
1131 for (auto &pool
: tmp
.get_pools()) {
1132 auto p
= pool
.first
;
1133 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1134 // skip pools we have just marked as backfillfull/full above
1137 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1138 // and don't touch if currently is not backfillfull
1141 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1142 << "'s backfillfull flag" << dendl
;
1143 if (pending_inc
.new_pools
.count(p
) == 0) {
1144 pending_inc
.new_pools
[p
] = pool
.second
;
1146 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1149 if (!nearfull_pool_ids
.empty()) {
1150 for (auto &p
: nearfull_pool_ids
) {
1151 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1154 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
1155 // make sure FLAG_FULL is truly set, so we are safe not
1156 // to set a extra (redundant) FLAG_NEARFULL flag
1157 assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1160 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1161 // don't bother if pool is already marked as nearfull
1164 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1165 << "'s as nearfull" << dendl
;
1166 if (pending_inc
.new_pools
.count(p
) == 0) {
1167 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1169 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1171 // cancel FLAG_NEARFULL for pools
1172 // which are no longer nearfull too
1173 for (auto &pool
: tmp
.get_pools()) {
1174 auto p
= pool
.first
;
1175 if (full_pool_ids
.count(p
) ||
1176 backfillfull_pool_ids
.count(p
) ||
1177 nearfull_pool_ids
.count(p
)) {
1178 // skip pools we have just marked as
1179 // nearfull/backfillfull/full above
1182 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1183 // and don't touch if currently is not nearfull
1186 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1187 << "'s nearfull flag" << dendl
;
1188 if (pending_inc
.new_pools
.count(p
) == 0) {
1189 pending_inc
.new_pools
[p
] = pool
.second
;
1191 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1195 // min_compat_client?
1196 if (tmp
.require_min_compat_client
== 0) {
1197 auto mv
= tmp
.get_min_compat_client();
1198 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1199 << "required " << ceph_release_name(mv
) << dendl
;
1200 mon
->clog
->info() << "setting require_min_compat_client to currently "
1201 << "required " << ceph_release_name(mv
);
1202 pending_inc
.new_require_min_compat_client
= mv
;
1205 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1206 // convert ec profile ruleset-* -> crush-*
1207 for (auto& p
: tmp
.erasure_code_profiles
) {
1208 bool changed
= false;
1209 map
<string
,string
> newprofile
;
1210 for (auto& q
: p
.second
) {
1211 if (q
.first
.find("ruleset-") == 0) {
1212 string key
= "crush-";
1213 key
+= q
.first
.substr(8);
1214 newprofile
[key
] = q
.second
;
1216 dout(20) << " updating ec profile " << p
.first
1217 << " key " << q
.first
<< " -> " << key
<< dendl
;
1219 newprofile
[q
.first
] = q
.second
;
1223 dout(10) << " updated ec profile " << p
.first
<< ": "
1224 << newprofile
<< dendl
;
1225 pending_inc
.new_erasure_code_profiles
[p
.first
] = newprofile
;
1229 // auto-enable pool applications upon upgrade
1230 // NOTE: this can be removed post-Luminous assuming upgrades need to
1231 // proceed through Luminous
1232 for (auto &pool_pair
: tmp
.pools
) {
1233 int64_t pool_id
= pool_pair
.first
;
1234 pg_pool_t pg_pool
= pool_pair
.second
;
1235 if (pg_pool
.is_tier()) {
1239 std::string pool_name
= tmp
.get_pool_name(pool_id
);
1240 uint32_t match_count
= 0;
1243 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending();
1244 if (pending_fsmap
.pool_in_use(pool_id
)) {
1245 dout(10) << __func__
<< " auto-enabling CephFS on pool '"
1246 << pool_name
<< "'" << dendl
;
1247 pg_pool
.application_metadata
.insert(
1248 {pg_pool_t::APPLICATION_NAME_CEPHFS
, {}});
1252 // RBD heuristics (default OpenStack pool names from docs and
1254 if (boost::algorithm::contains(pool_name
, "rbd") ||
1255 pool_name
== "images" || pool_name
== "volumes" ||
1256 pool_name
== "backups" || pool_name
== "vms") {
1257 dout(10) << __func__
<< " auto-enabling RBD on pool '"
1258 << pool_name
<< "'" << dendl
;
1259 pg_pool
.application_metadata
.insert(
1260 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
1265 if (boost::algorithm::contains(pool_name
, ".rgw") ||
1266 boost::algorithm::contains(pool_name
, ".log") ||
1267 boost::algorithm::contains(pool_name
, ".intent-log") ||
1268 boost::algorithm::contains(pool_name
, ".usage") ||
1269 boost::algorithm::contains(pool_name
, ".users")) {
1270 dout(10) << __func__
<< " auto-enabling RGW on pool '"
1271 << pool_name
<< "'" << dendl
;
1272 pg_pool
.application_metadata
.insert(
1273 {pg_pool_t::APPLICATION_NAME_RGW
, {}});
1277 // OpenStack gnocchi (from ceph-ansible)
1278 if (pool_name
== "metrics" && match_count
== 0) {
1279 dout(10) << __func__
<< " auto-enabling OpenStack Gnocchi on pool '"
1280 << pool_name
<< "'" << dendl
;
1281 pg_pool
.application_metadata
.insert({"openstack_gnocchi", {}});
1285 if (match_count
== 1) {
1286 pg_pool
.last_change
= pending_inc
.epoch
;
1287 pending_inc
.new_pools
[pool_id
] = pg_pool
;
1288 } else if (match_count
> 1) {
1289 auto pstat
= mon
->pgservice
->get_pool_stat(pool_id
);
1290 if (pstat
!= nullptr && pstat
->stats
.sum
.num_objects
> 0) {
1291 mon
->clog
->info() << "unable to auto-enable application for pool "
1292 << "'" << pool_name
<< "'";
1301 for (auto i
= pending_inc
.new_state
.begin();
1302 i
!= pending_inc
.new_state
.end();
1304 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1305 if (s
& CEPH_OSD_UP
)
1306 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1307 if (s
& CEPH_OSD_EXISTS
)
1308 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1310 for (map
<int32_t,entity_addr_t
>::iterator i
= pending_inc
.new_up_client
.begin();
1311 i
!= pending_inc
.new_up_client
.end();
1313 //FIXME: insert cluster addresses too
1314 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1316 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1317 i
!= pending_inc
.new_weight
.end();
1319 if (i
->second
== CEPH_OSD_OUT
) {
1320 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1321 } else if (i
->second
== CEPH_OSD_IN
) {
1322 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1324 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1328 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1329 osdmap
.maybe_remove_pg_upmaps(cct
, osdmap
, &pending_inc
);
1331 // features for osdmap and its incremental
1332 uint64_t features
= mon
->get_quorum_con_features();
1334 // encode full map and determine its crc
1337 tmp
.deepish_copy_from(osdmap
);
1338 tmp
.apply_incremental(pending_inc
);
1340 // determine appropriate features
1341 if (tmp
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1342 dout(10) << __func__
<< " encoding without feature SERVER_LUMINOUS"
1344 features
&= ~CEPH_FEATURE_SERVER_LUMINOUS
;
1346 if (tmp
.require_osd_release
< CEPH_RELEASE_KRAKEN
) {
1347 dout(10) << __func__
<< " encoding without feature SERVER_KRAKEN | "
1348 << "MSG_ADDR2" << dendl
;
1349 features
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
1350 CEPH_FEATURE_MSG_ADDR2
);
1352 if (tmp
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
1353 dout(10) << __func__
<< " encoding without feature SERVER_JEWEL" << dendl
;
1354 features
&= ~CEPH_FEATURE_SERVER_JEWEL
;
1356 dout(10) << __func__
<< " encoding full map with " << features
<< dendl
;
1359 ::encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1360 pending_inc
.full_crc
= tmp
.get_crc();
1362 // include full map in the txn. note that old monitors will
1363 // overwrite this. new ones will now skip the local full map
1364 // encode and reload from this.
1365 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1369 assert(get_last_committed() + 1 == pending_inc
.epoch
);
1370 ::encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1372 dout(20) << " full_crc " << tmp
.get_crc()
1373 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1375 /* put everything in the transaction */
1376 put_version(t
, pending_inc
.epoch
, bl
);
1377 put_last_committed(t
, pending_inc
.epoch
);
1380 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1381 p
!= pending_metadata
.end();
1383 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1384 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1385 p
!= pending_metadata_rm
.end();
1387 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1388 pending_metadata
.clear();
1389 pending_metadata_rm
.clear();
1391 // and pg creating, also!
1392 if (mon
->monmap
->get_required_features().contains_all(
1393 ceph::features::mon::FEATURE_LUMINOUS
)) {
1394 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1395 if (osdmap
.get_epoch() &&
1396 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1397 dout(7) << __func__
<< " in the middle of upgrading, "
1398 << " trimming pending creating_pgs using pgmap" << dendl
;
1399 mon
->pgservice
->maybe_trim_creating_pgs(&pending_creatings
);
1401 bufferlist creatings_bl
;
1402 ::encode(pending_creatings
, creatings_bl
);
1403 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1407 health_check_map_t next
;
1408 tmp
.check_health(&next
);
1409 encode_health(next
, t
);
1412 void OSDMonitor::trim_creating_pgs(creating_pgs_t
* creating_pgs
,
1413 const ceph::unordered_map
<pg_t
,pg_stat_t
>& pg_stat
)
1415 auto p
= creating_pgs
->pgs
.begin();
1416 while (p
!= creating_pgs
->pgs
.end()) {
1417 auto q
= pg_stat
.find(p
->first
);
1418 if (q
!= pg_stat
.end() &&
1419 !(q
->second
.state
& PG_STATE_CREATING
)) {
1420 dout(20) << __func__
<< " pgmap shows " << p
->first
<< " is created"
1422 p
= creating_pgs
->pgs
.erase(p
);
1429 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
1432 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
1436 bufferlist::iterator p
= bl
.begin();
1439 catch (buffer::error
& e
) {
1441 *err
<< "osd." << osd
<< " metadata is corrupt";
1447 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
1449 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
1450 if (osdmap
.is_up(osd
)) {
1451 map
<string
,string
> meta
;
1452 load_metadata(osd
, meta
, nullptr);
1453 auto p
= meta
.find(field
);
1454 if (p
== meta
.end()) {
1455 (*out
)["unknown"]++;
1457 (*out
)[p
->second
]++;
1463 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
1465 map
<string
,int> by_val
;
1466 count_metadata(field
, &by_val
);
1467 f
->open_object_section(field
.c_str());
1468 for (auto& p
: by_val
) {
1469 f
->dump_int(p
.first
.c_str(), p
.second
);
1474 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
1476 map
<string
, string
> metadata
;
1477 int r
= load_metadata(osd
, metadata
, nullptr);
1481 auto it
= metadata
.find("osd_objectstore");
1482 if (it
== metadata
.end())
1488 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
1489 const pg_pool_t
&pool
,
1492 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1493 // since filestore osds could always join the pool later
1494 set
<int> checked_osds
;
1495 for (unsigned ps
= 0; ps
< MIN(8, pool
.get_pg_num()); ++ps
) {
1496 vector
<int> up
, acting
;
1497 pg_t
pgid(ps
, pool_id
, -1);
1498 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
1499 for (int osd
: up
) {
1500 if (checked_osds
.find(osd
) != checked_osds
.end())
1502 string objectstore_type
;
1503 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
1504 // allow with missing metadata, e.g. due to an osd never booting yet
1505 if (r
< 0 || objectstore_type
== "bluestore") {
1506 checked_osds
.insert(osd
);
1509 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
1516 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
1518 map
<string
,string
> m
;
1519 if (int r
= load_metadata(osd
, m
, err
))
1521 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
1522 f
->dump_string(p
->first
.c_str(), p
->second
);
1526 void OSDMonitor::print_nodes(Formatter
*f
)
1528 // group OSDs by their hosts
1529 map
<string
, list
<int> > osds
; // hostname => osd
1530 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
1531 map
<string
, string
> m
;
1532 if (load_metadata(osd
, m
, NULL
)) {
1535 map
<string
, string
>::iterator hostname
= m
.find("hostname");
1536 if (hostname
== m
.end()) {
1537 // not likely though
1540 osds
[hostname
->second
].push_back(osd
);
1543 dump_services(f
, osds
, "osd");
1546 void OSDMonitor::share_map_with_random_osd()
1548 if (osdmap
.get_num_up_osds() == 0) {
1549 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
1553 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
1555 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
1559 dout(10) << "committed, telling random " << s
->inst
<< " all about it" << dendl
;
1560 // whatev, they'll request more if they need it
1561 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch());
1562 s
->con
->send_message(m
);
1563 // NOTE: do *not* record osd has up to this epoch (as we do
1564 // elsewhere) as they may still need to request older values.
1567 version_t
OSDMonitor::get_trim_to()
1569 if (mon
->get_quorum().empty()) {
1570 dout(10) << __func__
<< ": quorum not formed" << dendl
;
1575 if (mon
->monmap
->get_required_features().contains_all(
1576 ceph::features::mon::FEATURE_LUMINOUS
)) {
1578 // TODO: Get this hidden in PGStatService
1579 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1580 if (!creating_pgs
.pgs
.empty()) {
1584 floor
= get_min_last_epoch_clean();
1586 if (!mon
->pgservice
->is_readable())
1588 if (mon
->pgservice
->have_creating_pgs()) {
1591 floor
= mon
->pgservice
->get_min_last_epoch_clean();
1594 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
1595 if (g_conf
->mon_osd_force_trim_to
> 0 &&
1596 g_conf
->mon_osd_force_trim_to
< (int)get_last_committed()) {
1597 floor
= g_conf
->mon_osd_force_trim_to
;
1598 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
1600 unsigned min
= g_conf
->mon_min_osdmap_epochs
;
1601 if (floor
+ min
> get_last_committed()) {
1602 if (min
< get_last_committed())
1603 floor
= get_last_committed() - min
;
1607 if (floor
> get_first_committed())
1613 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
1615 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
1616 // also scan osd epochs
1617 // don't trim past the oldest reported osd epoch
1618 for (auto& osd_epoch
: osd_epochs
) {
1619 if (osd_epoch
.second
< floor
) {
1620 floor
= osd_epoch
.second
;
1626 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
1629 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
1631 get_version_full(first
, bl
);
1632 put_version_full(tx
, first
, bl
);
1637 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
1639 op
->mark_osdmon_event(__func__
);
1640 Message
*m
= op
->get_req();
1641 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1643 switch (m
->get_type()) {
1645 case MSG_MON_COMMAND
:
1646 return preprocess_command(op
);
1647 case CEPH_MSG_MON_GET_OSDMAP
:
1648 return preprocess_get_osdmap(op
);
1651 case MSG_OSD_MARK_ME_DOWN
:
1652 return preprocess_mark_me_down(op
);
1654 return preprocess_full(op
);
1655 case MSG_OSD_FAILURE
:
1656 return preprocess_failure(op
);
1658 return preprocess_boot(op
);
1660 return preprocess_alive(op
);
1661 case MSG_OSD_PG_CREATED
:
1662 return preprocess_pg_created(op
);
1663 case MSG_OSD_PGTEMP
:
1664 return preprocess_pgtemp(op
);
1665 case MSG_OSD_BEACON
:
1666 return preprocess_beacon(op
);
1668 case CEPH_MSG_POOLOP
:
1669 return preprocess_pool_op(op
);
1671 case MSG_REMOVE_SNAPS
:
1672 return preprocess_remove_snaps(op
);
1680 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
1682 op
->mark_osdmon_event(__func__
);
1683 Message
*m
= op
->get_req();
1684 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1686 switch (m
->get_type()) {
1688 case MSG_OSD_MARK_ME_DOWN
:
1689 return prepare_mark_me_down(op
);
1691 return prepare_full(op
);
1692 case MSG_OSD_FAILURE
:
1693 return prepare_failure(op
);
1695 return prepare_boot(op
);
1697 return prepare_alive(op
);
1698 case MSG_OSD_PG_CREATED
:
1699 return prepare_pg_created(op
);
1700 case MSG_OSD_PGTEMP
:
1701 return prepare_pgtemp(op
);
1702 case MSG_OSD_BEACON
:
1703 return prepare_beacon(op
);
1705 case MSG_MON_COMMAND
:
1706 return prepare_command(op
);
1708 case CEPH_MSG_POOLOP
:
1709 return prepare_pool_op(op
);
1711 case MSG_REMOVE_SNAPS
:
1712 return prepare_remove_snaps(op
);
1722 bool OSDMonitor::should_propose(double& delay
)
1724 dout(10) << "should_propose" << dendl
;
1726 // if full map, propose immediately! any subsequent changes will be clobbered.
1727 if (pending_inc
.fullmap
.length())
1730 // adjust osd weights?
1731 if (!osd_weight
.empty() &&
1732 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
1733 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
1734 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
1740 // propose as fast as possible if updating up_thru or pg_temp
1741 // want to merge OSDMap changes as much as possible
1742 if ((pending_inc
.new_primary_temp
.size() == 1
1743 || pending_inc
.new_up_thru
.size() == 1)
1744 && pending_inc
.new_state
.size() < 2) {
1745 dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl
;
1747 utime_t now
= ceph_clock_now();
1748 if (now
- last_attempted_minwait_time
> g_conf
->paxos_propose_interval
1749 && now
- paxos
->get_last_commit_time() > g_conf
->paxos_min_wait
) {
1750 delay
= g_conf
->paxos_min_wait
;
1751 last_attempted_minwait_time
= now
;
1756 return PaxosService::should_propose(delay
);
1761 // ---------------------------
1764 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
1766 op
->mark_osdmon_event(__func__
);
1767 MMonGetOSDMap
*m
= static_cast<MMonGetOSDMap
*>(op
->get_req());
1768 dout(10) << __func__
<< " " << *m
<< dendl
;
1769 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
);
1770 epoch_t first
= get_first_committed();
1771 epoch_t last
= osdmap
.get_epoch();
1772 int max
= g_conf
->osd_map_message_max
;
1773 for (epoch_t e
= MAX(first
, m
->get_full_first());
1774 e
<= MIN(last
, m
->get_full_last()) && max
> 0;
1776 int r
= get_version_full(e
, reply
->maps
[e
]);
1779 for (epoch_t e
= MAX(first
, m
->get_inc_first());
1780 e
<= MIN(last
, m
->get_inc_last()) && max
> 0;
1782 int r
= get_version(e
, reply
->incremental_maps
[e
]);
1785 reply
->oldest_map
= first
;
1786 reply
->newest_map
= last
;
1787 mon
->send_reply(op
, reply
);
1792 // ---------------------------
1797 bool OSDMonitor::check_source(PaxosServiceMessage
*m
, uuid_d fsid
) {
1798 // check permissions
1799 MonSession
*session
= m
->get_session();
1802 if (!session
->is_capable("osd", MON_CAP_X
)) {
1803 dout(0) << "got MOSDFailure from entity with insufficient caps "
1804 << session
->caps
<< dendl
;
1807 if (fsid
!= mon
->monmap
->fsid
) {
1808 dout(0) << "check_source: on fsid " << fsid
1809 << " != " << mon
->monmap
->fsid
<< dendl
;
1816 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
1818 op
->mark_osdmon_event(__func__
);
1819 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1820 // who is target_osd
1821 int badboy
= m
->get_target().name
.num();
1823 // check permissions
1824 if (check_source(m
, m
->fsid
))
1827 // first, verify the reporting host is valid
1828 if (m
->get_orig_source().is_osd()) {
1829 int from
= m
->get_orig_source().num();
1830 if (!osdmap
.exists(from
) ||
1831 osdmap
.get_addr(from
) != m
->get_orig_source_inst().addr
||
1832 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
1833 dout(5) << "preprocess_failure from dead osd." << from
<< ", ignoring" << dendl
;
1834 send_incremental(op
, m
->get_epoch()+1);
1841 if (osdmap
.is_down(badboy
)) {
1842 dout(5) << "preprocess_failure dne(/dup?): " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1843 if (m
->get_epoch() < osdmap
.get_epoch())
1844 send_incremental(op
, m
->get_epoch()+1);
1847 if (osdmap
.get_inst(badboy
) != m
->get_target()) {
1848 dout(5) << "preprocess_failure wrong osd: report " << m
->get_target() << " != map's " << osdmap
.get_inst(badboy
)
1849 << ", from " << m
->get_orig_source_inst() << dendl
;
1850 if (m
->get_epoch() < osdmap
.get_epoch())
1851 send_incremental(op
, m
->get_epoch()+1);
1855 // already reported?
1856 if (osdmap
.is_down(badboy
) ||
1857 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
1858 dout(5) << "preprocess_failure dup/old: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1859 if (m
->get_epoch() < osdmap
.get_epoch())
1860 send_incremental(op
, m
->get_epoch()+1);
1864 if (!can_mark_down(badboy
)) {
1865 dout(5) << "preprocess_failure ignoring report of " << m
->get_target() << " from " << m
->get_orig_source_inst() << dendl
;
1869 dout(10) << "preprocess_failure new: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1876 class C_AckMarkedDown
: public C_MonOp
{
1882 : C_MonOp(op
), osdmon(osdmon
) {}
1884 void _finish(int) override
{
1885 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1886 osdmon
->mon
->send_reply(
1892 false)); // ACK itself does not request an ack
1894 ~C_AckMarkedDown() override
{
1898 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
1900 op
->mark_osdmon_event(__func__
);
1901 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1902 int requesting_down
= m
->get_target().name
.num();
1903 int from
= m
->get_orig_source().num();
1905 // check permissions
1906 if (check_source(m
, m
->fsid
))
1909 // first, verify the reporting host is valid
1910 if (!m
->get_orig_source().is_osd())
1913 if (!osdmap
.exists(from
) ||
1914 osdmap
.is_down(from
) ||
1915 osdmap
.get_addr(from
) != m
->get_target().addr
) {
1916 dout(5) << "preprocess_mark_me_down from dead osd."
1917 << from
<< ", ignoring" << dendl
;
1918 send_incremental(op
, m
->get_epoch()+1);
1922 // no down might be set
1923 if (!can_mark_down(requesting_down
))
1926 dout(10) << "MOSDMarkMeDown for: " << m
->get_target() << dendl
;
1930 if (m
->request_ack
) {
1931 Context
*c(new C_AckMarkedDown(this, op
));
1937 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
1939 op
->mark_osdmon_event(__func__
);
1940 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1941 int target_osd
= m
->get_target().name
.num();
1943 assert(osdmap
.is_up(target_osd
));
1944 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
1946 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
1947 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1949 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
1953 bool OSDMonitor::can_mark_down(int i
)
1955 if (osdmap
.test_flag(CEPH_OSDMAP_NODOWN
)) {
1956 dout(5) << __func__
<< " NODOWN flag set, will not mark osd." << i
1957 << " down" << dendl
;
1961 if (osdmap
.is_nodown(i
)) {
1962 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
1963 << "will not mark it down" << dendl
;
1967 int num_osds
= osdmap
.get_num_osds();
1968 if (num_osds
== 0) {
1969 dout(5) << __func__
<< " no osds" << dendl
;
1972 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
1973 float up_ratio
= (float)up
/ (float)num_osds
;
1974 if (up_ratio
< g_conf
->mon_osd_min_up_ratio
) {
1975 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
1976 << g_conf
->mon_osd_min_up_ratio
1977 << ", will not mark osd." << i
<< " down" << dendl
;
1983 bool OSDMonitor::can_mark_up(int i
)
1985 if (osdmap
.test_flag(CEPH_OSDMAP_NOUP
)) {
1986 dout(5) << __func__
<< " NOUP flag set, will not mark osd." << i
1991 if (osdmap
.is_noup(i
)) {
1992 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
1993 << "will not mark it up" << dendl
;
2001 * @note the parameter @p i apparently only exists here so we can output the
2002 * osd's id on messages.
2004 bool OSDMonitor::can_mark_out(int i
)
2006 if (osdmap
.test_flag(CEPH_OSDMAP_NOOUT
)) {
2007 dout(5) << __func__
<< " NOOUT flag set, will not mark osds out" << dendl
;
2011 if (osdmap
.is_noout(i
)) {
2012 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
2013 << "will not mark it out" << dendl
;
2017 int num_osds
= osdmap
.get_num_osds();
2018 if (num_osds
== 0) {
2019 dout(5) << __func__
<< " no osds" << dendl
;
2022 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
2023 float in_ratio
= (float)in
/ (float)num_osds
;
2024 if (in_ratio
< g_conf
->mon_osd_min_in_ratio
) {
2026 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
2027 << g_conf
->mon_osd_min_in_ratio
2028 << ", will not mark osd." << i
<< " out" << dendl
;
2030 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
2031 << g_conf
->mon_osd_min_in_ratio
2032 << ", will not mark osds out" << dendl
;
2039 bool OSDMonitor::can_mark_in(int i
)
2041 if (osdmap
.test_flag(CEPH_OSDMAP_NOIN
)) {
2042 dout(5) << __func__
<< " NOIN flag set, will not mark osd." << i
2047 if (osdmap
.is_noin(i
)) {
2048 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
2049 << "will not mark it in" << dendl
;
2056 bool OSDMonitor::check_failures(utime_t now
)
2058 bool found_failure
= false;
2059 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2060 p
!= failure_info
.end();
2062 if (can_mark_down(p
->first
)) {
2063 found_failure
|= check_failure(now
, p
->first
, p
->second
);
2066 return found_failure
;
2069 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
2071 // already pending failure?
2072 if (pending_inc
.new_state
.count(target_osd
) &&
2073 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
2074 dout(10) << " already pending failure" << dendl
;
2078 set
<string
> reporters_by_subtree
;
2079 string reporter_subtree_level
= g_conf
->mon_osd_reporter_subtree_level
;
2080 utime_t
orig_grace(g_conf
->osd_heartbeat_grace
, 0);
2081 utime_t max_failed_since
= fi
.get_failed_since();
2082 utime_t failed_for
= now
- max_failed_since
;
2084 utime_t grace
= orig_grace
;
2085 double my_grace
= 0, peer_grace
= 0;
2087 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
2088 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
2089 decay_k
= ::log(.5) / halflife
;
2091 // scale grace period based on historical probability of 'lagginess'
2092 // (false positive failures due to slowness).
2093 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
2094 double decay
= exp((double)failed_for
* decay_k
);
2095 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
2096 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
2097 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
2101 // consider the peers reporting a failure a proxy for a potential
2102 // 'subcluster' over the overall cluster that is similarly
2103 // laggy. this is clearly not true in all cases, but will sometimes
2104 // help us localize the grace correction to a subset of the system
2105 // (say, a rack with a bad switch) that is unhappy.
2106 assert(fi
.reporters
.size());
2107 for (map
<int,failure_reporter_t
>::iterator p
= fi
.reporters
.begin();
2108 p
!= fi
.reporters
.end();
2110 // get the parent bucket whose type matches with "reporter_subtree_level".
2111 // fall back to OSD if the level doesn't exist.
2112 map
<string
, string
> reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
2113 map
<string
, string
>::iterator iter
= reporter_loc
.find(reporter_subtree_level
);
2114 if (iter
== reporter_loc
.end()) {
2115 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
2117 reporters_by_subtree
.insert(iter
->second
);
2119 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
2120 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
2121 utime_t elapsed
= now
- xi
.down_stamp
;
2122 double decay
= exp((double)elapsed
* decay_k
);
2123 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
2127 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
2128 peer_grace
/= (double)fi
.reporters
.size();
2129 grace
+= peer_grace
;
2132 dout(10) << " osd." << target_osd
<< " has "
2133 << fi
.reporters
.size() << " reporters, "
2134 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
2135 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
2138 if (failed_for
>= grace
&&
2139 (int)reporters_by_subtree
.size() >= g_conf
->mon_osd_min_down_reporters
) {
2140 dout(1) << " we have enough reporters to mark osd." << target_osd
2141 << " down" << dendl
;
2142 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2144 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
2145 << osdmap
.crush
->get_full_location_ordered_string(
2148 << (int)reporters_by_subtree
.size()
2149 << " reporters from different "
2150 << reporter_subtree_level
<< " after "
2151 << failed_for
<< " >= grace " << grace
<< ")";
2157 void OSDMonitor::force_failure(int target_osd
, int by
)
2159 // already pending failure?
2160 if (pending_inc
.new_state
.count(target_osd
) &&
2161 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
2162 dout(10) << " already pending failure" << dendl
;
2166 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
2167 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2169 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
2170 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
2171 << ") (connection refused reported by osd." << by
<< ")";
2175 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
2177 op
->mark_osdmon_event(__func__
);
2178 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
2179 dout(1) << "prepare_failure " << m
->get_target()
2180 << " from " << m
->get_orig_source_inst()
2181 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
2183 int target_osd
= m
->get_target().name
.num();
2184 int reporter
= m
->get_orig_source().num();
2185 assert(osdmap
.is_up(target_osd
));
2186 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
2188 if (m
->if_osd_failed()) {
2189 // calculate failure time
2190 utime_t now
= ceph_clock_now();
2191 utime_t failed_since
=
2192 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
2195 if (m
->is_immediate()) {
2196 mon
->clog
->debug() << m
->get_target() << " reported immediately failed by "
2197 << m
->get_orig_source_inst();
2198 force_failure(target_osd
, reporter
);
2202 mon
->clog
->debug() << m
->get_target() << " reported failed by "
2203 << m
->get_orig_source_inst();
2205 failure_info_t
& fi
= failure_info
[target_osd
];
2206 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
2208 mon
->no_reply(old_op
);
2211 return check_failure(now
, target_osd
, fi
);
2213 // remove the report
2214 mon
->clog
->debug() << m
->get_target() << " failure report canceled by "
2215 << m
->get_orig_source_inst();
2216 if (failure_info
.count(target_osd
)) {
2217 failure_info_t
& fi
= failure_info
[target_osd
];
2218 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
2220 mon
->no_reply(report_op
);
2222 if (fi
.reporters
.empty()) {
2223 dout(10) << " removing last failure_info for osd." << target_osd
2225 failure_info
.erase(target_osd
);
2227 dout(10) << " failure_info for osd." << target_osd
<< " now "
2228 << fi
.reporters
.size() << " reporters" << dendl
;
2231 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
2239 void OSDMonitor::process_failures()
2241 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2242 while (p
!= failure_info
.end()) {
2243 if (osdmap
.is_up(p
->first
)) {
2246 dout(10) << "process_failures osd." << p
->first
<< dendl
;
2247 list
<MonOpRequestRef
> ls
;
2248 p
->second
.take_report_messages(ls
);
2249 failure_info
.erase(p
++);
2251 while (!ls
.empty()) {
2252 MonOpRequestRef o
= ls
.front();
2254 o
->mark_event(__func__
);
2255 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
2256 send_latest(o
, m
->get_epoch());
2264 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
2266 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
2268 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2269 p
!= failure_info
.end();
2271 p
->second
.take_report_messages(ls
);
2273 failure_info
.clear();
2279 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
2281 op
->mark_osdmon_event(__func__
);
2282 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2283 int from
= m
->get_orig_source_inst().name
.num();
2285 // check permissions, ignore if failed (no response expected)
2286 MonSession
*session
= m
->get_session();
2289 if (!session
->is_capable("osd", MON_CAP_X
)) {
2290 dout(0) << "got preprocess_boot message from entity with insufficient caps"
2291 << session
->caps
<< dendl
;
2295 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
2296 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
2297 << " != " << mon
->monmap
->fsid
<< dendl
;
2301 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
2302 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
2306 assert(m
->get_orig_source_inst().name
.is_osd());
2308 // check if osd has required features to boot
2309 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2310 CEPH_FEATURE_OSD_ERASURE_CODES
) &&
2311 !(m
->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES
)) {
2312 dout(0) << __func__
<< " osdmap requires erasure code but osd at "
2313 << m
->get_orig_source_inst()
2314 << " doesn't announce support -- ignore" << dendl
;
2318 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2319 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
) &&
2320 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
)) {
2321 dout(0) << __func__
<< " osdmap requires erasure code plugins v2 but osd at "
2322 << m
->get_orig_source_inst()
2323 << " doesn't announce support -- ignore" << dendl
;
2327 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2328 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
) &&
2329 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
)) {
2330 dout(0) << __func__
<< " osdmap requires erasure code plugins v3 but osd at "
2331 << m
->get_orig_source_inst()
2332 << " doesn't announce support -- ignore" << dendl
;
2336 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
2337 !HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
2338 mon
->clog
->info() << "disallowing boot of OSD "
2339 << m
->get_orig_source_inst()
2340 << " because the osdmap requires"
2341 << " CEPH_FEATURE_SERVER_LUMINOUS"
2342 << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
2346 if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
&&
2347 !(m
->osd_features
& CEPH_FEATURE_SERVER_JEWEL
)) {
2348 mon
->clog
->info() << "disallowing boot of OSD "
2349 << m
->get_orig_source_inst()
2350 << " because the osdmap requires"
2351 << " CEPH_FEATURE_SERVER_JEWEL"
2352 << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
2356 if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
&&
2357 !HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
2358 mon
->clog
->info() << "disallowing boot of OSD "
2359 << m
->get_orig_source_inst()
2360 << " because the osdmap requires"
2361 << " CEPH_FEATURE_SERVER_KRAKEN"
2362 << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
2366 if (osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
2367 !(m
->osd_features
& CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
2368 mon
->clog
->info() << "disallowing boot of OSD "
2369 << m
->get_orig_source_inst()
2370 << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
2374 if (osdmap
.test_flag(CEPH_OSDMAP_RECOVERY_DELETES
) &&
2375 !(m
->osd_features
& CEPH_FEATURE_OSD_RECOVERY_DELETES
)) {
2376 mon
->clog
->info() << "disallowing boot of OSD "
2377 << m
->get_orig_source_inst()
2378 << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
2382 if (any_of(osdmap
.get_pools().begin(),
2383 osdmap
.get_pools().end(),
2384 [](const std::pair
<int64_t,pg_pool_t
>& pool
)
2385 { return pool
.second
.use_gmt_hitset
; })) {
2386 assert(osdmap
.get_num_up_osds() == 0 ||
2387 osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
);
2388 if (!(m
->osd_features
& CEPH_FEATURE_OSD_HITSET_GMT
)) {
2389 dout(0) << __func__
<< " one or more pools uses GMT hitsets but osd at "
2390 << m
->get_orig_source_inst()
2391 << " doesn't announce support -- ignore" << dendl
;
2396 // make sure upgrades stop at luminous
2397 if (HAVE_FEATURE(m
->osd_features
, SERVER_M
) &&
2398 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
2399 mon
->clog
->info() << "disallowing boot of post-luminous OSD "
2400 << m
->get_orig_source_inst()
2401 << " because require_osd_release < luminous";
2405 // make sure upgrades stop at jewel
2406 if (HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
) &&
2407 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
2408 mon
->clog
->info() << "disallowing boot of post-jewel OSD "
2409 << m
->get_orig_source_inst()
2410 << " because require_osd_release < jewel";
2414 // make sure upgrades stop at hammer
2415 // * HAMMER_0_94_4 is the required hammer feature
2416 // * MON_METADATA is the first post-hammer feature
2417 if (osdmap
.get_num_up_osds() > 0) {
2418 if ((m
->osd_features
& CEPH_FEATURE_MON_METADATA
) &&
2419 !(osdmap
.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4
)) {
2420 mon
->clog
->info() << "disallowing boot of post-hammer OSD "
2421 << m
->get_orig_source_inst()
2422 << " because one or more up OSDs is pre-hammer v0.94.4";
2425 if (!(m
->osd_features
& CEPH_FEATURE_HAMMER_0_94_4
) &&
2426 (osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_METADATA
)) {
2427 mon
->clog
->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
2428 << m
->get_orig_source_inst()
2429 << " because all up OSDs are post-hammer";
2435 if (osdmap
.is_up(from
) &&
2436 osdmap
.get_inst(from
) == m
->get_orig_source_inst() &&
2437 osdmap
.get_cluster_addr(from
) == m
->cluster_addr
) {
2439 dout(7) << "preprocess_boot dup from " << m
->get_orig_source_inst()
2440 << " == " << osdmap
.get_inst(from
) << dendl
;
2445 if (osdmap
.exists(from
) &&
2446 !osdmap
.get_uuid(from
).is_zero() &&
2447 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2448 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
2449 << " clashes with existing osd: different fsid"
2450 << " (ours: " << osdmap
.get_uuid(from
)
2451 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
2455 if (osdmap
.exists(from
) &&
2456 osdmap
.get_info(from
).up_from
> m
->version
&&
2457 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) {
2458 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
2459 send_latest(op
, m
->sb
.current_epoch
+1);
2464 if (!can_mark_up(from
)) {
2465 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
2466 send_latest(op
, m
->sb
.current_epoch
+1);
2470 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
2477 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
2479 op
->mark_osdmon_event(__func__
);
2480 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2481 dout(7) << __func__
<< " from " << m
->get_orig_source_inst() << " sb " << m
->sb
2482 << " cluster_addr " << m
->cluster_addr
2483 << " hb_back_addr " << m
->hb_back_addr
2484 << " hb_front_addr " << m
->hb_front_addr
2487 assert(m
->get_orig_source().is_osd());
2488 int from
= m
->get_orig_source().num();
2490 // does this osd exist?
2491 if (from
>= osdmap
.get_max_osd()) {
2492 dout(1) << "boot from osd." << from
<< " >= max_osd "
2493 << osdmap
.get_max_osd() << dendl
;
2497 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
2498 if (pending_inc
.new_state
.count(from
))
2499 oldstate
^= pending_inc
.new_state
[from
];
2501 // already up? mark down first?
2502 if (osdmap
.is_up(from
)) {
2503 dout(7) << __func__
<< " was up, first marking down "
2504 << osdmap
.get_inst(from
) << dendl
;
2505 // preprocess should have caught these; if not, assert.
2506 assert(osdmap
.get_inst(from
) != m
->get_orig_source_inst() ||
2507 osdmap
.get_cluster_addr(from
) != m
->cluster_addr
);
2508 assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
2510 if (pending_inc
.new_state
.count(from
) == 0 ||
2511 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
2512 // mark previous guy down
2513 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
2515 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2516 } else if (pending_inc
.new_up_client
.count(from
)) {
2517 // already prepared, just wait
2518 dout(7) << __func__
<< " already prepared, waiting on "
2519 << m
->get_orig_source_addr() << dendl
;
2520 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2523 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addr();
2524 if (!m
->cluster_addr
.is_blank_ip())
2525 pending_inc
.new_up_cluster
[from
] = m
->cluster_addr
;
2526 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addr
;
2527 if (!m
->hb_front_addr
.is_blank_ip())
2528 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addr
;
2530 down_pending_out
.erase(from
); // if any
2533 osd_weight
[from
] = m
->sb
.weight
;
2536 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
2538 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2539 // preprocess should have caught this; if not, assert.
2540 assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
2541 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
2545 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
2546 const osd_info_t
& i
= osdmap
.get_info(from
);
2547 if (i
.up_from
> i
.lost_at
) {
2548 dout(10) << " fresh osd; marking lost_at too" << dendl
;
2549 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
2554 bufferlist osd_metadata
;
2555 ::encode(m
->metadata
, osd_metadata
);
2556 pending_metadata
[from
] = osd_metadata
;
2557 pending_metadata_rm
.erase(from
);
2559 // adjust last clean unmount epoch?
2560 const osd_info_t
& info
= osdmap
.get_info(from
);
2561 dout(10) << " old osd_info: " << info
<< dendl
;
2562 if (m
->sb
.mounted
> info
.last_clean_begin
||
2563 (m
->sb
.mounted
== info
.last_clean_begin
&&
2564 m
->sb
.clean_thru
> info
.last_clean_end
)) {
2565 epoch_t begin
= m
->sb
.mounted
;
2566 epoch_t end
= m
->sb
.clean_thru
;
2568 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
2569 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
2570 << ") -> [" << begin
<< "-" << end
<< ")"
2572 pending_inc
.new_last_clean_interval
[from
] =
2573 pair
<epoch_t
,epoch_t
>(begin
, end
);
2576 osd_xinfo_t xi
= osdmap
.get_xinfo(from
);
2577 if (m
->boot_epoch
== 0) {
2578 xi
.laggy_probability
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2579 xi
.laggy_interval
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2580 dout(10) << " not laggy, new xi " << xi
<< dendl
;
2582 if (xi
.down_stamp
.sec()) {
2583 int interval
= ceph_clock_now().sec() -
2584 xi
.down_stamp
.sec();
2585 if (g_conf
->mon_osd_laggy_max_interval
&&
2586 (interval
> g_conf
->mon_osd_laggy_max_interval
)) {
2587 interval
= g_conf
->mon_osd_laggy_max_interval
;
2590 interval
* g_conf
->mon_osd_laggy_weight
+
2591 xi
.laggy_interval
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2593 xi
.laggy_probability
=
2594 g_conf
->mon_osd_laggy_weight
+
2595 xi
.laggy_probability
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2596 dout(10) << " laggy, now xi " << xi
<< dendl
;
2599 // set features shared by the osd
2600 if (m
->osd_features
)
2601 xi
.features
= m
->osd_features
;
2603 xi
.features
= m
->get_connection()->get_features();
2606 if ((g_conf
->mon_osd_auto_mark_auto_out_in
&&
2607 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
2608 (g_conf
->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
2609 (g_conf
->mon_osd_auto_mark_in
)) {
2610 if (can_mark_in(from
)) {
2611 if (osdmap
.osd_xinfo
[from
].old_weight
> 0) {
2612 pending_inc
.new_weight
[from
] = osdmap
.osd_xinfo
[from
].old_weight
;
2615 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
2618 dout(7) << __func__
<< " NOIN set, will not mark in "
2619 << m
->get_orig_source_addr() << dendl
;
2623 pending_inc
.new_xinfo
[from
] = xi
;
2626 wait_for_finished_proposal(op
, new C_Booted(this, op
));
2631 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
2633 op
->mark_osdmon_event(__func__
);
2634 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2635 dout(7) << "_booted " << m
->get_orig_source_inst()
2636 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
2639 mon
->clog
->info() << m
->get_orig_source_inst() << " boot";
2642 send_latest(op
, m
->sb
.current_epoch
+1);
2649 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
2651 op
->mark_osdmon_event(__func__
);
2652 MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2653 int from
= m
->get_orig_source().num();
2655 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2657 // check permissions, ignore if failed
2658 MonSession
*session
= m
->get_session();
2661 if (!session
->is_capable("osd", MON_CAP_X
)) {
2662 dout(0) << "MOSDFull from entity with insufficient privileges:"
2663 << session
->caps
<< dendl
;
2667 // ignore a full message from the osd instance that already went down
2668 if (!osdmap
.exists(from
)) {
2669 dout(7) << __func__
<< " ignoring full message from nonexistent "
2670 << m
->get_orig_source_inst() << dendl
;
2673 if ((!osdmap
.is_up(from
) &&
2674 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) ||
2675 (osdmap
.is_up(from
) &&
2676 osdmap
.get_inst(from
) != m
->get_orig_source_inst())) {
2677 dout(7) << __func__
<< " ignoring full message from down "
2678 << m
->get_orig_source_inst() << dendl
;
2682 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
2684 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
2685 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
2686 << " " << m
->get_orig_source_inst() << dendl
;
2687 _reply_map(op
, m
->version
);
2691 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
2692 << " " << m
->get_orig_source_inst() << dendl
;
2699 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
2701 op
->mark_osdmon_event(__func__
);
2702 const MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2703 const int from
= m
->get_orig_source().num();
2705 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2706 const unsigned want_state
= m
->state
& mask
; // safety first
2708 unsigned cur_state
= osdmap
.get_state(from
);
2709 auto p
= pending_inc
.new_state
.find(from
);
2710 if (p
!= pending_inc
.new_state
.end()) {
2711 cur_state
^= p
->second
;
2715 set
<string
> want_state_set
, cur_state_set
;
2716 OSDMap::calc_state_set(want_state
, want_state_set
);
2717 OSDMap::calc_state_set(cur_state
, cur_state_set
);
2719 if (cur_state
!= want_state
) {
2720 if (p
!= pending_inc
.new_state
.end()) {
2723 pending_inc
.new_state
[from
] = 0;
2725 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
2726 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2727 << " -> " << want_state_set
<< dendl
;
2729 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2730 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
2733 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2740 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
2742 op
->mark_osdmon_event(__func__
);
2743 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2744 int from
= m
->get_orig_source().num();
2746 // check permissions, ignore if failed
2747 MonSession
*session
= m
->get_session();
2750 if (!session
->is_capable("osd", MON_CAP_X
)) {
2751 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2752 << session
->caps
<< dendl
;
2756 if (!osdmap
.is_up(from
) ||
2757 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2758 dout(7) << "preprocess_alive ignoring alive message from down " << m
->get_orig_source_inst() << dendl
;
2762 if (osdmap
.get_up_thru(from
) >= m
->want
) {
2764 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
2765 _reply_map(op
, m
->version
);
2769 dout(10) << "preprocess_alive want up_thru " << m
->want
2770 << " from " << m
->get_orig_source_inst() << dendl
;
2777 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
2779 op
->mark_osdmon_event(__func__
);
2780 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2781 int from
= m
->get_orig_source().num();
2783 if (0) { // we probably don't care much about these
2784 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
2787 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
2788 << " from " << m
->get_orig_source_inst() << dendl
;
2790 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
2791 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2795 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
2797 op
->mark_osdmon_event(__func__
);
2798 dout(7) << "_reply_map " << e
2799 << " from " << op
->get_req()->get_orig_source_inst()
2805 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
2807 op
->mark_osdmon_event(__func__
);
2808 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2809 dout(10) << __func__
<< " " << *m
<< dendl
;
2810 auto session
= m
->get_session();
2813 dout(10) << __func__
<< ": no monitor session!" << dendl
;
2816 if (!session
->is_capable("osd", MON_CAP_X
)) {
2817 derr
<< __func__
<< " received from entity "
2818 << "with insufficient privileges " << session
->caps
<< dendl
;
2821 // always forward the "created!" to the leader
2825 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
2827 op
->mark_osdmon_event(__func__
);
2828 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2829 dout(10) << __func__
<< " " << *m
<< dendl
;
2830 auto src
= m
->get_orig_source();
2831 auto from
= src
.num();
2832 if (!src
.is_osd() ||
2833 !mon
->osdmon()->osdmap
.is_up(from
) ||
2834 m
->get_orig_source_inst() != mon
->osdmon()->osdmap
.get_inst(from
)) {
2835 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
2838 pending_created_pgs
.push_back(m
->pgid
);
2845 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
2847 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2848 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
2849 mempool::osdmap::vector
<int> empty
;
2850 int from
= m
->get_orig_source().num();
2851 size_t ignore_cnt
= 0;
2854 MonSession
*session
= m
->get_session();
2857 if (!session
->is_capable("osd", MON_CAP_X
)) {
2858 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
2859 << session
->caps
<< dendl
;
2863 if (!osdmap
.is_up(from
) ||
2864 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2865 dout(7) << "ignoring pgtemp message from down " << m
->get_orig_source_inst() << dendl
;
2873 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2874 dout(20) << " " << p
->first
2875 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
2876 << " -> " << p
->second
<< dendl
;
2878 // does the pool exist?
2879 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
2881 * 1. If the osdmap does not have the pool, it means the pool has been
2882 * removed in-between the osd sending this message and us handling it.
2883 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
2884 * not exist in the pending either, as the osds would not send a
2885 * message about a pool they know nothing about (yet).
2886 * 3. However, if the pool does exist in the pending, then it must be a
2887 * new pool, and not relevant to this message (see 1).
2889 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2890 << ": pool has been removed" << dendl
;
2895 int acting_primary
= -1;
2896 osdmap
.pg_to_up_acting_osds(
2897 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
2898 if (acting_primary
!= from
) {
2899 /* If the source isn't the primary based on the current osdmap, we know
2900 * that the interval changed and that we can discard this message.
2901 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
2902 * which of two pg temp mappings on the same pg is more recent.
2904 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2905 << ": primary has changed" << dendl
;
2911 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
2912 osdmap
.primary_temp
->count(p
->first
)))
2915 // NOTE: we assume that this will clear pg_primary, so consider
2916 // an existing pg_primary field to imply a change
2917 if (p
->second
.size() &&
2918 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
2919 !vectors_equal(osdmap
.pg_temp
->get(p
->first
), p
->second
) ||
2920 osdmap
.primary_temp
->count(p
->first
)))
2924 // should we ignore all the pgs?
2925 if (ignore_cnt
== m
->pg_temp
.size())
2928 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
2929 _reply_map(op
, m
->map_epoch
);
2936 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
2938 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
2939 auto ut
= pending_inc
.new_up_thru
.find(from
);
2940 if (ut
!= pending_inc
.new_up_thru
.end()) {
2941 old_up_thru
= ut
->second
;
2943 if (up_thru
> old_up_thru
) {
2944 // set up_thru too, so the osd doesn't have to ask again
2945 pending_inc
.new_up_thru
[from
] = up_thru
;
2949 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
2951 op
->mark_osdmon_event(__func__
);
2952 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2953 int from
= m
->get_orig_source().num();
2954 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
2955 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2956 uint64_t pool
= p
->first
.pool();
2957 if (pending_inc
.old_pools
.count(pool
)) {
2958 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2959 << ": pool pending removal" << dendl
;
2962 if (!osdmap
.have_pg_pool(pool
)) {
2963 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2964 << ": pool has been removed" << dendl
;
2967 pending_inc
.new_pg_temp
[p
->first
] =
2968 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
2970 // unconditionally clear pg_primary (until this message can encode
2971 // a change for that, too.. at which point we need to also fix
2972 // preprocess_pg_temp)
2973 if (osdmap
.primary_temp
->count(p
->first
) ||
2974 pending_inc
.new_primary_temp
.count(p
->first
))
2975 pending_inc
.new_primary_temp
[p
->first
] = -1;
2978 // set up_thru too, so the osd doesn't have to ask again
2979 update_up_thru(from
, m
->map_epoch
);
2981 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
2988 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
2990 op
->mark_osdmon_event(__func__
);
2991 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
2992 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
2994 // check privilege, ignore if failed
2995 MonSession
*session
= m
->get_session();
2998 if (!session
->caps
.is_capable(
3000 CEPH_ENTITY_TYPE_MON
,
3001 session
->entity_name
,
3002 "osd", "osd pool rmsnap", {}, true, true, false)) {
3003 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
3004 << session
->caps
<< dendl
;
3008 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
3009 q
!= m
->snaps
.end();
3011 if (!osdmap
.have_pg_pool(q
->first
)) {
3012 dout(10) << " ignoring removed_snaps " << q
->second
<< " on non-existent pool " << q
->first
<< dendl
;
3015 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
3016 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
3017 p
!= q
->second
.end();
3019 if (*p
> pi
->get_snap_seq() ||
3020 !pi
->removed_snaps
.contains(*p
))
3029 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
3031 op
->mark_osdmon_event(__func__
);
3032 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
3033 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
3035 for (map
<int, vector
<snapid_t
> >::iterator p
= m
->snaps
.begin();
3036 p
!= m
->snaps
.end();
3039 if (!osdmap
.have_pg_pool(p
->first
)) {
3040 dout(10) << " ignoring removed_snaps " << p
->second
<< " on non-existent pool " << p
->first
<< dendl
;
3044 pg_pool_t
& pi
= osdmap
.pools
[p
->first
];
3045 for (vector
<snapid_t
>::iterator q
= p
->second
.begin();
3046 q
!= p
->second
.end();
3048 if (!pi
.removed_snaps
.contains(*q
) &&
3049 (!pending_inc
.new_pools
.count(p
->first
) ||
3050 !pending_inc
.new_pools
[p
->first
].removed_snaps
.contains(*q
))) {
3051 pg_pool_t
*newpi
= pending_inc
.get_new_pool(p
->first
, &pi
);
3052 newpi
->removed_snaps
.insert(*q
);
3053 dout(10) << " pool " << p
->first
<< " removed_snaps added " << *q
3054 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
3055 if (*q
> newpi
->get_snap_seq()) {
3056 dout(10) << " pool " << p
->first
<< " snap_seq " << newpi
->get_snap_seq() << " -> " << *q
<< dendl
;
3057 newpi
->set_snap_seq(*q
);
3059 newpi
->set_snap_epoch(pending_inc
.epoch
);
3067 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
3069 op
->mark_osdmon_event(__func__
);
3070 auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
3072 auto session
= beacon
->get_session();
3075 dout(10) << __func__
<< " no monitor session!" << dendl
;
3078 if (!session
->is_capable("osd", MON_CAP_X
)) {
3079 derr
<< __func__
<< " received from entity "
3080 << "with insufficient privileges " << session
->caps
<< dendl
;
3083 // Always forward the beacon to the leader, even if they are the same as
3084 // the old one. The leader will mark as down osds that haven't sent
3085 // beacon for a few minutes.
3089 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
3091 op
->mark_osdmon_event(__func__
);
3092 const auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
3093 const auto src
= beacon
->get_orig_source();
3094 dout(10) << __func__
<< " " << *beacon
3095 << " from " << src
<< dendl
;
3096 int from
= src
.num();
3098 if (!src
.is_osd() ||
3099 !osdmap
.is_up(from
) ||
3100 beacon
->get_orig_source_inst() != osdmap
.get_inst(from
)) {
3101 dout(1) << " ignoring beacon from non-active osd." << dendl
;
3105 last_osd_report
[from
] = ceph_clock_now();
3106 osd_epochs
[from
] = beacon
->version
;
3108 for (const auto& pg
: beacon
->pgs
) {
3109 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
3117 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
3119 op
->mark_osdmon_event(__func__
);
3120 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
3121 << " start " << start
<< dendl
;
3125 send_incremental(op
, start
);
3129 MOSDMap
*OSDMonitor::build_latest_full()
3131 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
);
3132 get_version_full(osdmap
.get_epoch(), r
->maps
[osdmap
.get_epoch()]);
3133 r
->oldest_map
= get_first_committed();
3134 r
->newest_map
= osdmap
.get_epoch();
3138 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
)
3140 dout(10) << "build_incremental [" << from
<< ".." << to
<< "]" << dendl
;
3141 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
);
3142 m
->oldest_map
= get_first_committed();
3143 m
->newest_map
= osdmap
.get_epoch();
3145 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
3147 int err
= get_version(e
, bl
);
3149 assert(bl
.length());
3150 // if (get_version(e, bl) > 0) {
3151 dout(20) << "build_incremental inc " << e
<< " "
3152 << bl
.length() << " bytes" << dendl
;
3153 m
->incremental_maps
[e
] = bl
;
3155 assert(err
== -ENOENT
);
3156 assert(!bl
.length());
3157 get_version_full(e
, bl
);
3158 if (bl
.length() > 0) {
3159 //else if (get_version("full", e, bl) > 0) {
3160 dout(20) << "build_incremental full " << e
<< " "
3161 << bl
.length() << " bytes" << dendl
;
3164 ceph_abort(); // we should have all maps.
3171 void OSDMonitor::send_full(MonOpRequestRef op
)
3173 op
->mark_osdmon_event(__func__
);
3174 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
3175 mon
->send_reply(op
, build_latest_full());
3178 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
3180 op
->mark_osdmon_event(__func__
);
3182 MonSession
*s
= op
->get_session();
3186 s
->proxy_con
->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP
)) {
3187 // oh, we can tell the other mon to do it
3188 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
3190 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
3191 r
->send_osdmap_first
= first
;
3192 s
->proxy_con
->send_message(r
);
3193 op
->mark_event("reply: send routed send_osdmap_first reply");
3196 send_incremental(first
, s
, false, op
);
3200 void OSDMonitor::send_incremental(epoch_t first
,
3201 MonSession
*session
,
3203 MonOpRequestRef req
)
3205 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
3206 << " to " << session
->inst
<< dendl
;
3208 if (first
<= session
->osd_epoch
) {
3209 dout(10) << __func__
<< " " << session
->inst
<< " should already have epoch "
3210 << session
->osd_epoch
<< dendl
;
3211 first
= session
->osd_epoch
+ 1;
3214 if (first
< get_first_committed()) {
3215 first
= get_first_committed();
3217 int err
= get_version_full(first
, bl
);
3219 assert(bl
.length());
3221 dout(20) << "send_incremental starting with base full "
3222 << first
<< " " << bl
.length() << " bytes" << dendl
;
3224 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid());
3225 m
->oldest_map
= get_first_committed();
3226 m
->newest_map
= osdmap
.get_epoch();
3227 m
->maps
[first
] = bl
;
3230 mon
->send_reply(req
, m
);
3231 session
->osd_epoch
= first
;
3234 session
->con
->send_message(m
);
3235 session
->osd_epoch
= first
;
3240 while (first
<= osdmap
.get_epoch()) {
3241 epoch_t last
= MIN(first
+ g_conf
->osd_map_message_max
- 1,
3242 osdmap
.get_epoch());
3243 MOSDMap
*m
= build_incremental(first
, last
);
3246 // send some maps. it may not be all of them, but it will get them
3248 mon
->send_reply(req
, m
);
3250 session
->con
->send_message(m
);
3253 session
->osd_epoch
= last
;
3259 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
3261 if (inc_osd_cache
.lookup(ver
, &bl
)) {
3264 int ret
= PaxosService::get_version(ver
, bl
);
3266 inc_osd_cache
.add(ver
, bl
);
3271 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
3273 if (full_osd_cache
.lookup(ver
, &bl
)) {
3276 int ret
= PaxosService::get_version_full(ver
, bl
);
3278 full_osd_cache
.add(ver
, bl
);
3283 epoch_t
OSDMonitor::blacklist(const entity_addr_t
& a
, utime_t until
)
3285 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
3286 pending_inc
.new_blacklist
[a
] = until
;
3287 return pending_inc
.epoch
;
3291 void OSDMonitor::check_osdmap_subs()
3293 dout(10) << __func__
<< dendl
;
3294 if (!osdmap
.get_epoch()) {
3297 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
3298 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
3301 auto p
= osdmap_subs
->second
->begin();
3305 check_osdmap_sub(sub
);
3309 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
3311 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
3312 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
3313 if (sub
->next
<= osdmap
.get_epoch()) {
3315 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
3317 sub
->session
->con
->send_message(build_latest_full());
3319 mon
->session_map
.remove_sub(sub
);
3321 sub
->next
= osdmap
.get_epoch() + 1;
3325 void OSDMonitor::check_pg_creates_subs()
3327 if (!mon
->monmap
->get_required_features().contains_all(
3328 ceph::features::mon::FEATURE_LUMINOUS
)) {
3329 // PGMonitor takes care of this in pre-luminous era.
3332 if (!osdmap
.get_num_up_osds()) {
3335 assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
3336 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
3337 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
3338 if (pg_creates_subs
== session_map
.subs
.end()) {
3341 for (auto sub
: *pg_creates_subs
->second
) {
3342 check_pg_creates_sub(sub
);
3347 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
3349 dout(20) << __func__
<< " .. " << sub
->session
->inst
<< dendl
;
3350 assert(sub
->type
== "osd_pg_creates");
3351 // only send these if the OSD is up. we will check_subs() when they do
3352 // come up so they will get the creates then.
3353 if (sub
->session
->inst
.name
.is_osd() &&
3354 mon
->osdmon()->osdmap
.is_up(sub
->session
->inst
.name
.num())) {
3355 sub
->next
= send_pg_creates(sub
->session
->inst
.name
.num(),
3356 sub
->session
->con
.get(),
3361 void OSDMonitor::do_application_enable(int64_t pool_id
,
3362 const std::string
&app_name
)
3364 assert(paxos
->is_plugged() && is_writeable());
3366 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
3369 assert(osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
||
3370 pending_inc
.new_require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
3372 auto pp
= osdmap
.get_pg_pool(pool_id
);
3373 assert(pp
!= nullptr);
3376 if (pending_inc
.new_pools
.count(pool_id
)) {
3377 p
= pending_inc
.new_pools
[pool_id
];
3380 p
.application_metadata
.insert({app_name
, {}});
3381 p
.last_change
= pending_inc
.epoch
;
3382 pending_inc
.new_pools
[pool_id
] = p
;
3385 unsigned OSDMonitor::scan_for_creating_pgs(
3386 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
3387 const mempool::osdmap::set
<int64_t>& removed_pools
,
3389 creating_pgs_t
* creating_pgs
) const
3391 unsigned queued
= 0;
3392 for (auto& p
: pools
) {
3393 int64_t poolid
= p
.first
;
3394 const pg_pool_t
& pool
= p
.second
;
3395 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
3396 pool
.get_type(), pool
.get_size());
3397 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
3400 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
3401 const auto created
= pool
.get_last_change();
3402 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
3403 dout(10) << __func__
<< " no change in pool " << poolid
3404 << " " << pool
<< dendl
;
3407 if (removed_pools
.count(poolid
)) {
3408 dout(10) << __func__
<< " pool is being removed: " << poolid
3409 << " " << pool
<< dendl
;
3412 dout(10) << __func__
<< " queueing pool create for " << poolid
3413 << " " << pool
<< dendl
;
3414 if (creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
3415 created
, modified
)) {
3422 void OSDMonitor::update_creating_pgs()
3424 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
3425 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
3426 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
3427 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3428 for (const auto& pg
: creating_pgs
.pgs
) {
3429 int acting_primary
= -1;
3430 auto pgid
= pg
.first
;
3431 if (!osdmap
.pg_exists(pgid
)) {
3432 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
3436 auto mapped
= pg
.second
.first
;
3437 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
3438 mapping
.get(pgid
, nullptr, nullptr, nullptr, &acting_primary
);
3439 // check the previous creating_pgs, look for the target to whom the pg was
3440 // previously mapped
3441 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
3442 const auto last_acting_primary
= pgs_by_epoch
.first
;
3443 for (auto& pgs
: pgs_by_epoch
.second
) {
3444 if (pgs
.second
.count(pgid
)) {
3445 if (last_acting_primary
== acting_primary
) {
3448 dout(20) << __func__
<< " " << pgid
<< " "
3449 << " acting_primary:" << last_acting_primary
3450 << " -> " << acting_primary
<< dendl
;
3451 // note epoch if the target of the create message changed.
3452 mapped
= mapping
.get_epoch();
3457 mapped
= mapping
.get_epoch();
3461 dout(10) << __func__
<< " will instruct osd." << acting_primary
3462 << " to create " << pgid
<< "@" << mapped
<< dendl
;
3463 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(pgid
);
3465 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
3466 creating_pgs_epoch
= mapping
.get_epoch();
3469 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
3471 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
3472 << " " << creating_pgs_by_osd_epoch
<< dendl
;
3473 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3474 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
3475 dout(20) << __func__
3476 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
3477 // the subscribers will be updated when the mapping is completed anyway
3480 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
3481 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
3483 assert(!creating_pgs_by_epoch
->second
.empty());
3485 MOSDPGCreate
*m
= nullptr;
3487 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
3488 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
3489 auto epoch
= epoch_pgs
->first
;
3490 auto& pgs
= epoch_pgs
->second
;
3491 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3492 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
3494 for (auto& pg
: pgs
) {
3496 m
= new MOSDPGCreate(creating_pgs_epoch
);
3497 // Need the create time from the monitor using its clock to set
3498 // last_scrub_stamp upon pg creation.
3499 auto create
= creating_pgs
.pgs
.find(pg
);
3500 assert(create
!= creating_pgs
.pgs
.end());
3501 m
->mkpg
.emplace(pg
, pg_create_t
{create
->second
.first
, pg
, 0});
3502 m
->ctimes
.emplace(pg
, create
->second
.second
);
3503 dout(20) << __func__
<< " will create " << pg
3504 << " at " << create
->second
.first
<< dendl
;
3508 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3509 << " has nothing to send" << dendl
;
3512 con
->send_message(m
);
3513 // sub is current through last + 1
3520 void OSDMonitor::tick()
3522 if (!is_active()) return;
3524 dout(10) << osdmap
<< dendl
;
3526 if (!mon
->is_leader()) return;
3528 bool do_propose
= false;
3529 utime_t now
= ceph_clock_now();
3531 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3532 mon
->monmap
->get_required_features().contains_all(
3533 ceph::features::mon::FEATURE_LUMINOUS
)) {
3534 if (handle_osd_timeouts(now
, last_osd_report
)) {
3538 if (!osdmap
.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS
) &&
3539 osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3540 mon
->mgrstatmon()->is_readable() &&
3541 mon
->mgrstatmon()->definitely_converted_snapsets()) {
3542 dout(1) << __func__
<< " all snapsets converted, setting purged_snapdirs"
3544 add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS
);
3549 if (check_failures(now
))
3552 // mark down osds out?
3554 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
3555 * influence at all. The decision is made based on the ratio of "in" osds,
3556 * and the function returns false if this ratio is lower that the minimum
3557 * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
3559 if (can_mark_out(-1)) {
3560 set
<int> down_cache
; // quick cache of down subtrees
3562 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
3563 while (i
!= down_pending_out
.end()) {
3569 if (osdmap
.is_down(o
) &&
3572 utime_t
orig_grace(g_conf
->mon_osd_down_out_interval
, 0);
3573 utime_t grace
= orig_grace
;
3574 double my_grace
= 0.0;
3576 if (g_conf
->mon_osd_adjust_down_out_interval
) {
3577 // scale grace period the same way we do the heartbeat grace.
3578 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
3579 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
3580 double decay_k
= ::log(.5) / halflife
;
3581 double decay
= exp((double)down
* decay_k
);
3582 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
3583 << " down for " << down
<< " decay " << decay
<< dendl
;
3584 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3588 // is this an entire large subtree down?
3589 if (g_conf
->mon_osd_down_out_subtree_limit
.length()) {
3590 int type
= osdmap
.crush
->get_type_id(g_conf
->mon_osd_down_out_subtree_limit
);
3592 if (osdmap
.containing_subtree_is_down(g_ceph_context
, o
, type
, &down_cache
)) {
3593 dout(10) << "tick entire containing " << g_conf
->mon_osd_down_out_subtree_limit
3594 << " subtree for osd." << o
<< " is down; resetting timer" << dendl
;
3595 // reset timer, too.
3596 down_pending_out
[o
] = now
;
3602 bool down_out
= !osdmap
.is_destroyed(o
) &&
3603 g_conf
->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
3604 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
3605 g_conf
->mon_osd_destroyed_out_interval
> 0 &&
3606 // this is not precise enough as we did not make a note when this osd
3607 // was marked as destroyed, but let's not bother with that
3608 // complexity for now.
3609 down
.sec() >= g_conf
->mon_osd_destroyed_out_interval
;
3610 if (down_out
|| destroyed_out
) {
3611 dout(10) << "tick marking osd." << o
<< " OUT after " << down
3612 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
3613 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
3615 // set the AUTOOUT bit.
3616 if (pending_inc
.new_state
.count(o
) == 0)
3617 pending_inc
.new_state
[o
] = 0;
3618 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
3620 // remember previous weight
3621 if (pending_inc
.new_xinfo
.count(o
) == 0)
3622 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
3623 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
3627 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
3628 << int(down
.sec()) << " seconds)";
3633 down_pending_out
.erase(o
);
3636 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
3639 // expire blacklisted items?
3640 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
3641 p
!= osdmap
.blacklist
.end();
3643 if (p
->second
< now
) {
3644 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
3645 pending_inc
.old_blacklist
.push_back(p
->first
);
3650 // if map full setting has changed, get that info out there!
3651 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
&&
3652 mon
->pgservice
->is_readable()) {
3653 // for pre-luminous compat only!
3654 if (mon
->pgservice
->have_full_osds()) {
3655 dout(5) << "There are full osds, setting full flag" << dendl
;
3656 add_flag(CEPH_OSDMAP_FULL
);
3657 } else if (osdmap
.test_flag(CEPH_OSDMAP_FULL
)){
3658 dout(10) << "No full osds, removing full flag" << dendl
;
3659 remove_flag(CEPH_OSDMAP_FULL
);
3662 if (mon
->pgservice
->have_nearfull_osds()) {
3663 dout(5) << "There are near full osds, setting nearfull flag" << dendl
;
3664 add_flag(CEPH_OSDMAP_NEARFULL
);
3665 } else if (osdmap
.test_flag(CEPH_OSDMAP_NEARFULL
)){
3666 dout(10) << "No near full osds, removing nearfull flag" << dendl
;
3667 remove_flag(CEPH_OSDMAP_NEARFULL
);
3669 if (pending_inc
.new_flags
!= -1 &&
3670 (pending_inc
.new_flags
^ osdmap
.flags
) & (CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
3671 dout(1) << "New setting for" <<
3672 (pending_inc
.new_flags
& CEPH_OSDMAP_FULL
? " CEPH_OSDMAP_FULL" : "") <<
3673 (pending_inc
.new_flags
& CEPH_OSDMAP_NEARFULL
? " CEPH_OSDMAP_NEARFULL" : "")
3674 << " -- doing propose" << dendl
;
3679 if (update_pools_status())
3683 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
3687 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
3688 std::map
<int,utime_t
> &last_osd_report
)
3690 utime_t
timeo(g_conf
->mon_osd_report_timeout
, 0);
3691 if (now
- mon
->get_leader_since() < timeo
) {
3692 // We haven't been the leader for long enough to consider OSD timeouts
3696 int max_osd
= osdmap
.get_max_osd();
3697 bool new_down
= false;
3699 for (int i
=0; i
< max_osd
; ++i
) {
3700 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
3701 if (!osdmap
.exists(i
)) {
3702 last_osd_report
.erase(i
); // if any
3705 if (!osdmap
.is_up(i
))
3707 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
3708 if (t
== last_osd_report
.end()) {
3709 // it wasn't in the map; start the timer.
3710 last_osd_report
[i
] = now
;
3711 } else if (can_mark_down(i
)) {
3712 utime_t diff
= now
- t
->second
;
3714 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
3715 << diff
<< " seconds";
3716 derr
<< "no beacon from osd." << i
<< " since " << t
->second
3717 << ", " << diff
<< " seconds ago. marking down" << dendl
;
3718 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
3726 void OSDMonitor::get_health(list
<pair
<health_status_t
,string
> >& summary
,
3727 list
<pair
<health_status_t
,string
> > *detail
,
3728 CephContext
*cct
) const
3730 int num_osds
= osdmap
.get_num_osds();
3732 if (num_osds
== 0) {
3733 summary
.push_back(make_pair(HEALTH_ERR
, "no osds"));
3735 int num_in_osds
= 0;
3736 int num_down_in_osds
= 0;
3738 set
<int> down_in_osds
;
3739 set
<int> up_in_osds
;
3740 set
<int> subtree_up
;
3741 unordered_map
<int, set
<int> > subtree_type_down
;
3742 unordered_map
<int, int> num_osds_subtree
;
3743 int max_type
= osdmap
.crush
->get_max_type_id();
3745 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3746 if (!osdmap
.exists(i
)) {
3747 if (osdmap
.crush
->item_exists(i
)) {
3752 if (osdmap
.is_out(i
))
3755 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
3757 if (!osdmap
.is_up(i
)) {
3758 down_in_osds
.insert(i
);
3761 for (int type
= 0; type
<= max_type
; type
++) {
3762 if (!osdmap
.crush
->get_type_name(type
))
3764 int r
= osdmap
.crush
->get_immediate_parent_id(current
, &parent_id
);
3767 // break early if this parent is already marked as up
3768 if (subtree_up
.count(parent_id
))
3770 type
= osdmap
.crush
->get_bucket_type(parent_id
);
3771 if (!osdmap
.subtree_type_is_down(
3772 g_ceph_context
, parent_id
, type
,
3773 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
3775 current
= parent_id
;
3780 // calculate the number of down osds in each down subtree and
3781 // store it in num_osds_subtree
3782 for (int type
= 1; type
<= max_type
; type
++) {
3783 if (!osdmap
.crush
->get_type_name(type
))
3785 for (auto j
= subtree_type_down
[type
].begin();
3786 j
!= subtree_type_down
[type
].end();
3790 int num
= osdmap
.crush
->get_children(*j
, &children
);
3791 num_osds_subtree
[*j
] = num
;
3795 int num_children
= osdmap
.crush
->get_children(*j
, &children
);
3796 if (num_children
== 0)
3798 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
3799 if (num_osds_subtree
[*l
] > 0) {
3800 num
= num
+ num_osds_subtree
[*l
];
3803 num_osds_subtree
[*j
] = num
;
3807 num_down_in_osds
= down_in_osds
.size();
3808 assert(num_down_in_osds
<= num_in_osds
);
3809 if (num_down_in_osds
> 0) {
3810 // summary of down subtree types and osds
3811 for (int type
= max_type
; type
> 0; type
--) {
3812 if (!osdmap
.crush
->get_type_name(type
))
3814 if (subtree_type_down
[type
].size() > 0) {
3816 ss
<< subtree_type_down
[type
].size() << " "
3817 << osdmap
.crush
->get_type_name(type
);
3818 if (subtree_type_down
[type
].size() > 1) {
3821 int sum_down_osds
= 0;
3822 for (auto j
= subtree_type_down
[type
].begin();
3823 j
!= subtree_type_down
[type
].end();
3825 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
3827 ss
<< " (" << sum_down_osds
<< " osds) down";
3828 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3832 ss
<< down_in_osds
.size() << " osds down";
3833 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3836 // details of down subtree types
3837 for (int type
= max_type
; type
> 0; type
--) {
3838 if (!osdmap
.crush
->get_type_name(type
))
3840 for (auto j
= subtree_type_down
[type
].rbegin();
3841 j
!= subtree_type_down
[type
].rend();
3844 ss
<< osdmap
.crush
->get_type_name(type
);
3846 ss
<< osdmap
.crush
->get_item_name(*j
);
3847 // at the top level, do not print location
3848 if (type
!= max_type
) {
3850 ss
<< osdmap
.crush
->get_full_location_ordered_string(*j
);
3853 int num
= num_osds_subtree
[*j
];
3854 ss
<< " (" << num
<< " osds)";
3856 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3859 // details of down osds
3860 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
3862 ss
<< "osd." << *it
<< " (";
3863 ss
<< osdmap
.crush
->get_full_location_ordered_string(*it
);
3865 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3870 if (!osds
.empty()) {
3872 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
3873 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3875 ss
<< " (osds: " << osds
<< ")";
3876 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3880 // note: we leave it to ceph-mgr to generate details health warnings
3881 // with actual osd utilizations
3884 uint64_t warn_flags
=
3886 CEPH_OSDMAP_PAUSERD
|
3887 CEPH_OSDMAP_PAUSEWR
|
3888 CEPH_OSDMAP_PAUSEREC
|
3890 CEPH_OSDMAP_NODOWN
|
3893 CEPH_OSDMAP_NOBACKFILL
|
3894 CEPH_OSDMAP_NORECOVER
|
3895 CEPH_OSDMAP_NOSCRUB
|
3896 CEPH_OSDMAP_NODEEP_SCRUB
|
3897 CEPH_OSDMAP_NOTIERAGENT
|
3898 CEPH_OSDMAP_NOREBALANCE
;
3899 if (osdmap
.test_flag(warn_flags
)) {
3901 ss
<< osdmap
.get_flag_string(osdmap
.get_flags() & warn_flags
)
3903 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3905 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3908 // old crush tunables?
3909 if (g_conf
->mon_warn_on_legacy_crush_tunables
) {
3910 string min
= osdmap
.crush
->get_min_required_version();
3911 if (min
< g_conf
->mon_crush_min_required_version
) {
3913 ss
<< "crush map has legacy tunables (require " << min
3914 << ", min is " << g_conf
->mon_crush_min_required_version
<< ")";
3915 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3917 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3918 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3922 if (g_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
3923 if (osdmap
.crush
->get_straw_calc_version() == 0) {
3925 ss
<< "crush map has straw_calc_version=0";
3926 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3928 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3929 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3934 // hit_set-less cache_mode?
3935 if (g_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
3936 int problem_cache_pools
= 0;
3937 for (map
<int64_t, pg_pool_t
>::const_iterator p
= osdmap
.pools
.begin();
3938 p
!= osdmap
.pools
.end();
3940 const pg_pool_t
& info
= p
->second
;
3941 if (info
.cache_mode_requires_hit_set() &&
3942 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
3943 ++problem_cache_pools
;
3946 ss
<< "pool '" << osdmap
.get_pool_name(p
->first
)
3947 << "' with cache_mode " << info
.get_cache_mode_name()
3948 << " needs hit_set_type to be set but it is not";
3949 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3953 if (problem_cache_pools
) {
3955 ss
<< problem_cache_pools
<< " cache pools are missing hit_sets";
3956 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3960 // Not using 'sortbitwise' and should be?
3961 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
3962 (osdmap
.get_up_osd_features() &
3963 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
3965 ss
<< "no legacy OSD present but 'sortbitwise' flag is not set";
3966 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3969 // Warn if 'mon_osd_down_out_interval' is set to zero.
3970 // Having this option set to zero on the leader acts much like the
3971 // 'noout' flag. It's hard to figure out what's going wrong with clusters
3972 // without the 'noout' flag set but acting like that just the same, so
3973 // we report a HEALTH_WARN in case this option is set to zero.
3974 // This is an ugly hack to get the warning out, but until we find a way
3975 // to spread global options throughout the mon cluster and have all mons
3976 // using a base set of the same options, we need to work around this sort
3978 // There's also the obvious drawback that if this is set on a single
3979 // monitor on a 3-monitor cluster, this warning will only be shown every
3980 // third monitor connection.
3981 if (g_conf
->mon_warn_on_osd_down_out_interval_zero
&&
3982 g_conf
->mon_osd_down_out_interval
== 0) {
3984 ss
<< "mon." << mon
->name
<< " has mon_osd_down_out_interval set to 0";
3985 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3987 ss
<< "; this has the same effect as the 'noout' flag";
3988 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3992 // warn about upgrade flags that can be set but are not.
3993 if (g_conf
->mon_debug_no_require_luminous
) {
3994 // ignore these checks
3995 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
) &&
3996 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
3997 string msg
= "all OSDs are running luminous or later but"
3998 " require_osd_release < luminous";
3999 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
4001 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
4003 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
) &&
4004 osdmap
.require_osd_release
< CEPH_RELEASE_KRAKEN
) {
4005 string msg
= "all OSDs are running kraken or later but"
4006 " require_osd_release < kraken";
4007 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
4009 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
4011 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
) &&
4012 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
4013 string msg
= "all OSDs are running jewel or later but"
4014 " require_osd_release < jewel";
4015 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
4017 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
4021 for (auto it
: osdmap
.get_pools()) {
4022 const pg_pool_t
&pool
= it
.second
;
4023 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
4024 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
4026 ss
<< "pool '" << pool_name
<< "' is full";
4027 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4029 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4035 void OSDMonitor::dump_info(Formatter
*f
)
4037 f
->open_object_section("osdmap");
4041 f
->open_array_section("osd_metadata");
4042 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4043 if (osdmap
.exists(i
)) {
4044 f
->open_object_section("osd");
4045 f
->dump_unsigned("id", i
);
4046 dump_osd_metadata(i
, f
, NULL
);
4052 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
4053 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
4055 f
->open_object_section("crushmap");
4056 osdmap
.crush
->dump(f
);
4061 enum osd_pool_get_choices
{
4062 SIZE
, MIN_SIZE
, CRASH_REPLAY_INTERVAL
,
4063 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
,
4064 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
4065 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
4066 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
4067 USE_GMT_HITSET
, AUID
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
4068 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
4069 CACHE_TARGET_FULL_RATIO
,
4070 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
4071 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
4072 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
4073 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
4074 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
4075 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
4076 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
4077 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
4078 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
};
4080 std::set
<osd_pool_get_choices
>
4081 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
4082 const std::set
<osd_pool_get_choices
>& second
)
4084 std::set
<osd_pool_get_choices
> result
;
4085 std::set_difference(first
.begin(), first
.end(),
4086 second
.begin(), second
.end(),
4087 std::inserter(result
, result
.end()));
4093 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
4095 op
->mark_osdmon_event(__func__
);
4096 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
4099 stringstream ss
, ds
;
4101 map
<string
, cmd_vartype
> cmdmap
;
4102 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
4103 string rs
= ss
.str();
4104 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
4108 MonSession
*session
= m
->get_session();
4110 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
4115 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
4118 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
4119 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
4121 if (prefix
== "osd stat") {
4122 osdmap
.print_summary(f
.get(), ds
, "");
4128 else if (prefix
== "osd perf" ||
4129 prefix
== "osd blocked-by") {
4130 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
4131 osdmap
, f
.get(), &ss
, &rdata
);
4133 else if (prefix
== "osd dump" ||
4134 prefix
== "osd tree" ||
4135 prefix
== "osd ls" ||
4136 prefix
== "osd getmap" ||
4137 prefix
== "osd getcrushmap" ||
4138 prefix
== "osd ls-tree") {
4143 cmd_getval(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
4146 bufferlist osdmap_bl
;
4147 int err
= get_version_full(epoch
, osdmap_bl
);
4148 if (err
== -ENOENT
) {
4150 ss
<< "there is no map for epoch " << epoch
;
4154 assert(osdmap_bl
.length());
4157 if (epoch
== osdmap
.get_epoch()) {
4161 p
->decode(osdmap_bl
);
4164 auto sg
= make_scope_guard([&] {
4170 if (prefix
== "osd dump") {
4173 f
->open_object_section("osdmap");
4183 } else if (prefix
== "osd ls") {
4185 f
->open_array_section("osds");
4186 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4187 if (osdmap
.exists(i
)) {
4188 f
->dump_int("osd", i
);
4195 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
4196 if (osdmap
.exists(i
)) {
4205 } else if (prefix
== "osd tree") {
4206 vector
<string
> states
;
4207 cmd_getval(g_ceph_context
, cmdmap
, "states", states
);
4208 unsigned filter
= 0;
4209 for (auto& s
: states
) {
4211 filter
|= OSDMap::DUMP_UP
;
4212 } else if (s
== "down") {
4213 filter
|= OSDMap::DUMP_DOWN
;
4214 } else if (s
== "in") {
4215 filter
|= OSDMap::DUMP_IN
;
4216 } else if (s
== "out") {
4217 filter
|= OSDMap::DUMP_OUT
;
4218 } else if (s
== "destroyed") {
4219 filter
|= OSDMap::DUMP_DESTROYED
;
4221 ss
<< "unrecognized state '" << s
<< "'";
4226 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
4227 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
4228 ss
<< "cannot specify both 'in' and 'out'";
4232 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
4233 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
4234 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
4235 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
4236 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
4237 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
4238 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
4243 f
->open_object_section("tree");
4244 p
->print_tree(f
.get(), NULL
, filter
);
4248 p
->print_tree(NULL
, &ds
, filter
);
4251 } else if (prefix
== "osd getmap") {
4252 rdata
.append(osdmap_bl
);
4253 ss
<< "got osdmap epoch " << p
->get_epoch();
4254 } else if (prefix
== "osd getcrushmap") {
4255 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
4256 ss
<< p
->get_crush_version();
4257 } else if (prefix
== "osd ls-tree") {
4259 cmd_getval(g_ceph_context
, cmdmap
, "name", bucket_name
);
4261 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
4263 ss
<< "\"" << bucket_name
<< "\" does not exist";
4266 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
4271 f
->open_array_section("osds");
4272 for (auto &i
: osds
) {
4273 if (osdmap
.exists(i
)) {
4274 f
->dump_int("osd", i
);
4281 for (auto &i
: osds
) {
4282 if (osdmap
.exists(i
)) {
4293 } else if (prefix
== "osd df") {
4295 cmd_getval(g_ceph_context
, cmdmap
, "output_method", method
);
4296 print_osd_utilization(osdmap
, mon
->pgservice
, ds
,
4297 f
.get(), method
== "tree");
4299 } else if (prefix
== "osd getmaxosd") {
4301 f
->open_object_section("getmaxosd");
4302 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4303 f
->dump_int("max_osd", osdmap
.get_max_osd());
4307 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
4310 } else if (prefix
== "osd utilization") {
4312 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
4319 } else if (prefix
== "osd find") {
4321 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
4322 ss
<< "unable to parse osd id value '"
4323 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4327 if (!osdmap
.exists(osd
)) {
4328 ss
<< "osd." << osd
<< " does not exist";
4333 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4334 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4335 f
->open_object_section("osd_location");
4336 f
->dump_int("osd", osd
);
4337 f
->dump_stream("ip") << osdmap
.get_addr(osd
);
4338 f
->open_object_section("crush_location");
4339 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
4340 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
4341 f
->dump_string(p
->first
.c_str(), p
->second
);
4345 } else if (prefix
== "osd metadata") {
4347 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
4348 !cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
4349 ss
<< "unable to parse osd id value '"
4350 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4354 if (osd
>= 0 && !osdmap
.exists(osd
)) {
4355 ss
<< "osd." << osd
<< " does not exist";
4360 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4361 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4363 f
->open_object_section("osd_metadata");
4364 f
->dump_unsigned("id", osd
);
4365 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
4371 f
->open_array_section("osd_metadata");
4372 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4373 if (osdmap
.exists(i
)) {
4374 f
->open_object_section("osd");
4375 f
->dump_unsigned("id", i
);
4376 r
= dump_osd_metadata(i
, f
.get(), NULL
);
4377 if (r
== -EINVAL
|| r
== -ENOENT
) {
4378 // Drop error, continue to get other daemons' metadata
4379 dout(4) << "No metadata for osd." << i
<< dendl
;
4391 } else if (prefix
== "osd versions") {
4393 f
.reset(Formatter::create("json-pretty"));
4394 count_metadata("ceph_version", f
.get());
4397 } else if (prefix
== "osd count-metadata") {
4399 f
.reset(Formatter::create("json-pretty"));
4401 cmd_getval(g_ceph_context
, cmdmap
, "property", field
);
4402 count_metadata(field
, f
.get());
4405 } else if (prefix
== "osd map") {
4406 string poolstr
, objstr
, namespacestr
;
4407 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4408 cmd_getval(g_ceph_context
, cmdmap
, "object", objstr
);
4409 cmd_getval(g_ceph_context
, cmdmap
, "nspace", namespacestr
);
4411 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4413 ss
<< "pool " << poolstr
<< " does not exist";
4417 object_locator_t
oloc(pool
, namespacestr
);
4418 object_t
oid(objstr
);
4419 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
4420 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4421 vector
<int> up
, acting
;
4423 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
4426 if (!namespacestr
.empty())
4427 fullobjname
= namespacestr
+ string("/") + oid
.name
;
4429 fullobjname
= oid
.name
;
4431 f
->open_object_section("osd_map");
4432 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4433 f
->dump_string("pool", poolstr
);
4434 f
->dump_int("pool_id", pool
);
4435 f
->dump_stream("objname") << fullobjname
;
4436 f
->dump_stream("raw_pgid") << pgid
;
4437 f
->dump_stream("pgid") << mpgid
;
4438 f
->open_array_section("up");
4439 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
4440 f
->dump_int("osd", *p
);
4442 f
->dump_int("up_primary", up_p
);
4443 f
->open_array_section("acting");
4444 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
4445 f
->dump_int("osd", *p
);
4447 f
->dump_int("acting_primary", acting_p
);
4448 f
->close_section(); // osd_map
4451 ds
<< "osdmap e" << osdmap
.get_epoch()
4452 << " pool '" << poolstr
<< "' (" << pool
<< ")"
4453 << " object '" << fullobjname
<< "' ->"
4454 << " pg " << pgid
<< " (" << mpgid
<< ")"
4455 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
4456 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
4460 } else if (prefix
== "pg map") {
4463 cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
4464 if (!pgid
.parse(pgidstr
.c_str())) {
4465 ss
<< "invalid pgid '" << pgidstr
<< "'";
4469 vector
<int> up
, acting
;
4470 if (!osdmap
.have_pg_pool(pgid
.pool())) {
4471 ss
<< "pg '" << pgidstr
<< "' does not exist";
4475 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4476 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
4478 f
->open_object_section("pg_map");
4479 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4480 f
->dump_stream("raw_pgid") << pgid
;
4481 f
->dump_stream("pgid") << mpgid
;
4482 f
->open_array_section("up");
4483 for (auto osd
: up
) {
4484 f
->dump_int("up_osd", osd
);
4487 f
->open_array_section("acting");
4488 for (auto osd
: acting
) {
4489 f
->dump_int("acting_osd", osd
);
4495 ds
<< "osdmap e" << osdmap
.get_epoch()
4496 << " pg " << pgid
<< " (" << mpgid
<< ")"
4497 << " -> up " << up
<< " acting " << acting
;
4502 } else if (prefix
== "osd scrub" ||
4503 prefix
== "osd deep-scrub" ||
4504 prefix
== "osd repair") {
4506 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
4507 vector
<string
> pvec
;
4508 get_str_vec(prefix
, pvec
);
4510 if (whostr
== "*" || whostr
== "all" || whostr
== "any") {
4513 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++)
4514 if (osdmap
.is_up(i
)) {
4515 ss
<< (c
++ ? "," : "") << i
;
4516 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4517 pvec
.back() == "repair",
4518 pvec
.back() == "deep-scrub"),
4519 osdmap
.get_inst(i
));
4522 ss
<< " instructed to " << pvec
.back();
4524 long osd
= parse_osd_id(whostr
.c_str(), &ss
);
4527 } else if (osdmap
.is_up(osd
)) {
4528 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4529 pvec
.back() == "repair",
4530 pvec
.back() == "deep-scrub"),
4531 osdmap
.get_inst(osd
));
4532 ss
<< "osd." << osd
<< " instructed to " << pvec
.back();
4534 ss
<< "osd." << osd
<< " is not up";
4538 } else if (prefix
== "osd lspools") {
4540 cmd_getval(g_ceph_context
, cmdmap
, "auid", auid
, int64_t(0));
4542 f
->open_array_section("pools");
4543 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
4544 p
!= osdmap
.pools
.end();
4546 if (!auid
|| p
->second
.auid
== (uint64_t)auid
) {
4548 f
->open_object_section("pool");
4549 f
->dump_int("poolnum", p
->first
);
4550 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
4553 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
] << ',';
4562 } else if (prefix
== "osd blacklist ls") {
4564 f
->open_array_section("blacklist");
4566 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
4567 p
!= osdmap
.blacklist
.end();
4570 f
->open_object_section("entry");
4571 f
->dump_stream("addr") << p
->first
;
4572 f
->dump_stream("until") << p
->second
;
4577 ss
<< p
->first
<< " " << p
->second
;
4587 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
4589 } else if (prefix
== "osd pool ls") {
4591 cmd_getval(g_ceph_context
, cmdmap
, "detail", detail
);
4592 if (!f
&& detail
== "detail") {
4594 osdmap
.print_pools(ss
);
4595 rdata
.append(ss
.str());
4598 f
->open_array_section("pools");
4599 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
4600 it
!= osdmap
.get_pools().end();
4603 if (detail
== "detail") {
4604 f
->open_object_section("pool");
4605 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4606 it
->second
.dump(f
.get());
4609 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4612 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
4621 } else if (prefix
== "osd crush get-tunable") {
4623 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
4626 f
->open_object_section("tunable");
4627 if (tunable
== "straw_calc_version") {
4629 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
4631 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
4640 rdata
.append(rss
.str());
4644 } else if (prefix
== "osd pool get") {
4646 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4647 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4649 ss
<< "unrecognized pool '" << poolstr
<< "'";
4654 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
4656 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
4658 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
4659 const choices_map_t ALL_CHOICES
= {
4661 {"min_size", MIN_SIZE
},
4662 {"crash_replay_interval", CRASH_REPLAY_INTERVAL
},
4663 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
4664 {"crush_rule", CRUSH_RULE
},
4665 {"hashpspool", HASHPSPOOL
}, {"nodelete", NODELETE
},
4666 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
4667 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
4668 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
4669 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
4670 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
4671 {"use_gmt_hitset", USE_GMT_HITSET
},
4672 {"auid", AUID
}, {"target_max_objects", TARGET_MAX_OBJECTS
},
4673 {"target_max_bytes", TARGET_MAX_BYTES
},
4674 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
4675 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
4676 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
4677 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
4678 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
4679 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
4680 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
4681 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
4682 {"fast_read", FAST_READ
},
4683 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
4684 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
4685 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
4686 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
4687 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
4688 {"recovery_priority", RECOVERY_PRIORITY
},
4689 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
4690 {"scrub_priority", SCRUB_PRIORITY
},
4691 {"compression_mode", COMPRESSION_MODE
},
4692 {"compression_algorithm", COMPRESSION_ALGORITHM
},
4693 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
4694 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
4695 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
4696 {"csum_type", CSUM_TYPE
},
4697 {"csum_max_block", CSUM_MAX_BLOCK
},
4698 {"csum_min_block", CSUM_MIN_BLOCK
},
4701 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
4703 const choices_set_t ONLY_TIER_CHOICES
= {
4704 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
4705 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
4706 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
4707 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
4708 MIN_READ_RECENCY_FOR_PROMOTE
,
4709 MIN_WRITE_RECENCY_FOR_PROMOTE
,
4710 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
4712 const choices_set_t ONLY_ERASURE_CHOICES
= {
4713 ERASURE_CODE_PROFILE
4716 choices_set_t selected_choices
;
4718 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
4719 it
!= ALL_CHOICES
.end(); ++it
) {
4720 selected_choices
.insert(it
->second
);
4724 selected_choices
= subtract_second_from_first(selected_choices
,
4728 if(!p
->is_erasure()) {
4729 selected_choices
= subtract_second_from_first(selected_choices
,
4730 ONLY_ERASURE_CHOICES
);
4732 } else /* var != "all" */ {
4733 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
4734 osd_pool_get_choices selected
= found
->second
;
4736 if (!p
->is_tier() &&
4737 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
4738 ss
<< "pool '" << poolstr
4739 << "' is not a tier pool: variable not applicable";
4744 if (!p
->is_erasure() &&
4745 ONLY_ERASURE_CHOICES
.find(selected
)
4746 != ONLY_ERASURE_CHOICES
.end()) {
4747 ss
<< "pool '" << poolstr
4748 << "' is not a erasure pool: variable not applicable";
4753 if (pool_opts_t::is_opt_name(var
) &&
4754 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
4755 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
4760 selected_choices
.insert(selected
);
4764 f
->open_object_section("pool");
4765 f
->dump_string("pool", poolstr
);
4766 f
->dump_int("pool_id", pool
);
4767 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4768 it
!= selected_choices
.end(); ++it
) {
4769 choices_map_t::const_iterator i
;
4770 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4771 if (i
->second
== *it
) {
4775 assert(i
!= ALL_CHOICES
.end());
4778 f
->dump_int("pg_num", p
->get_pg_num());
4781 f
->dump_int("pgp_num", p
->get_pgp_num());
4784 f
->dump_int("auid", p
->get_auid());
4787 f
->dump_int("size", p
->get_size());
4790 f
->dump_int("min_size", p
->get_min_size());
4792 case CRASH_REPLAY_INTERVAL
:
4793 f
->dump_int("crash_replay_interval",
4794 p
->get_crash_replay_interval());
4797 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
4798 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
4799 p
->get_crush_rule()));
4801 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
4808 case WRITE_FADVISE_DONTNEED
:
4811 f
->dump_bool(i
->first
.c_str(),
4812 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
4814 case HIT_SET_PERIOD
:
4815 f
->dump_int("hit_set_period", p
->hit_set_period
);
4818 f
->dump_int("hit_set_count", p
->hit_set_count
);
4821 f
->dump_string("hit_set_type",
4822 HitSet::get_type_name(p
->hit_set_params
.get_type()));
4826 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4827 BloomHitSet::Params
*bloomp
=
4828 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4829 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
4830 } else if(var
!= "all") {
4832 ss
<< "hit set is not of type Bloom; " <<
4833 "invalid to get a false positive rate!";
4839 case USE_GMT_HITSET
:
4840 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
4842 case TARGET_MAX_OBJECTS
:
4843 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
4845 case TARGET_MAX_BYTES
:
4846 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
4848 case CACHE_TARGET_DIRTY_RATIO
:
4849 f
->dump_unsigned("cache_target_dirty_ratio_micro",
4850 p
->cache_target_dirty_ratio_micro
);
4851 f
->dump_float("cache_target_dirty_ratio",
4852 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
4854 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4855 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
4856 p
->cache_target_dirty_high_ratio_micro
);
4857 f
->dump_float("cache_target_dirty_high_ratio",
4858 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
4860 case CACHE_TARGET_FULL_RATIO
:
4861 f
->dump_unsigned("cache_target_full_ratio_micro",
4862 p
->cache_target_full_ratio_micro
);
4863 f
->dump_float("cache_target_full_ratio",
4864 ((float)p
->cache_target_full_ratio_micro
/1000000));
4866 case CACHE_MIN_FLUSH_AGE
:
4867 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
4869 case CACHE_MIN_EVICT_AGE
:
4870 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
4872 case ERASURE_CODE_PROFILE
:
4873 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
4875 case MIN_READ_RECENCY_FOR_PROMOTE
:
4876 f
->dump_int("min_read_recency_for_promote",
4877 p
->min_read_recency_for_promote
);
4879 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
4880 f
->dump_int("min_write_recency_for_promote",
4881 p
->min_write_recency_for_promote
);
4884 f
->dump_int("fast_read", p
->fast_read
);
4886 case HIT_SET_GRADE_DECAY_RATE
:
4887 f
->dump_int("hit_set_grade_decay_rate",
4888 p
->hit_set_grade_decay_rate
);
4890 case HIT_SET_SEARCH_LAST_N
:
4891 f
->dump_int("hit_set_search_last_n",
4892 p
->hit_set_search_last_n
);
4894 case SCRUB_MIN_INTERVAL
:
4895 case SCRUB_MAX_INTERVAL
:
4896 case DEEP_SCRUB_INTERVAL
:
4897 case RECOVERY_PRIORITY
:
4898 case RECOVERY_OP_PRIORITY
:
4899 case SCRUB_PRIORITY
:
4900 case COMPRESSION_MODE
:
4901 case COMPRESSION_ALGORITHM
:
4902 case COMPRESSION_REQUIRED_RATIO
:
4903 case COMPRESSION_MAX_BLOB_SIZE
:
4904 case COMPRESSION_MIN_BLOB_SIZE
:
4906 case CSUM_MAX_BLOCK
:
4907 case CSUM_MIN_BLOCK
:
4908 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
4909 if (p
->opts
.is_set(key
)) {
4910 if(*it
== CSUM_TYPE
) {
4912 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
4913 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
4915 p
->opts
.dump(i
->first
, f
.get());
4924 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4925 it
!= selected_choices
.end(); ++it
) {
4926 choices_map_t::const_iterator i
;
4929 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
4932 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
4935 ss
<< "auid: " << p
->get_auid() << "\n";
4938 ss
<< "size: " << p
->get_size() << "\n";
4941 ss
<< "min_size: " << p
->get_min_size() << "\n";
4943 case CRASH_REPLAY_INTERVAL
:
4944 ss
<< "crash_replay_interval: " <<
4945 p
->get_crash_replay_interval() << "\n";
4948 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
4949 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
4950 p
->get_crush_rule()) << "\n";
4952 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
4955 case HIT_SET_PERIOD
:
4956 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
4959 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
4962 ss
<< "hit_set_type: " <<
4963 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
4967 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4968 BloomHitSet::Params
*bloomp
=
4969 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4970 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
4971 } else if(var
!= "all") {
4972 ss
<< "hit set is not of type Bloom; " <<
4973 "invalid to get a false positive rate!";
4979 case USE_GMT_HITSET
:
4980 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
4982 case TARGET_MAX_OBJECTS
:
4983 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
4985 case TARGET_MAX_BYTES
:
4986 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
4988 case CACHE_TARGET_DIRTY_RATIO
:
4989 ss
<< "cache_target_dirty_ratio: "
4990 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
4992 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4993 ss
<< "cache_target_dirty_high_ratio: "
4994 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
4996 case CACHE_TARGET_FULL_RATIO
:
4997 ss
<< "cache_target_full_ratio: "
4998 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
5000 case CACHE_MIN_FLUSH_AGE
:
5001 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
5003 case CACHE_MIN_EVICT_AGE
:
5004 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
5006 case ERASURE_CODE_PROFILE
:
5007 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
5009 case MIN_READ_RECENCY_FOR_PROMOTE
:
5010 ss
<< "min_read_recency_for_promote: " <<
5011 p
->min_read_recency_for_promote
<< "\n";
5013 case HIT_SET_GRADE_DECAY_RATE
:
5014 ss
<< "hit_set_grade_decay_rate: " <<
5015 p
->hit_set_grade_decay_rate
<< "\n";
5017 case HIT_SET_SEARCH_LAST_N
:
5018 ss
<< "hit_set_search_last_n: " <<
5019 p
->hit_set_search_last_n
<< "\n";
5025 case WRITE_FADVISE_DONTNEED
:
5028 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5029 if (i
->second
== *it
)
5032 assert(i
!= ALL_CHOICES
.end());
5033 ss
<< i
->first
<< ": " <<
5034 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
5035 "true" : "false") << "\n";
5037 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
5038 ss
<< "min_write_recency_for_promote: " <<
5039 p
->min_write_recency_for_promote
<< "\n";
5042 ss
<< "fast_read: " << p
->fast_read
<< "\n";
5044 case SCRUB_MIN_INTERVAL
:
5045 case SCRUB_MAX_INTERVAL
:
5046 case DEEP_SCRUB_INTERVAL
:
5047 case RECOVERY_PRIORITY
:
5048 case RECOVERY_OP_PRIORITY
:
5049 case SCRUB_PRIORITY
:
5050 case COMPRESSION_MODE
:
5051 case COMPRESSION_ALGORITHM
:
5052 case COMPRESSION_REQUIRED_RATIO
:
5053 case COMPRESSION_MAX_BLOB_SIZE
:
5054 case COMPRESSION_MIN_BLOB_SIZE
:
5056 case CSUM_MAX_BLOCK
:
5057 case CSUM_MIN_BLOCK
:
5058 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
5059 if (i
->second
== *it
)
5062 assert(i
!= ALL_CHOICES
.end());
5064 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
5065 if (p
->opts
.is_set(key
)) {
5066 if(key
== pool_opts_t::CSUM_TYPE
) {
5068 p
->opts
.get(key
, &val
);
5069 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
5071 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
5077 rdata
.append(ss
.str());
5082 } else if (prefix
== "osd pool stats") {
5083 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
5084 osdmap
, f
.get(), &ss
, &rdata
);
5085 } else if (prefix
== "osd pool get-quota") {
5087 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
5089 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
5091 assert(poolid
== -ENOENT
);
5092 ss
<< "unrecognized pool '" << pool_name
<< "'";
5096 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
5099 f
->open_object_section("pool_quotas");
5100 f
->dump_string("pool_name", pool_name
);
5101 f
->dump_unsigned("pool_id", poolid
);
5102 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
5103 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
5108 rs
<< "quotas for pool '" << pool_name
<< "':\n"
5109 << " max objects: ";
5110 if (p
->quota_max_objects
== 0)
5113 rs
<< si_t(p
->quota_max_objects
) << " objects";
5116 if (p
->quota_max_bytes
== 0)
5119 rs
<< si_t(p
->quota_max_bytes
) << "B";
5120 rdata
.append(rs
.str());
5124 } else if (prefix
== "osd crush rule list" ||
5125 prefix
== "osd crush rule ls") {
5127 f
->open_array_section("rules");
5128 osdmap
.crush
->list_rules(f
.get());
5133 osdmap
.crush
->list_rules(&ss
);
5134 rdata
.append(ss
.str());
5136 } else if (prefix
== "osd crush rule ls-by-class") {
5138 cmd_getval(g_ceph_context
, cmdmap
, "class", class_name
);
5139 if (class_name
.empty()) {
5140 ss
<< "no class specified";
5145 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
5147 ss
<< "failed to get rules by class '" << class_name
<< "'";
5151 f
->open_array_section("rules");
5152 for (auto &rule
: rules
) {
5153 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
5159 for (auto &rule
: rules
) {
5160 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
5162 rdata
.append(rs
.str());
5164 } else if (prefix
== "osd crush rule dump") {
5166 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
5168 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
5169 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5171 f
->open_array_section("rules");
5172 osdmap
.crush
->dump_rules(f
.get());
5175 int ruleno
= osdmap
.crush
->get_rule_id(name
);
5177 ss
<< "unknown crush rule '" << name
<< "'";
5181 osdmap
.crush
->dump_rule(ruleno
, f
.get());
5186 rdata
.append(rs
.str());
5187 } else if (prefix
== "osd crush dump") {
5189 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
5190 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5191 f
->open_object_section("crush_map");
5192 osdmap
.crush
->dump(f
.get());
5197 rdata
.append(rs
.str());
5198 } else if (prefix
== "osd crush show-tunables") {
5200 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
5201 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5202 f
->open_object_section("crush_map_tunables");
5203 osdmap
.crush
->dump_tunables(f
.get());
5208 rdata
.append(rs
.str());
5209 } else if (prefix
== "osd crush tree") {
5211 cmd_getval(g_ceph_context
, cmdmap
, "shadow", shadow
);
5212 bool show_shadow
= shadow
== "--show-shadow";
5213 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5215 osdmap
.crush
->dump_tree(nullptr,
5217 osdmap
.get_pool_names(),
5222 osdmap
.crush
->dump_tree(&ss
,
5224 osdmap
.get_pool_names(),
5226 rdata
.append(ss
.str());
5228 } else if (prefix
== "osd crush ls") {
5230 if (!cmd_getval(g_ceph_context
, cmdmap
, "node", name
)) {
5231 ss
<< "no node specified";
5235 if (!osdmap
.crush
->name_exists(name
)) {
5236 ss
<< "node '" << name
<< "' does not exist";
5240 int id
= osdmap
.crush
->get_item_id(name
);
5243 result
.push_back(id
);
5245 int num
= osdmap
.crush
->get_bucket_size(id
);
5246 for (int i
= 0; i
< num
; ++i
) {
5247 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
5251 f
->open_array_section("items");
5252 for (auto i
: result
) {
5253 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
5259 for (auto i
: result
) {
5260 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
5262 rdata
.append(ss
.str());
5265 } else if (prefix
== "osd crush class ls") {
5266 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5267 f
->open_array_section("crush_classes");
5268 for (auto i
: osdmap
.crush
->class_name
)
5269 f
->dump_string("class", i
.second
);
5272 } else if (prefix
== "osd crush class ls-osd") {
5274 cmd_getval(g_ceph_context
, cmdmap
, "class", name
);
5276 osdmap
.crush
->get_devices_by_class(name
, &osds
);
5278 f
->open_array_section("osds");
5279 for (auto &osd
: osds
)
5280 f
->dump_int("osd", osd
);
5285 for (auto &osd
: osds
) {
5293 } else if (prefix
== "osd erasure-code-profile ls") {
5294 const auto &profiles
= osdmap
.get_erasure_code_profiles();
5296 f
->open_array_section("erasure-code-profiles");
5297 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
5299 f
->dump_string("profile", i
->first
.c_str());
5301 rdata
.append(i
->first
+ "\n");
5308 rdata
.append(rs
.str());
5310 } else if (prefix
== "osd crush weight-set ls") {
5311 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5313 f
->open_array_section("weight_sets");
5314 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5315 f
->dump_string("pool", "(compat)");
5317 for (auto& i
: osdmap
.crush
->choose_args
) {
5319 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
5326 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5329 for (auto& i
: osdmap
.crush
->choose_args
) {
5331 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
5334 rdata
.append(rs
.str());
5336 } else if (prefix
== "osd crush weight-set dump") {
5337 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5339 osdmap
.crush
->dump_choose_args(f
.get());
5341 } else if (prefix
== "osd erasure-code-profile get") {
5343 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
5344 if (!osdmap
.has_erasure_code_profile(name
)) {
5345 ss
<< "unknown erasure code profile '" << name
<< "'";
5349 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
5351 f
->open_object_section("profile");
5352 for (map
<string
,string
>::const_iterator i
= profile
.begin();
5356 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
5358 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
5365 rdata
.append(rs
.str());
5367 } else if (prefix
== "osd pool application get") {
5368 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5371 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
5373 cmd_getval(g_ceph_context
, cmdmap
, "app", app
);
5375 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
5377 if (pool_name
.empty()) {
5379 f
->open_object_section("pools");
5380 for (const auto &pool
: osdmap
.pools
) {
5381 std::string
name("<unknown>");
5382 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
5383 if (pni
!= osdmap
.pool_name
.end())
5385 f
->open_object_section(name
.c_str());
5386 for (auto &app_pair
: pool
.second
.application_metadata
) {
5387 f
->open_object_section(app_pair
.first
.c_str());
5388 for (auto &kv_pair
: app_pair
.second
) {
5389 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5393 f
->close_section(); // name
5395 f
->close_section(); // pools
5398 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
5400 ss
<< "unrecognized pool '" << pool_name
<< "'";
5404 auto p
= osdmap
.get_pg_pool(pool
);
5407 f
->open_object_section(pool_name
.c_str());
5408 for (auto &app_pair
: p
->application_metadata
) {
5409 f
->open_object_section(app_pair
.first
.c_str());
5410 for (auto &kv_pair
: app_pair
.second
) {
5411 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5413 f
->close_section(); // application
5415 f
->close_section(); // pool_name
5420 auto app_it
= p
->application_metadata
.find(app
);
5421 if (app_it
== p
->application_metadata
.end()) {
5422 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
5426 // filter by pool + app
5428 f
->open_object_section(app_it
->first
.c_str());
5429 for (auto &kv_pair
: app_it
->second
) {
5430 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5432 f
->close_section(); // application
5436 // filter by pool + app + key
5437 auto key_it
= app_it
->second
.find(key
);
5438 if (key_it
== app_it
->second
.end()) {
5439 ss
<< "application '" << app
<< "' on pool '" << pool_name
5440 << "' does not have key '" << key
<< "'";
5444 ss
<< key_it
->second
<< "\n";
5445 rdata
.append(ss
.str());
5449 // try prepare update
5456 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Stage the given pg_pool_t flag bits on a pool in the *pending* map
// increment, so the change takes effect at the next committed epoch
// rather than mutating the live osdmap in place.
// NOTE(review): this text is extraction-garbled — original line numbers
// are fused into the code and some lines (braces, presumably an assert
// on the returned pool pointer) are missing; do not edit logic here
// without consulting the pristine source.
5460 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
// Obtain a mutable pending copy of the pool, seeded from the committed
// osdmap's current pg_pool_t for pool_id.
5462 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
5463 osdmap
.get_pg_pool(pool_id
));
// Apply the requested flag bits to the pending pool state.
5465 pool
->set_flag(flags
);
// Counterpart of set_pool_flags(): stage the *removal* of the given
// pg_pool_t flag bits on a pool in the pending map increment.
// NOTE(review): extraction-garbled text — original line numbers are
// embedded and some lines (braces) are missing from this view.
5468 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
// Obtain a mutable pending copy of the pool, seeded from the committed
// osdmap's current pg_pool_t for pool_id.
5470 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
5471 osdmap
.get_pg_pool(pool_id
));
// Drop the requested flag bits from the pending pool state.
5473 pool
->unset_flag(flags
);
// Re-evaluate per-pool quota status against current usage statistics and
// stage flag changes (FULL / FULL_NO_QUOTA and related) in the pending map.
// Returns a bool — presumably whether any pool's flags changed; the return
// statements are not visible in this extraction, so confirm against the
// pristine source.
// NOTE(review): extraction-garbled text — original line numbers are fused
// into the code and several logic lines (the boolean that receives the
// quota comparison at original line 5491, early continues, braces, and the
// function tail) are missing from this view.
5476 bool OSDMonitor::update_pools_status()
// Bail out early when PG statistics are not yet readable; without stats
// there is nothing reliable to compare quotas against.
5478 if (!mon
->pgservice
->is_readable())
5483 auto& pools
= osdmap
.get_pools();
// Walk every pool in the committed map.
5484 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
// Per-pool aggregate statistics from the PG service; pstat may be
// absent for a pool (the null-check line is missing from this view).
5485 const pool_stat_t
*pstat
= mon
->pgservice
->get_pool_stat(it
->first
);
5488 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
5489 const pg_pool_t
&pool
= it
->second
;
5490 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
// Quota check: a pool is over quota if it has a byte quota and usage
// meets/exceeds it, or an object quota and the object count meets/
// exceeds it. (The variable this expression is assigned to, original
// line 5491, is missing from this extraction.)
5493 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
5494 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
// Case 1: pool was previously marked full-by-quota but is no longer
// over quota — log and stage removal of the flags.
5496 if (pool
.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
5500 mon
->clog
->info() << "pool '" << pool_name
5501 << "' no longer out of quota; removing NO_QUOTA flag";
5502 // below we cancel FLAG_FULL too, we'll set it again in
5503 // OSDMonitor::encode_pending if it still fails the osd-full checking.
5504 clear_pool_flags(it
->first
,
5505 pg_pool_t::FLAG_FULL_NO_QUOTA
| pg_pool_t::FLAG_FULL
);
// Case 2: pool has just crossed its byte quota — warn with the limit
// that was reached.
5511 if (pool
.quota_max_bytes
> 0 &&
5512 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
5513 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5514 << " (reached quota's max_bytes: "
5515 << si_t(pool
.quota_max_bytes
) << ")";
// Case 3: pool has just crossed its object quota — warn likewise.
5517 if (pool
.quota_max_objects
> 0 &&
5518 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
5519 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5520 << " (reached quota's max_objects: "
5521 << pool
.quota_max_objects
<< ")";
5523 // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
5524 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
5525 // since FLAG_FULL should always take precedence
5526 set_pool_flags(it
->first
,
5527 pg_pool_t::FLAG_FULL_NO_QUOTA
| pg_pool_t::FLAG_FULL
);
5528 clear_pool_flags(it
->first
,
5529 pg_pool_t::FLAG_NEARFULL
|
5530 pg_pool_t::FLAG_BACKFILLFULL
);
// Handle a client MPoolOp pool-create request: unpack the message and
// delegate to the multi-argument prepare_new_pool() overload, creating a
// replicated pool. Returns the overload's result code.
// NOTE(review): extraction-garbled text — original line numbers are fused
// into the code, and the condition that selects between the two overload
// calls below (m->auid vs. session->auid) is missing from this view; it
// presumably depends on whether the message carries an explicit auid —
// confirm against the pristine source.
5537 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
// Record this op against the osdmon for event tracing.
5539 op
->mark_osdmon_event(__func__
);
5540 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
5541 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
// Session is needed for the fallback auid used in the second call.
5542 MonSession
*session
= m
->get_session();
5545 string erasure_code_profile
;
// Branch A (condition not visible here): create using the auid carried
// in the message itself.
5550 ret
= prepare_new_pool(m
->name
, m
->auid
, m
->crush_rule
, rule_name
,
5552 erasure_code_profile
,
5553 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
// Branch B: create using the requesting session's auid instead.
5555 ret
= prepare_new_pool(m
->name
, session
->auid
, m
->crush_rule
, rule_name
,
5557 erasure_code_profile
,
5558 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
// Log the outcome and any message accumulated by the overload.
5561 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
5566 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
5567 const string
& dstname
,
5572 // Avoid creating a pending crush if it does not already exists and
5573 // the rename would fail.
5575 if (!_have_pending_crush()) {
5576 ret
= _get_stable_crush().can_rename_bucket(srcname
,
5583 CrushWrapper newcrush
;
5584 _get_pending_crush(newcrush
);
5586 ret
= newcrush
.rename_bucket(srcname
,
5592 pending_inc
.crush
.clear();
5593 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5594 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
5598 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
5600 string replacement
= "";
5602 if (plugin
== "jerasure_generic" ||
5603 plugin
== "jerasure_sse3" ||
5604 plugin
== "jerasure_sse4" ||
5605 plugin
== "jerasure_neon") {
5606 replacement
= "jerasure";
5607 } else if (plugin
== "shec_generic" ||
5608 plugin
== "shec_sse3" ||
5609 plugin
== "shec_sse4" ||
5610 plugin
== "shec_neon") {
5611 replacement
= "shec";
5614 if (replacement
!= "") {
5615 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
5616 << plugin
<< " that has been deprecated. Please use "
5617 << replacement
<< " instead." << dendl
;
5621 int OSDMonitor::normalize_profile(const string
& profilename
,
5622 ErasureCodeProfile
&profile
,
5626 ErasureCodeInterfaceRef erasure_code
;
5627 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5628 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
5629 check_legacy_ec_plugin(plugin
->second
, profilename
);
5630 int err
= instance
.factory(plugin
->second
,
5631 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5632 profile
, &erasure_code
, ss
);
5637 err
= erasure_code
->init(profile
, ss
);
5642 auto it
= profile
.find("stripe_unit");
5643 if (it
!= profile
.end()) {
5645 uint32_t stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5646 if (!err_str
.empty()) {
5647 *ss
<< "could not parse stripe_unit '" << it
->second
5648 << "': " << err_str
<< std::endl
;
5651 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5652 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5653 if (chunk_size
!= stripe_unit
) {
5654 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
5655 << "alignment. Would be padded to " << chunk_size
5659 if ((stripe_unit
% 4096) != 0 && !force
) {
5660 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
5661 << "use --force to override this check" << std::endl
;
5668 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
5669 const string
&profile
,
5673 int ruleid
= osdmap
.crush
->get_rule_id(name
);
5674 if (ruleid
!= -ENOENT
) {
5675 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
5679 CrushWrapper newcrush
;
5680 _get_pending_crush(newcrush
);
5682 ruleid
= newcrush
.get_rule_id(name
);
5683 if (ruleid
!= -ENOENT
) {
5684 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
5687 ErasureCodeInterfaceRef erasure_code
;
5688 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
5690 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
5694 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
5695 erasure_code
.reset();
5699 pending_inc
.crush
.clear();
5700 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5705 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
5706 ErasureCodeInterfaceRef
*erasure_code
,
5709 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
5711 ErasureCodeProfile profile
=
5712 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5713 ErasureCodeProfile::const_iterator plugin
=
5714 profile
.find("plugin");
5715 if (plugin
== profile
.end()) {
5716 *ss
<< "cannot determine the erasure code plugin"
5717 << " because there is no 'plugin' entry in the erasure_code_profile "
5718 << profile
<< std::endl
;
5721 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
5722 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5723 return instance
.factory(plugin
->second
,
5724 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5725 profile
, erasure_code
, ss
);
5728 int OSDMonitor::check_cluster_features(uint64_t features
,
5731 stringstream unsupported_ss
;
5732 int unsupported_count
= 0;
5733 if ((mon
->get_quorum_con_features() & features
) != features
) {
5734 unsupported_ss
<< "the monitor cluster";
5735 ++unsupported_count
;
5738 set
<int32_t> up_osds
;
5739 osdmap
.get_up_osds(up_osds
);
5740 for (set
<int32_t>::iterator it
= up_osds
.begin();
5741 it
!= up_osds
.end(); ++it
) {
5742 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
5743 if ((xi
.features
& features
) != features
) {
5744 if (unsupported_count
> 0)
5745 unsupported_ss
<< ", ";
5746 unsupported_ss
<< "osd." << *it
;
5747 unsupported_count
++;
5751 if (unsupported_count
> 0) {
5752 ss
<< "features " << features
<< " unsupported by: "
5753 << unsupported_ss
.str();
5757 // check pending osd state, too!
5758 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
5759 pending_inc
.new_xinfo
.begin();
5760 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
5761 const osd_xinfo_t
&xi
= p
->second
;
5762 if ((xi
.features
& features
) != features
) {
5763 dout(10) << __func__
<< " pending osd." << p
->first
5764 << " features are insufficient; retry" << dendl
;
5772 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
5775 OSDMap::Incremental new_pending
= pending_inc
;
5776 ::encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
5778 newmap
.deepish_copy_from(osdmap
);
5779 newmap
.apply_incremental(new_pending
);
5782 if (newmap
.require_min_compat_client
> 0) {
5783 auto mv
= newmap
.get_min_compat_client();
5784 if (mv
> newmap
.require_min_compat_client
) {
5785 ss
<< "new crush map requires client version " << ceph_release_name(mv
)
5786 << " but require_min_compat_client is "
5787 << ceph_release_name(newmap
.require_min_compat_client
);
5794 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
5795 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
5796 stringstream features_ss
;
5797 int r
= check_cluster_features(features
, features_ss
);
5799 ss
<< "Could not change CRUSH: " << features_ss
.str();
5806 bool OSDMonitor::erasure_code_profile_in_use(
5807 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
5808 const string
&profile
,
5812 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
5815 if (p
->second
.erasure_code_profile
== profile
) {
5816 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
5821 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
5826 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
5827 map
<string
,string
> *erasure_code_profile_map
,
5830 int r
= get_json_str_map(g_conf
->osd_pool_default_erasure_code_profile
,
5832 erasure_code_profile_map
);
5835 assert((*erasure_code_profile_map
).count("plugin"));
5836 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
5837 map
<string
,string
> user_map
;
5838 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
5839 i
!= erasure_code_profile
.end();
5841 size_t equal
= i
->find('=');
5842 if (equal
== string::npos
) {
5843 user_map
[*i
] = string();
5844 (*erasure_code_profile_map
)[*i
] = string();
5846 string key
= i
->substr(0, equal
);
5848 const string value
= i
->substr(equal
);
5849 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
5850 key
.find("ruleset-") == 0) {
5851 if (g_conf
->get_val
<bool>("mon_fixup_legacy_erasure_code_profiles")) {
5852 mon
->clog
->warn() << "erasure code profile property '" << key
5853 << "' is no longer supported; try "
5854 << "'crush-" << key
.substr(8) << "' instead";
5855 key
= string("crush-") + key
.substr(8);
5857 *ss
<< "property '" << key
<< "' is no longer supported; try "
5858 << "'crush-" << key
.substr(8) << "' instead";
5862 user_map
[key
] = value
;
5863 (*erasure_code_profile_map
)[key
] = value
;
5867 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
5868 (*erasure_code_profile_map
) = user_map
;
5873 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
5874 const string
&erasure_code_profile
,
5875 unsigned *size
, unsigned *min_size
,
5879 switch (pool_type
) {
5880 case pg_pool_t::TYPE_REPLICATED
:
5881 *size
= g_conf
->osd_pool_default_size
;
5882 *min_size
= g_conf
->get_osd_pool_default_min_size();
5884 case pg_pool_t::TYPE_ERASURE
:
5886 ErasureCodeInterfaceRef erasure_code
;
5887 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5889 *size
= erasure_code
->get_chunk_count();
5890 *min_size
= MIN(erasure_code
->get_data_chunk_count() + 1, *size
);
5895 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
5902 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
5903 const string
&erasure_code_profile
,
5904 uint32_t *stripe_width
,
5908 switch (pool_type
) {
5909 case pg_pool_t::TYPE_REPLICATED
:
5912 case pg_pool_t::TYPE_ERASURE
:
5914 ErasureCodeProfile profile
=
5915 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5916 ErasureCodeInterfaceRef erasure_code
;
5917 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5920 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5921 uint32_t stripe_unit
= g_conf
->osd_pool_erasure_code_stripe_unit
;
5922 auto it
= profile
.find("stripe_unit");
5923 if (it
!= profile
.end()) {
5925 stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5926 assert(err_str
.empty());
5928 *stripe_width
= data_chunks
*
5929 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5933 *ss
<< "prepare_pool_stripe_width: "
5934 << pool_type
<< " is not a known pool type";
5941 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
5942 const string
&erasure_code_profile
,
5943 const string
&rule_name
,
5948 if (*crush_rule
< 0) {
5949 switch (pool_type
) {
5950 case pg_pool_t::TYPE_REPLICATED
:
5952 if (rule_name
== "") {
5954 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context
);
5955 if (*crush_rule
< 0) {
5956 // Errors may happen e.g. if no valid rule is available
5957 *ss
<< "No suitable CRUSH rule exists, check "
5958 << "'osd pool default crush *' config options";
5962 return get_crush_rule(rule_name
, crush_rule
, ss
);
5966 case pg_pool_t::TYPE_ERASURE
:
5968 int err
= crush_rule_create_erasure(rule_name
,
5969 erasure_code_profile
,
5973 dout(20) << "prepare_pool_crush_rule: rule "
5974 << rule_name
<< " try again" << dendl
;
5977 // need to wait for the crush rule to be proposed before proceeding
5988 *ss
<< "prepare_pool_crush_rule: " << pool_type
5989 << " is not a known pool type";
5994 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
5995 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
6003 int OSDMonitor::get_crush_rule(const string
&rule_name
,
6008 ret
= osdmap
.crush
->get_rule_id(rule_name
);
6009 if (ret
!= -ENOENT
) {
6013 CrushWrapper newcrush
;
6014 _get_pending_crush(newcrush
);
6016 ret
= newcrush
.get_rule_id(rule_name
);
6017 if (ret
!= -ENOENT
) {
6018 // found it, wait for it to be proposed
6019 dout(20) << __func__
<< ": rule " << rule_name
6020 << " try again" << dendl
;
6023 // Cannot find it , return error
6024 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
6031 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
6033 auto max_pgs_per_osd
= g_conf
->get_val
<uint64_t>("mon_max_pg_per_osd");
6034 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
6035 auto max_pgs
= max_pgs_per_osd
* num_osds
;
6036 uint64_t projected
= 0;
6038 projected
+= pg_num
* size
;
6040 for (const auto& i
: osdmap
.get_pools()) {
6041 if (i
.first
== pool
) {
6042 projected
+= pg_num
* size
;
6044 projected
+= i
.second
.get_pg_num() * i
.second
.get_size();
6047 if (projected
> max_pgs
) {
6049 *ss
<< "pool id " << pool
;
6051 *ss
<< " pg_num " << pg_num
<< " size " << size
6052 << " would mean " << projected
6053 << " total pgs, which exceeds max " << max_pgs
6054 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6055 << " * num_in_osds " << num_osds
<< ")";
6062 * @param name The name of the new pool
6063 * @param auid The auid of the pool owner. Can be -1
6064 * @param crush_rule The crush rule to use. If <0, will use the system default
6065 * @param crush_rule_name The crush rule to use, if crush_rulset <0
6066 * @param pg_num The pg_num to use. If set to 0, will use the system default
6067 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6068 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6069 * @param pool_type TYPE_ERASURE, or TYPE_REP
6070 * @param expected_num_objects expected number of objects on the pool
6071 * @param fast_read fast read type.
6072 * @param ss human readable error message, if any.
6074 * @return 0 on success, negative errno on failure.
6076 int OSDMonitor::prepare_new_pool(string
& name
, uint64_t auid
,
6078 const string
&crush_rule_name
,
6079 unsigned pg_num
, unsigned pgp_num
,
6080 const string
&erasure_code_profile
,
6081 const unsigned pool_type
,
6082 const uint64_t expected_num_objects
,
6083 FastReadType fast_read
,
6086 if (name
.length() == 0)
6089 pg_num
= g_conf
->osd_pool_default_pg_num
;
6091 pgp_num
= g_conf
->osd_pool_default_pgp_num
;
6092 if (pg_num
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
6093 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
6094 << g_conf
->mon_max_pool_pg_num
6095 << " (you may adjust 'mon max pool pg num' for higher values)";
6098 if (pgp_num
> pg_num
) {
6099 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6100 << ", which in this case is " << pg_num
;
6103 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
6104 *ss
<< "'fast_read' can only apply to erasure coding pool";
6108 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
6109 crush_rule_name
, &crush_rule
, ss
);
6111 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
6114 if (g_conf
->mon_osd_crush_smoke_test
) {
6115 CrushWrapper newcrush
;
6116 _get_pending_crush(newcrush
);
6118 CrushTester
tester(newcrush
, err
);
6119 tester
.set_min_x(0);
6120 tester
.set_max_x(50);
6121 tester
.set_rule(crush_rule
);
6122 auto start
= ceph::coarse_mono_clock::now();
6123 r
= tester
.test_with_fork(g_conf
->mon_lease
);
6124 auto duration
= ceph::coarse_mono_clock::now() - start
;
6126 dout(10) << "tester.test_with_fork returns " << r
6127 << ": " << err
.str() << dendl
;
6128 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
6131 dout(10) << __func__
<< " crush smoke test duration: "
6132 << duration
<< dendl
;
6134 unsigned size
, min_size
;
6135 r
= prepare_pool_size(pool_type
, erasure_code_profile
, &size
, &min_size
, ss
);
6137 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
6140 r
= check_pg_num(-1, pg_num
, size
, ss
);
6142 dout(10) << "check_pg_num returns " << r
<< dendl
;
6146 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
6150 uint32_t stripe_width
= 0;
6151 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
6153 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
6158 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
6159 switch (fast_read
) {
6166 case FAST_READ_DEFAULT
:
6167 fread
= g_conf
->mon_osd_pool_ec_fast_read
;
6170 *ss
<< "invalid fast_read setting: " << fast_read
;
6175 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
6176 p
!= pending_inc
.new_pool_names
.end();
6178 if (p
->second
== name
)
6182 if (-1 == pending_inc
.new_pool_max
)
6183 pending_inc
.new_pool_max
= osdmap
.pool_max
;
6184 int64_t pool
= ++pending_inc
.new_pool_max
;
6186 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
6187 pi
->type
= pool_type
;
6188 pi
->fast_read
= fread
;
6189 pi
->flags
= g_conf
->osd_pool_default_flags
;
6190 if (g_conf
->osd_pool_default_flag_hashpspool
)
6191 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
6192 if (g_conf
->osd_pool_default_flag_nodelete
)
6193 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
6194 if (g_conf
->osd_pool_default_flag_nopgchange
)
6195 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
6196 if (g_conf
->osd_pool_default_flag_nosizechange
)
6197 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
6198 if (g_conf
->osd_pool_use_gmt_hitset
&&
6199 (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
))
6200 pi
->use_gmt_hitset
= true;
6202 pi
->use_gmt_hitset
= false;
6205 pi
->min_size
= min_size
;
6206 pi
->crush_rule
= crush_rule
;
6207 pi
->expected_num_objects
= expected_num_objects
;
6208 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
6209 pi
->set_pg_num(pg_num
);
6210 pi
->set_pgp_num(pgp_num
);
6211 pi
->last_change
= pending_inc
.epoch
;
6213 pi
->erasure_code_profile
= erasure_code_profile
;
6214 pi
->stripe_width
= stripe_width
;
6215 pi
->cache_target_dirty_ratio_micro
=
6216 g_conf
->osd_pool_default_cache_target_dirty_ratio
* 1000000;
6217 pi
->cache_target_dirty_high_ratio_micro
=
6218 g_conf
->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
6219 pi
->cache_target_full_ratio_micro
=
6220 g_conf
->osd_pool_default_cache_target_full_ratio
* 1000000;
6221 pi
->cache_min_flush_age
= g_conf
->osd_pool_default_cache_min_flush_age
;
6222 pi
->cache_min_evict_age
= g_conf
->osd_pool_default_cache_min_evict_age
;
6223 pending_inc
.new_pool_names
[pool
] = name
;
6227 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
6229 op
->mark_osdmon_event(__func__
);
6231 if (pending_inc
.new_flags
< 0)
6232 pending_inc
.new_flags
= osdmap
.get_flags();
6233 pending_inc
.new_flags
|= flag
;
6234 ss
<< OSDMap::get_flag_string(flag
) << " is set";
6235 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
6236 get_last_committed() + 1));
6240 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
6242 op
->mark_osdmon_event(__func__
);
6244 if (pending_inc
.new_flags
< 0)
6245 pending_inc
.new_flags
= osdmap
.get_flags();
6246 pending_inc
.new_flags
&= ~flag
;
6247 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
6248 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
6249 get_last_committed() + 1));
6253 int OSDMonitor::prepare_command_pool_set(map
<string
,cmd_vartype
> &cmdmap
,
6257 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
6258 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6260 ss
<< "unrecognized pool '" << poolstr
<< "'";
6264 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
6266 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
6267 if (pending_inc
.new_pools
.count(pool
))
6268 p
= pending_inc
.new_pools
[pool
];
6270 // accept val as a json string in the normal case (current
6271 // generation monitor). parse out int or float values from the
6272 // string as needed. however, if it is not a string, try to pull
6273 // out an int, in case an older monitor with an older json schema is
6274 // forwarding a request.
6276 string interr
, floaterr
;
6279 int64_t uf
= 0; // micro-f
6280 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", val
)) {
6281 // wasn't a string; maybe an older mon forwarded json with an int?
6282 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", n
))
6283 return -EINVAL
; // no value!
6285 // we got a string. see if it contains an int.
6286 n
= strict_strtoll(val
.c_str(), 10, &interr
);
6288 f
= strict_strtod(val
.c_str(), &floaterr
);
6289 uf
= llrintl(f
* (double)1000000.0);
6293 (var
== "hit_set_type" || var
== "hit_set_period" ||
6294 var
== "hit_set_count" || var
== "hit_set_fpp" ||
6295 var
== "target_max_objects" || var
== "target_max_bytes" ||
6296 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
6297 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
6298 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
6299 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
6300 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
6304 if (var
== "size") {
6305 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
6306 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
6309 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
6310 ss
<< "can not change the size of an erasure-coded pool";
6313 if (interr
.length()) {
6314 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6317 if (n
<= 0 || n
> 10) {
6318 ss
<< "pool size must be between 1 and 10";
6321 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
6328 } else if (var
== "min_size") {
6329 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
6330 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
6333 if (interr
.length()) {
6334 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6338 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
6339 if (n
< 1 || n
> p
.size
) {
6340 ss
<< "pool min_size must be between 1 and " << (int)p
.size
;
6344 ErasureCodeInterfaceRef erasure_code
;
6347 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
6349 k
= erasure_code
->get_data_chunk_count();
6351 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
6355 if (n
< k
|| n
> p
.size
) {
6356 ss
<< "pool min_size must be between " << k
<< " and " << (int)p
.size
;
6361 } else if (var
== "auid") {
6362 if (interr
.length()) {
6363 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6367 } else if (var
== "crash_replay_interval") {
6368 if (interr
.length()) {
6369 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6372 p
.crash_replay_interval
= n
;
6373 } else if (var
== "pg_num") {
6374 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
6375 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
6378 if (interr
.length()) {
6379 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6382 if (n
<= (int)p
.get_pg_num()) {
6383 ss
<< "specified pg_num " << n
<< " <= current " << p
.get_pg_num();
6384 if (n
< (int)p
.get_pg_num())
6388 if (n
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
6389 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
6390 << g_conf
->mon_max_pool_pg_num
6391 << " (you may adjust 'mon max pool pg num' for higher values)";
6394 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
6399 cmd_getval(g_ceph_context
,cmdmap
, "force", force
);
6400 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&&
6401 force
!= "--yes-i-really-mean-it") {
6402 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
6405 int expected_osds
= MIN(p
.get_pg_num(), osdmap
.get_num_osds());
6406 int64_t new_pgs
= n
- p
.get_pg_num();
6407 if (new_pgs
> g_conf
->mon_osd_max_split_count
* expected_osds
) {
6408 ss
<< "specified pg_num " << n
<< " is too large (creating "
6409 << new_pgs
<< " new PGs on ~" << expected_osds
6410 << " OSDs exceeds per-OSD max of " << g_conf
->mon_osd_max_split_count
6415 // force pre-luminous clients to resend their ops, since they
6416 // don't understand that split PGs now form a new interval.
6417 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
6418 } else if (var
== "pgp_num") {
6419 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
6420 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
6423 if (interr
.length()) {
6424 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6428 ss
<< "specified pgp_num must > 0, but you set to " << n
;
6431 if (n
> (int)p
.get_pg_num()) {
6432 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
6436 } else if (var
== "crush_rule") {
6437 int id
= osdmap
.crush
->get_rule_id(val
);
6438 if (id
== -ENOENT
) {
6439 ss
<< "crush rule " << val
<< " does not exist";
6443 ss
<< cpp_strerror(id
);
6446 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
6450 } else if (var
== "nodelete" || var
== "nopgchange" ||
6451 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
6452 var
== "noscrub" || var
== "nodeep-scrub") {
6453 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
6454 // make sure we only compare against 'n' if we didn't receive a string
6455 if (val
== "true" || (interr
.empty() && n
== 1)) {
6457 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6460 ss
<< "expecting value 'true', 'false', '0', or '1'";
6463 } else if (var
== "hashpspool") {
6464 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
6466 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6467 if (force
!= "--yes-i-really-mean-it") {
6468 ss
<< "are you SURE? this will remap all placement groups in this pool,"
6469 " this triggers large data movement,"
6470 " pass --yes-i-really-mean-it if you really do.";
6473 // make sure we only compare against 'n' if we didn't receive a string
6474 if (val
== "true" || (interr
.empty() && n
== 1)) {
6476 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6479 ss
<< "expecting value 'true', 'false', '0', or '1'";
6482 } else if (var
== "hit_set_type") {
6484 p
.hit_set_params
= HitSet::Params();
6486 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
6489 if (val
== "bloom") {
6490 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
6491 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
6492 p
.hit_set_params
= HitSet::Params(bsp
);
6493 } else if (val
== "explicit_hash")
6494 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
6495 else if (val
== "explicit_object")
6496 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
6498 ss
<< "unrecognized hit_set type '" << val
<< "'";
6502 } else if (var
== "hit_set_period") {
6503 if (interr
.length()) {
6504 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6507 p
.hit_set_period
= n
;
6508 } else if (var
== "hit_set_count") {
6509 if (interr
.length()) {
6510 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6513 p
.hit_set_count
= n
;
6514 } else if (var
== "hit_set_fpp") {
6515 if (floaterr
.length()) {
6516 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6519 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
6520 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
6523 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
6525 } else if (var
== "use_gmt_hitset") {
6526 if (val
== "true" || (interr
.empty() && n
== 1)) {
6528 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6529 if (!osdmap
.get_num_up_osds() && force
!= "--yes-i-really-mean-it") {
6530 ss
<< "Not advisable to continue since no OSDs are up. Pass "
6531 << "--yes-i-really-mean-it if you really wish to continue.";
6534 if (!(osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
)
6535 && force
!= "--yes-i-really-mean-it") {
6536 ss
<< "not all OSDs support GMT hit set.";
6539 p
.use_gmt_hitset
= true;
6541 ss
<< "expecting value 'true' or '1'";
6544 } else if (var
== "allow_ec_overwrites") {
6545 if (!p
.is_erasure()) {
6546 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
6550 if (!g_conf
->mon_debug_no_require_bluestore_for_ec_overwrites
&&
6551 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
6552 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
6555 if (val
== "true" || (interr
.empty() && n
== 1)) {
6556 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
6557 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6558 ss
<< "ec overwrites cannot be disabled once enabled";
6561 ss
<< "expecting value 'true', 'false', '0', or '1'";
6564 } else if (var
== "target_max_objects") {
6565 if (interr
.length()) {
6566 ss
<< "error parsing int '" << val
<< "': " << interr
;
6569 p
.target_max_objects
= n
;
6570 } else if (var
== "target_max_bytes") {
6571 if (interr
.length()) {
6572 ss
<< "error parsing int '" << val
<< "': " << interr
;
6575 p
.target_max_bytes
= n
;
6576 } else if (var
== "cache_target_dirty_ratio") {
6577 if (floaterr
.length()) {
6578 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6581 if (f
< 0 || f
> 1.0) {
6582 ss
<< "value must be in the range 0..1";
6585 p
.cache_target_dirty_ratio_micro
= uf
;
6586 } else if (var
== "cache_target_dirty_high_ratio") {
6587 if (floaterr
.length()) {
6588 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6591 if (f
< 0 || f
> 1.0) {
6592 ss
<< "value must be in the range 0..1";
6595 p
.cache_target_dirty_high_ratio_micro
= uf
;
6596 } else if (var
== "cache_target_full_ratio") {
6597 if (floaterr
.length()) {
6598 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6601 if (f
< 0 || f
> 1.0) {
6602 ss
<< "value must be in the range 0..1";
6605 p
.cache_target_full_ratio_micro
= uf
;
6606 } else if (var
== "cache_min_flush_age") {
6607 if (interr
.length()) {
6608 ss
<< "error parsing int '" << val
<< "': " << interr
;
6611 p
.cache_min_flush_age
= n
;
6612 } else if (var
== "cache_min_evict_age") {
6613 if (interr
.length()) {
6614 ss
<< "error parsing int '" << val
<< "': " << interr
;
6617 p
.cache_min_evict_age
= n
;
6618 } else if (var
== "min_read_recency_for_promote") {
6619 if (interr
.length()) {
6620 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6623 p
.min_read_recency_for_promote
= n
;
6624 } else if (var
== "hit_set_grade_decay_rate") {
6625 if (interr
.length()) {
6626 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6629 if (n
> 100 || n
< 0) {
6630 ss
<< "value out of range,valid range is 0 - 100";
6633 p
.hit_set_grade_decay_rate
= n
;
6634 } else if (var
== "hit_set_search_last_n") {
6635 if (interr
.length()) {
6636 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6639 if (n
> p
.hit_set_count
|| n
< 0) {
6640 ss
<< "value out of range,valid range is 0 - hit_set_count";
6643 p
.hit_set_search_last_n
= n
;
6644 } else if (var
== "min_write_recency_for_promote") {
6645 if (interr
.length()) {
6646 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6649 p
.min_write_recency_for_promote
= n
;
6650 } else if (var
== "fast_read") {
6651 if (p
.is_replicated()) {
6652 ss
<< "fast read is not supported in replication pool";
6655 if (val
== "true" || (interr
.empty() && n
== 1)) {
6657 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6658 p
.fast_read
= false;
6660 ss
<< "expecting value 'true', 'false', '0', or '1'";
6663 } else if (pool_opts_t::is_opt_name(var
)) {
6664 bool unset
= val
== "unset";
6665 if (var
== "compression_mode") {
6667 auto cmode
= Compressor::get_comp_mode_type(val
);
6669 ss
<< "unrecognized compression mode '" << val
<< "'";
6673 } else if (var
== "compression_algorithm") {
6675 auto alg
= Compressor::get_comp_alg_type(val
);
6677 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
6681 } else if (var
== "compression_required_ratio") {
6682 if (floaterr
.length()) {
6683 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
6686 if (f
< 0 || f
> 1) {
6687 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
6690 } else if (var
== "csum_type") {
6691 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
6693 ss
<< "unrecognized csum_type '" << val
<< "'";
6696 //preserve csum_type numeric value
6699 } else if (var
== "compression_max_blob_size" ||
6700 var
== "compression_min_blob_size" ||
6701 var
== "csum_max_block" ||
6702 var
== "csum_min_block") {
6703 if (interr
.length()) {
6704 ss
<< "error parsing int value '" << val
<< "': " << interr
;
6709 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
6710 switch (desc
.type
) {
6711 case pool_opts_t::STR
:
6713 p
.opts
.unset(desc
.key
);
6715 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
6718 case pool_opts_t::INT
:
6719 if (interr
.length()) {
6720 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6724 p
.opts
.unset(desc
.key
);
6726 p
.opts
.set(desc
.key
, static_cast<int>(n
));
6729 case pool_opts_t::DOUBLE
:
6730 if (floaterr
.length()) {
6731 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6735 p
.opts
.unset(desc
.key
);
6737 p
.opts
.set(desc
.key
, static_cast<double>(f
));
6741 assert(!"unknown type");
6744 ss
<< "unrecognized variable '" << var
<< "'";
6747 if (val
!= "unset") {
6748 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
6750 ss
<< "unset pool " << pool
<< " " << var
;
6752 p
.last_change
= pending_inc
.epoch
;
6753 pending_inc
.new_pools
[pool
] = p
;
6757 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
6758 map
<string
,cmd_vartype
> &cmdmap
,
6762 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
6763 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6765 ss
<< "unrecognized pool '" << pool_name
<< "'";
6769 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
6770 if (pending_inc
.new_pools
.count(pool
)) {
6771 p
= pending_inc
.new_pools
[pool
];
6775 cmd_getval(g_ceph_context
, cmdmap
, "app", app
);
6776 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
6778 if (boost::algorithm::ends_with(prefix
, "enable")) {
6780 ss
<< "application name must be provided";
6785 ss
<< "application must be enabled on base tier";
6790 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6792 if (!app_exists
&& !p
.application_metadata
.empty() &&
6793 force
!= "--yes-i-really-mean-it") {
6794 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
6795 << "application; pass --yes-i-really-mean-it to proceed anyway";
6799 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
6800 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
6801 << "max " << MAX_POOL_APPLICATIONS
;
6805 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
6806 ss
<< "application name '" << app
<< "' too long; max length "
6807 << MAX_POOL_APPLICATION_LENGTH
;
6812 p
.application_metadata
[app
] = {};
6814 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
6816 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
6818 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6820 if (force
!= "--yes-i-really-mean-it") {
6821 ss
<< "Are you SURE? Disabling an application within a pool might result "
6822 << "in loss of application functionality; pass "
6823 << "--yes-i-really-mean-it to proceed anyway";
6828 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
6830 return 0; // idempotent
6833 p
.application_metadata
.erase(app
);
6834 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
6836 } else if (boost::algorithm::ends_with(prefix
, "set")) {
6838 ss
<< "application metadata must be set on base tier";
6843 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
6849 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
6852 ss
<< "key must be provided";
6856 auto &app_keys
= p
.application_metadata
[app
];
6857 if (app_keys
.count(key
) == 0 &&
6858 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
6859 ss
<< "too many keys set for application '" << app
<< "' on pool '"
6860 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
6864 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
6865 ss
<< "key '" << app
<< "' too long; max length "
6866 << MAX_POOL_APPLICATION_LENGTH
;
6871 cmd_getval(g_ceph_context
, cmdmap
, "value", value
);
6872 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
6873 ss
<< "value '" << value
<< "' too long; max length "
6874 << MAX_POOL_APPLICATION_LENGTH
;
6878 p
.application_metadata
[app
][key
] = value
;
6879 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
6880 << value
<< "' on pool '" << pool_name
<< "'";
6881 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
6883 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
6889 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
6890 auto it
= p
.application_metadata
[app
].find(key
);
6891 if (it
== p
.application_metadata
[app
].end()) {
6892 ss
<< "application '" << app
<< "' on pool '" << pool_name
6893 << "' does not have key '" << key
<< "'";
6894 return 0; // idempotent
6897 p
.application_metadata
[app
].erase(it
);
6898 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
6899 << pool_name
<< "'";
6904 p
.last_change
= pending_inc
.epoch
;
6905 pending_inc
.new_pools
[pool
] = p
;
6909 int OSDMonitor::_prepare_command_osd_crush_remove(
6910 CrushWrapper
&newcrush
,
6919 err
= newcrush
.remove_item_under(g_ceph_context
, id
, ancestor
,
6922 err
= newcrush
.remove_item(g_ceph_context
, id
, unlink_only
);
6927 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
6929 pending_inc
.crush
.clear();
6930 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6933 int OSDMonitor::prepare_command_osd_crush_remove(
6934 CrushWrapper
&newcrush
,
6940 int err
= _prepare_command_osd_crush_remove(
6941 newcrush
, id
, ancestor
,
6942 has_ancestor
, unlink_only
);
6948 do_osd_crush_remove(newcrush
);
6953 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
6955 if (osdmap
.is_up(id
)) {
6959 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
6960 pending_inc
.new_uuid
[id
] = uuid_d();
6961 pending_metadata_rm
.insert(id
);
6962 pending_metadata
.erase(id
);
6967 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
6969 assert(existing_id
);
6972 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
6973 if (!osdmap
.exists(i
) &&
6974 pending_inc
.new_up_client
.count(i
) == 0 &&
6975 (pending_inc
.new_state
.count(i
) == 0 ||
6976 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
6982 if (pending_inc
.new_max_osd
< 0) {
6983 return osdmap
.get_max_osd();
6985 return pending_inc
.new_max_osd
;
6988 void OSDMonitor::do_osd_create(
6991 const string
& device_class
,
6994 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
6997 // We presume validation has been performed prior to calling this
6998 // function. We assert with prejudice.
7000 int32_t allocated_id
= -1; // declare here so we can jump
7001 int32_t existing_id
= -1;
7002 if (!uuid
.is_zero()) {
7003 existing_id
= osdmap
.identify_osd(uuid
);
7004 if (existing_id
>= 0) {
7005 assert(id
< 0 || id
== existing_id
);
7006 *new_id
= existing_id
;
7008 } else if (id
>= 0) {
7009 // uuid does not exist, and id has been provided, so just create
7016 // allocate a new id
7017 allocated_id
= _allocate_osd_id(&existing_id
);
7018 dout(10) << __func__
<< " allocated id " << allocated_id
7019 << " existing id " << existing_id
<< dendl
;
7020 if (existing_id
>= 0) {
7021 assert(existing_id
< osdmap
.get_max_osd());
7022 assert(allocated_id
< 0);
7023 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
7024 *new_id
= existing_id
;
7025 } else if (allocated_id
>= 0) {
7026 assert(existing_id
< 0);
7028 if (pending_inc
.new_max_osd
< 0) {
7029 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
7031 ++pending_inc
.new_max_osd
;
7033 *new_id
= pending_inc
.new_max_osd
- 1;
7034 assert(*new_id
== allocated_id
);
7036 assert(0 == "unexpected condition");
7040 if (device_class
.size()) {
7041 CrushWrapper newcrush
;
7042 _get_pending_crush(newcrush
);
7043 if (newcrush
.get_max_devices() < *new_id
+ 1) {
7044 newcrush
.set_max_devices(*new_id
+ 1);
7046 string name
= string("osd.") + stringify(*new_id
);
7047 if (!newcrush
.item_exists(*new_id
)) {
7048 newcrush
.set_item_name(*new_id
, name
);
7051 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
7053 derr
<< __func__
<< " failed to set " << name
<< " device_class "
7054 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
7056 // non-fatal... this might be a replay and we want to be idempotent.
7058 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
7060 pending_inc
.crush
.clear();
7061 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7064 dout(20) << __func__
<< " no device_class" << dendl
;
7067 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
7068 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
7069 pending_inc
.new_max_osd
= *new_id
+ 1;
7072 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
7073 if (!uuid
.is_zero())
7074 pending_inc
.new_uuid
[*new_id
] = uuid
;
7077 int OSDMonitor::validate_osd_create(
7080 const bool check_osd_exists
,
7081 int32_t* existing_id
,
7085 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
7086 << " check_osd_exists " << check_osd_exists
<< dendl
;
7088 assert(existing_id
);
7090 if (id
< 0 && uuid
.is_zero()) {
7091 // we have nothing to validate
7094 } else if (uuid
.is_zero()) {
7095 // we have an id but we will ignore it - because that's what
7096 // `osd create` does.
7101 * This function will be used to validate whether we are able to
7102 * create a new osd when the `uuid` is specified.
7104 * It will be used by both `osd create` and `osd new`, as the checks
7105 * are basically the same when it pertains to osd id and uuid validation.
7106 * However, `osd create` presumes an `uuid` is optional, for legacy
7107 * reasons, while `osd new` requires the `uuid` to be provided. This
7108 * means that `osd create` will not be idempotent if an `uuid` is not
7109 * provided, but we will always guarantee the idempotency of `osd new`.
7112 assert(!uuid
.is_zero());
7113 if (pending_inc
.identify_osd(uuid
) >= 0) {
7114 // osd is about to exist
7118 int32_t i
= osdmap
.identify_osd(uuid
);
7120 // osd already exists
7121 if (id
>= 0 && i
!= id
) {
7122 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
7125 // return a positive errno to distinguish between a blocking error
7126 // and an error we consider to not be a problem (i.e., this would be
7127 // an idempotent operation).
7133 if (pending_inc
.new_state
.count(id
)) {
7134 // osd is about to exist
7137 // we may not care if an osd exists if we are recreating a previously
7139 if (check_osd_exists
&& osdmap
.exists(id
)) {
7140 ss
<< "id " << id
<< " already in use and does not match uuid "
7148 int OSDMonitor::prepare_command_osd_create(
7151 int32_t* existing_id
,
7154 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
7155 assert(existing_id
);
7156 if (osdmap
.is_destroyed(id
)) {
7157 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
7162 if (uuid
.is_zero()) {
7163 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
7166 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
7169 int OSDMonitor::prepare_command_osd_new(
7171 const map
<string
,cmd_vartype
>& cmdmap
,
7172 const map
<string
,string
>& params
,
7180 assert(paxos
->is_plugged());
7182 dout(10) << __func__
<< " " << op
<< dendl
;
7184 /* validate command. abort now if something's wrong. */
7186 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
7188 * If `id` is not specified, we will identify any existing osd based
7189 * on `uuid`. Operation will be idempotent iff secrets match.
7191 * If `id` is specified, we will identify any existing osd based on
7192 * `uuid` and match against `id`. If they match, operation will be
7193 * idempotent iff secrets match.
7195 * `-i secrets.json` will be optional. If supplied, will be used
7196 * to check for idempotency when `id` and `uuid` match.
7198 * If `id` is not specified, and `uuid` does not exist, an id will
7199 * be found or allocated for the osd.
7201 * If `id` is specified, and the osd has been previously marked
7202 * as destroyed, then the `id` will be reused.
7204 if (!cmd_getval(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
7205 ss
<< "requires the OSD's UUID to be specified.";
7207 } else if (!uuid
.parse(uuidstr
.c_str())) {
7208 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
7212 if (cmd_getval(g_ceph_context
, cmdmap
, "id", id
) &&
7214 ss
<< "invalid OSD id; must be greater or equal than zero.";
7218 // are we running an `osd create`-like command, or recreating
7219 // a previously destroyed osd?
7221 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
7223 // we will care about `id` to assess whether osd is `destroyed`, or
7224 // to create a new osd.
7225 // we will need an `id` by the time we reach auth.
7227 int32_t existing_id
= -1;
7228 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
7231 bool may_be_idempotent
= false;
7232 if (err
== EEXIST
) {
7233 // this is idempotent from the osdmon's point-of-view
7234 may_be_idempotent
= true;
7235 assert(existing_id
>= 0);
7237 } else if (err
< 0) {
7241 if (!may_be_idempotent
) {
7242 // idempotency is out of the window. We are either creating a new
7243 // osd or recreating a destroyed osd.
7245 // We now need to figure out if we have an `id` (and if it's valid),
7246 // of find an `id` if we don't have one.
7248 // NOTE: we need to consider the case where the `id` is specified for
7249 // `osd create`, and we must honor it. So this means checking if
7250 // the `id` is destroyed, and if so assume the destroy; otherwise,
7251 // check if it `exists` - in which case we complain about not being
7252 // `destroyed`. In the end, if nothing fails, we must allow the
7253 // creation, so that we are compatible with `create`.
7254 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
7255 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
7256 ss
<< "OSD " << id
<< " has not yet been destroyed";
7258 } else if (id
< 0) {
7260 id
= _allocate_osd_id(&existing_id
);
7262 assert(existing_id
>= 0);
7265 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
7266 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
7267 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
7269 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
7273 assert(osdmap
.exists(id
));
7276 // we are now able to either create a brand new osd or reuse an existing
7277 // osd that has been previously destroyed.
7279 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
7281 if (may_be_idempotent
&& params
.empty()) {
7282 // nothing to do, really.
7283 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
7286 f
->open_object_section("created_osd");
7287 f
->dump_int("osdid", id
);
7295 string device_class
;
7296 auto p
= params
.find("crush_device_class");
7297 if (p
!= params
.end()) {
7298 device_class
= p
->second
;
7299 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
7301 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
7302 bool has_lockbox
= false;
7303 bool has_secrets
= params
.count("cephx_secret")
7304 || params
.count("cephx_lockbox_secret")
7305 || params
.count("dmcrypt_key");
7307 ConfigKeyService
*svc
= nullptr;
7308 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
7311 if (params
.count("cephx_secret") == 0) {
7312 ss
<< "requires a cephx secret.";
7315 cephx_secret
= params
.at("cephx_secret");
7317 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
7318 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
7320 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
7321 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
7323 if (has_lockbox_secret
&& has_dmcrypt_key
) {
7325 lockbox_secret
= params
.at("cephx_lockbox_secret");
7326 dmcrypt_key
= params
.at("dmcrypt_key");
7327 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
7328 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
7332 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
7334 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
7342 } else if (may_be_idempotent
&& err
!= EEXIST
) {
7343 // for this to be idempotent, `id` should already be >= 0; no need
7344 // to use validate_id.
7346 ss
<< "osd." << id
<< " exists but secrets do not match";
7351 svc
= (ConfigKeyService
*)mon
->config_key_service
;
7352 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
7355 } else if (may_be_idempotent
&& err
!= EEXIST
) {
7357 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
7362 assert(!has_secrets
|| !cephx_secret
.empty());
7363 assert(!has_lockbox
|| !lockbox_secret
.empty());
7365 if (may_be_idempotent
) {
7366 // we have nothing to do for either the osdmon or the authmon,
7367 // and we have no lockbox - so the config key service will not be
7368 // touched. This is therefore an idempotent operation, and we can
7369 // just return right away.
7370 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
7373 f
->open_object_section("created_osd");
7374 f
->dump_int("osdid", id
);
7381 assert(!may_be_idempotent
);
7385 assert(!cephx_secret
.empty());
7386 assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
7387 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
7389 err
= mon
->authmon()->do_osd_new(cephx_entity
,
7395 assert(nullptr != svc
);
7396 svc
->do_osd_new(uuid
, dmcrypt_key
);
7400 if (is_recreate_destroyed
) {
7402 assert(osdmap
.is_destroyed(id
));
7403 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
7404 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
| CEPH_OSD_NEW
;
7405 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
7406 // due to http://tracker.ceph.com/issues/20751 some clusters may
7407 // have UP set for non-existent OSDs; make sure it is cleared
7408 // for a newly created osd.
7409 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
7411 pending_inc
.new_uuid
[id
] = uuid
;
7414 int32_t new_id
= -1;
7415 do_osd_create(id
, uuid
, device_class
, &new_id
);
7416 assert(new_id
>= 0);
7417 assert(id
== new_id
);
7421 f
->open_object_section("created_osd");
7422 f
->dump_int("osdid", id
);
7431 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
7433 op
->mark_osdmon_event(__func__
);
7434 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
7436 map
<string
, cmd_vartype
> cmdmap
;
7437 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
7438 string rs
= ss
.str();
7439 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
7443 MonSession
*session
= m
->get_session();
7445 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
7449 return prepare_command_impl(op
, cmdmap
);
7452 static int parse_reweights(CephContext
*cct
,
7453 const map
<string
,cmd_vartype
> &cmdmap
,
7454 const OSDMap
& osdmap
,
7455 map
<int32_t, uint32_t>* weights
)
7458 if (!cmd_getval(g_ceph_context
, cmdmap
, "weights", weights_str
)) {
7461 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
7462 json_spirit::mValue json_value
;
7463 if (!json_spirit::read(weights_str
, json_value
)) {
7466 if (json_value
.type() != json_spirit::obj_type
) {
7469 const auto obj
= json_value
.get_obj();
7471 for (auto& osd_weight
: obj
) {
7472 auto osd_id
= std::stoi(osd_weight
.first
);
7473 if (!osdmap
.exists(osd_id
)) {
7476 if (osd_weight
.second
.type() != json_spirit::str_type
) {
7479 auto weight
= std::stoul(osd_weight
.second
.get_str());
7480 weights
->insert({osd_id
, weight
});
7482 } catch (const std::logic_error
& e
) {
7488 int OSDMonitor::prepare_command_osd_destroy(
7492 assert(paxos
->is_plugged());
7494 // we check if the osd exists for the benefit of `osd purge`, which may
7495 // have previously removed the osd. If the osd does not exist, return
7496 // -ENOENT to convey this, and let the caller deal with it.
7498 // we presume that all auth secrets and config keys were removed prior
7499 // to this command being called. if they exist by now, we also assume
7500 // they must have been created by some other command and do not pertain
7501 // to this non-existent osd.
7502 if (!osdmap
.exists(id
)) {
7503 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
7507 uuid_d uuid
= osdmap
.get_uuid(id
);
7508 dout(10) << __func__
<< " destroying osd." << id
7509 << " uuid " << uuid
<< dendl
;
7511 // if it has been destroyed, we assume our work here is done.
7512 if (osdmap
.is_destroyed(id
)) {
7513 ss
<< "destroyed osd." << id
;
7517 EntityName cephx_entity
, lockbox_entity
;
7518 bool idempotent_auth
= false, idempotent_cks
= false;
7520 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
7525 if (err
== -ENOENT
) {
7526 idempotent_auth
= true;
7532 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
7533 err
= svc
->validate_osd_destroy(id
, uuid
);
7535 assert(err
== -ENOENT
);
7537 idempotent_cks
= true;
7540 if (!idempotent_auth
) {
7541 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
7545 if (!idempotent_cks
) {
7546 svc
->do_osd_destroy(id
, uuid
);
7549 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
7550 pending_inc
.new_uuid
[id
] = uuid_d();
7552 // we can only propose_pending() once per service, otherwise we'll be
7553 // defying PaxosService and all laws of nature. Therefore, as we may
7554 // be used during 'osd purge', let's keep the caller responsible for
7560 int OSDMonitor::prepare_command_osd_purge(
7564 assert(paxos
->is_plugged());
7565 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
7567 assert(!osdmap
.is_up(id
));
7570 * This may look a bit weird, but this is what's going to happen:
7572 * 1. we make sure that removing from crush works
7573 * 2. we call `prepare_command_osd_destroy()`. If it returns an
7574 * error, then we abort the whole operation, as no updates
7575 * have been made. However, we this function will have
7576 * side-effects, thus we need to make sure that all operations
7577 * performed henceforth will *always* succeed.
7578 * 3. we call `prepare_command_osd_remove()`. Although this
7579 * function can return an error, it currently only checks if the
7580 * osd is up - and we have made sure that it is not so, so there
7581 * is no conflict, and it is effectively an update.
7582 * 4. finally, we call `do_osd_crush_remove()`, which will perform
7583 * the crush update we delayed from before.
7586 CrushWrapper newcrush
;
7587 _get_pending_crush(newcrush
);
7589 bool may_be_idempotent
= false;
7591 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
7592 if (err
== -ENOENT
) {
7594 may_be_idempotent
= true;
7595 } else if (err
< 0) {
7596 ss
<< "error removing osd." << id
<< " from crush";
7600 // no point destroying the osd again if it has already been marked destroyed
7601 if (!osdmap
.is_destroyed(id
)) {
7602 err
= prepare_command_osd_destroy(id
, ss
);
7604 if (err
== -ENOENT
) {
7610 may_be_idempotent
= false;
7615 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
7616 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
7617 << "we are idempotent." << dendl
;
7621 err
= prepare_command_osd_remove(id
);
7622 // we should not be busy, as we should have made sure this id is not up.
7625 do_osd_crush_remove(newcrush
);
7629 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
7630 map
<string
,cmd_vartype
> &cmdmap
)
7632 op
->mark_osdmon_event(__func__
);
7633 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
7641 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
7642 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
7645 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
7649 bool osdid_present
= false;
7650 if (prefix
!= "osd pg-temp" &&
7651 prefix
!= "osd pg-upmap" &&
7652 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
7653 osdid_present
= cmd_getval(g_ceph_context
, cmdmap
, "id", osdid
);
7655 if (osdid_present
) {
7657 oss
<< "osd." << osdid
;
7661 // Even if there's a pending state with changes that could affect
7662 // a command, considering that said state isn't yet committed, we
7663 // just don't care about those changes if the command currently being
7664 // handled acts as a no-op against the current committed state.
7665 // In a nutshell, we assume this command happens *before*.
7667 // Let me make this clearer:
7669 // - If we have only one client, and that client issues some
7670 // operation that would conflict with this operation but is
7671 // still on the pending state, then we would be sure that said
7672 // operation wouldn't have returned yet, so the client wouldn't
7673 // issue this operation (unless the client didn't wait for the
7674 // operation to finish, and that would be the client's own fault).
7676 // - If we have more than one client, each client will observe
7677 // whatever is the state at the moment of the commit. So, if we
7678 // have two clients, one issuing an unlink and another issuing a
7679 // link, and if the link happens while the unlink is still on the
7680 // pending state, from the link's point-of-view this is a no-op.
7681 // If different clients are issuing conflicting operations and
7682 // they care about that, then the clients should make sure they
7683 // enforce some kind of concurrency mechanism -- from our
7684 // perspective that's what Douglas Adams would call an SEP.
7686 // This should be used as a general guideline for most commands handled
7687 // in this function. Adapt as you see fit, but please bear in mind that
7688 // this is the expected behavior.
7691 if (prefix
== "osd setcrushmap" ||
7692 (prefix
== "osd crush set" && !osdid_present
)) {
7693 if (pending_inc
.crush
.length()) {
7694 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
7695 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7698 dout(10) << "prepare_command setting new crush map" << dendl
;
7699 bufferlist
data(m
->get_data());
7702 bufferlist::iterator
bl(data
.begin());
7705 catch (const std::exception
&e
) {
7707 ss
<< "Failed to parse crushmap: " << e
.what();
7711 int64_t prior_version
= 0;
7712 if (cmd_getval(g_ceph_context
, cmdmap
, "prior_version", prior_version
)) {
7713 if (prior_version
== osdmap
.get_crush_version() - 1) {
7714 // see if we are a resend of the last update. this is imperfect
7715 // (multiple racing updaters may not both get reliable success)
7716 // but we expect crush updaters (via this interface) to be rare-ish.
7717 bufferlist current
, proposed
;
7718 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
7719 crush
.encode(proposed
, mon
->get_quorum_con_features());
7720 if (current
.contents_equal(proposed
)) {
7721 dout(10) << __func__
7722 << " proposed matches current and version equals previous"
7725 ss
<< osdmap
.get_crush_version();
7729 if (prior_version
!= osdmap
.get_crush_version()) {
7731 ss
<< "prior_version " << prior_version
<< " != crush version "
7732 << osdmap
.get_crush_version();
7737 if (crush
.has_legacy_rule_ids()) {
7739 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
7742 if (!validate_crush_against_features(&crush
, ss
)) {
7747 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
7752 if (g_conf
->mon_osd_crush_smoke_test
) {
7753 // sanity check: test some inputs to make sure this map isn't
7755 dout(10) << " testing map" << dendl
;
7757 CrushTester
tester(crush
, ess
);
7758 tester
.set_min_x(0);
7759 tester
.set_max_x(50);
7760 auto start
= ceph::coarse_mono_clock::now();
7761 int r
= tester
.test_with_fork(g_conf
->mon_lease
);
7762 auto duration
= ceph::coarse_mono_clock::now() - start
;
7764 dout(10) << " tester.test_with_fork returns " << r
7765 << ": " << ess
.str() << dendl
;
7766 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
7770 dout(10) << __func__
<< " crush somke test duration: "
7771 << duration
<< ", result: " << ess
.str() << dendl
;
7774 pending_inc
.crush
= data
;
7775 ss
<< osdmap
.get_crush_version() + 1;
7778 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
7779 CrushWrapper newcrush
;
7780 _get_pending_crush(newcrush
);
7781 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
7783 if (newcrush
.bucket_exists(bid
) &&
7784 newcrush
.get_bucket_alg(bid
)) {
7785 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
7786 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
7789 if (!validate_crush_against_features(&newcrush
, ss
)) {
7793 pending_inc
.crush
.clear();
7794 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7795 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7796 get_last_committed() + 1));
7798 } else if (prefix
== "osd crush set-device-class") {
7799 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7800 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7801 << "luminous' before using crush device classes";
7806 string device_class
;
7807 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
7808 err
= -EINVAL
; // no value!
7813 vector
<string
> idvec
;
7814 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
7815 CrushWrapper newcrush
;
7816 _get_pending_crush(newcrush
);
7818 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
7822 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
7823 osdmap
.get_all_osds(osds
);
7826 // try traditional single osd way
7827 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
7829 // ss has reason for failure
7830 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
7837 for (auto &osd
: osds
) {
7838 if (!osdmap
.exists(osd
)) {
7839 ss
<< "osd." << osd
<< " does not exist. ";
7844 oss
<< "osd." << osd
;
7845 string name
= oss
.str();
7847 if (newcrush
.get_max_devices() < osd
+ 1) {
7848 newcrush
.set_max_devices(osd
+ 1);
7851 if (newcrush
.item_exists(osd
)) {
7852 action
= "updating";
7854 action
= "creating";
7855 newcrush
.set_item_name(osd
, name
);
7858 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
7859 << "' device_class '" << device_class
<< "'"
7861 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
7865 if (err
== 0 && !_have_pending_crush()) {
7867 // for single osd only, wildcard makes too much noise
7868 ss
<< "set-device-class item id " << osd
<< " name '" << name
7869 << "' device_class '" << device_class
<< "': no change";
7872 updated
.insert(osd
);
7877 if (!updated
.empty()) {
7878 pending_inc
.crush
.clear();
7879 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7880 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
7882 wait_for_finished_proposal(op
,
7883 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
7887 } else if (prefix
== "osd crush rm-device-class") {
7889 vector
<string
> idvec
;
7890 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
7891 CrushWrapper newcrush
;
7892 _get_pending_crush(newcrush
);
7895 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
7900 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
7901 osdmap
.get_all_osds(osds
);
7904 // try traditional single osd way
7905 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
7907 // ss has reason for failure
7908 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
7915 for (auto &osd
: osds
) {
7916 if (!osdmap
.exists(osd
)) {
7917 ss
<< "osd." << osd
<< " does not exist. ";
7921 auto class_name
= newcrush
.get_item_class(osd
);
7923 ss
<< "osd." << osd
<< " belongs to no class, ";
7926 // note that we do not verify if class_is_in_use here
7927 // in case the device is misclassified and user wants
7928 // to overridely reset...
7930 err
= newcrush
.remove_device_class(g_ceph_context
, osd
, &ss
);
7932 // ss has reason for failure
7935 updated
.insert(osd
);
7939 if (!updated
.empty()) {
7940 pending_inc
.crush
.clear();
7941 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7942 ss
<< "done removing class of osd(s): " << updated
;
7944 wait_for_finished_proposal(op
,
7945 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
7948 } else if (prefix
== "osd crush class rename") {
7949 string srcname
, dstname
;
7950 if (!cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
)) {
7954 if (!cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
)) {
7959 CrushWrapper newcrush
;
7960 _get_pending_crush(newcrush
);
7961 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
7962 // suppose this is a replay and return success
7963 // so command is idempotent
7964 ss
<< "already renamed to '" << dstname
<< "'";
7969 err
= newcrush
.rename_class(srcname
, dstname
);
7971 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
7972 << cpp_strerror(err
);
7976 pending_inc
.crush
.clear();
7977 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7978 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
7980 } else if (prefix
== "osd crush add-bucket") {
7981 // os crush add-bucket <name> <type>
7982 string name
, typestr
;
7983 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7984 cmd_getval(g_ceph_context
, cmdmap
, "type", typestr
);
7986 if (!_have_pending_crush() &&
7987 _get_stable_crush().name_exists(name
)) {
7988 ss
<< "bucket '" << name
<< "' already exists";
7992 CrushWrapper newcrush
;
7993 _get_pending_crush(newcrush
);
7995 if (newcrush
.name_exists(name
)) {
7996 ss
<< "bucket '" << name
<< "' already exists";
7999 int type
= newcrush
.get_type_id(typestr
);
8001 ss
<< "type '" << typestr
<< "' does not exist";
8006 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
8011 err
= newcrush
.add_bucket(0, 0,
8012 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
8015 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
8018 err
= newcrush
.set_item_name(bucketno
, name
);
8020 ss
<< "error setting bucket name to '" << name
<< "'";
8024 pending_inc
.crush
.clear();
8025 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8026 ss
<< "added bucket " << name
<< " type " << typestr
8029 } else if (prefix
== "osd crush rename-bucket") {
8030 string srcname
, dstname
;
8031 cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
);
8032 cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
);
8034 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
8035 if (err
== -EALREADY
) // equivalent to success for idempotency
8041 } else if (prefix
== "osd crush weight-set create" ||
8042 prefix
== "osd crush weight-set create-compat") {
8043 CrushWrapper newcrush
;
8044 _get_pending_crush(newcrush
);
8047 if (newcrush
.has_non_straw2_buckets()) {
8048 ss
<< "crush map contains one or more bucket(s) that are not straw2";
8052 if (prefix
== "osd crush weight-set create") {
8053 if (osdmap
.require_min_compat_client
> 0 &&
8054 osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
8055 ss
<< "require_min_compat_client "
8056 << ceph_release_name(osdmap
.require_min_compat_client
)
8057 << " < luminous, which is required for per-pool weight-sets. "
8058 << "Try 'ceph osd set-require-min-compat-client luminous' "
8059 << "before using the new interface";
8063 string poolname
, mode
;
8064 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolname
);
8065 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
8067 ss
<< "pool '" << poolname
<< "' not found";
8071 cmd_getval(g_ceph_context
, cmdmap
, "mode", mode
);
8072 if (mode
!= "flat" && mode
!= "positional") {
8073 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
8077 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
8079 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
8082 newcrush
.create_choose_args(pool
, positions
);
8083 pending_inc
.crush
.clear();
8084 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8087 } else if (prefix
== "osd crush weight-set rm" ||
8088 prefix
== "osd crush weight-set rm-compat") {
8089 CrushWrapper newcrush
;
8090 _get_pending_crush(newcrush
);
8092 if (prefix
== "osd crush weight-set rm") {
8094 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolname
);
8095 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
8097 ss
<< "pool '" << poolname
<< "' not found";
8102 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
8104 newcrush
.rm_choose_args(pool
);
8105 pending_inc
.crush
.clear();
8106 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8109 } else if (prefix
== "osd crush weight-set reweight" ||
8110 prefix
== "osd crush weight-set reweight-compat") {
8111 string poolname
, item
;
8112 vector
<double> weight
;
8113 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolname
);
8114 cmd_getval(g_ceph_context
, cmdmap
, "item", item
);
8115 cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
);
8116 CrushWrapper newcrush
;
8117 _get_pending_crush(newcrush
);
8119 if (prefix
== "osd crush weight-set reweight") {
8120 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
8122 ss
<< "pool '" << poolname
<< "' not found";
8126 if (!newcrush
.have_choose_args(pool
)) {
8127 ss
<< "no weight-set for pool '" << poolname
<< "'";
8131 auto arg_map
= newcrush
.choose_args_get(pool
);
8132 int positions
= newcrush
.get_choose_args_positions(arg_map
);
8133 if (weight
.size() != (size_t)positions
) {
8134 ss
<< "must specify exact " << positions
<< " weight values";
8139 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
8140 if (!newcrush
.have_choose_args(pool
)) {
8141 ss
<< "no backward-compatible weight-set";
8146 if (!newcrush
.name_exists(item
)) {
8147 ss
<< "item '" << item
<< "' does not exist";
8151 err
= newcrush
.choose_args_adjust_item_weightf(
8153 newcrush
.choose_args_get(pool
),
8154 newcrush
.get_item_id(item
),
8161 pending_inc
.crush
.clear();
8162 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8164 } else if (osdid_present
&&
8165 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
8166 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
8167 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
8168 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
8170 if (!osdmap
.exists(osdid
)) {
8172 ss
<< name
<< " does not exist. Create it before updating the crush map";
8177 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
8178 ss
<< "unable to parse weight value '"
8179 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8185 vector
<string
> argvec
;
8186 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
8187 map
<string
,string
> loc
;
8188 CrushWrapper::parse_loc_map(argvec
, &loc
);
8190 if (prefix
== "osd crush set"
8191 && !_get_stable_crush().item_exists(osdid
)) {
8193 ss
<< "unable to set item id " << osdid
<< " name '" << name
8194 << "' weight " << weight
<< " at location " << loc
8195 << ": does not exist";
8199 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
8200 << name
<< "' weight " << weight
<< " at location "
8202 CrushWrapper newcrush
;
8203 _get_pending_crush(newcrush
);
8206 if (prefix
== "osd crush set" ||
8207 newcrush
.check_item_loc(g_ceph_context
, osdid
, loc
, (int *)NULL
)) {
8209 err
= newcrush
.update_item(g_ceph_context
, osdid
, weight
, name
, loc
);
8212 err
= newcrush
.insert_item(g_ceph_context
, osdid
, weight
, name
, loc
);
8220 if (err
== 0 && !_have_pending_crush()) {
8221 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
8222 << weight
<< " at location " << loc
<< ": no change";
8226 pending_inc
.crush
.clear();
8227 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8228 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
8229 << weight
<< " at location " << loc
<< " to crush map";
8231 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8232 get_last_committed() + 1));
8235 } else if (prefix
== "osd crush create-or-move") {
8237 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
8238 if (!osdmap
.exists(osdid
)) {
8240 ss
<< name
<< " does not exist. create it before updating the crush map";
8245 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
8246 ss
<< "unable to parse weight value '"
8247 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8253 vector
<string
> argvec
;
8254 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
8255 map
<string
,string
> loc
;
8256 CrushWrapper::parse_loc_map(argvec
, &loc
);
8258 dout(0) << "create-or-move crush item name '" << name
<< "' initial_weight " << weight
8259 << " at location " << loc
<< dendl
;
8261 CrushWrapper newcrush
;
8262 _get_pending_crush(newcrush
);
8264 err
= newcrush
.create_or_move_item(g_ceph_context
, osdid
, weight
, name
, loc
);
8266 ss
<< "create-or-move updated item name '" << name
<< "' weight " << weight
8267 << " at location " << loc
<< " to crush map";
8271 pending_inc
.crush
.clear();
8272 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8273 ss
<< "create-or-move updating item name '" << name
<< "' weight " << weight
8274 << " at location " << loc
<< " to crush map";
8276 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8277 get_last_committed() + 1));
8282 } else if (prefix
== "osd crush move") {
8284 // osd crush move <name> <loc1> [<loc2> ...]
8287 vector
<string
> argvec
;
8288 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8289 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
8290 map
<string
,string
> loc
;
8291 CrushWrapper::parse_loc_map(argvec
, &loc
);
8293 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
8294 CrushWrapper newcrush
;
8295 _get_pending_crush(newcrush
);
8297 if (!newcrush
.name_exists(name
)) {
8299 ss
<< "item " << name
<< " does not exist";
8302 int id
= newcrush
.get_item_id(name
);
8304 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
8306 err
= newcrush
.create_or_move_item(g_ceph_context
, id
, 0, name
, loc
);
8308 err
= newcrush
.move_bucket(g_ceph_context
, id
, loc
);
8311 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
8312 pending_inc
.crush
.clear();
8313 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8315 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8316 get_last_committed() + 1));
8320 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
8324 } else if (prefix
== "osd crush swap-bucket") {
8325 string source
, dest
, force
;
8326 cmd_getval(g_ceph_context
, cmdmap
, "source", source
);
8327 cmd_getval(g_ceph_context
, cmdmap
, "dest", dest
);
8328 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
8329 CrushWrapper newcrush
;
8330 _get_pending_crush(newcrush
);
8331 if (!newcrush
.name_exists(source
)) {
8332 ss
<< "source item " << source
<< " does not exist";
8336 if (!newcrush
.name_exists(dest
)) {
8337 ss
<< "dest item " << dest
<< " does not exist";
8341 int sid
= newcrush
.get_item_id(source
);
8342 int did
= newcrush
.get_item_id(dest
);
8344 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 &&
8345 force
!= "--yes-i-really-mean-it") {
8346 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8350 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
8351 force
!= "--yes-i-really-mean-it") {
8352 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
8353 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
8354 << "; pass --yes-i-really-mean-it to proceed anyway";
8358 int r
= newcrush
.swap_bucket(g_ceph_context
, sid
, did
);
8360 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
8364 ss
<< "swapped bucket of " << source
<< " to " << dest
;
8365 pending_inc
.crush
.clear();
8366 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8367 wait_for_finished_proposal(op
,
8368 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
8369 get_last_committed() + 1));
8371 } else if (prefix
== "osd crush link") {
8372 // osd crush link <name> <loc1> [<loc2> ...]
8374 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8375 vector
<string
> argvec
;
8376 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
8377 map
<string
,string
> loc
;
8378 CrushWrapper::parse_loc_map(argvec
, &loc
);
8380 // Need an explicit check for name_exists because get_item_id returns
8382 int id
= osdmap
.crush
->get_item_id(name
);
8383 if (!osdmap
.crush
->name_exists(name
)) {
8385 ss
<< "item " << name
<< " does not exist";
8388 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
8390 if (osdmap
.crush
->check_item_loc(g_ceph_context
, id
, loc
, (int*) NULL
)) {
8391 ss
<< "no need to move item id " << id
<< " name '" << name
8392 << "' to location " << loc
<< " in crush map";
8397 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
8398 CrushWrapper newcrush
;
8399 _get_pending_crush(newcrush
);
8401 if (!newcrush
.name_exists(name
)) {
8403 ss
<< "item " << name
<< " does not exist";
8406 int id
= newcrush
.get_item_id(name
);
8407 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
8408 err
= newcrush
.link_bucket(g_ceph_context
, id
, loc
);
8410 ss
<< "linked item id " << id
<< " name '" << name
8411 << "' to location " << loc
<< " in crush map";
8412 pending_inc
.crush
.clear();
8413 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8415 ss
<< "cannot link item id " << id
<< " name '" << name
8416 << "' to location " << loc
;
8420 ss
<< "no need to move item id " << id
<< " name '" << name
8421 << "' to location " << loc
<< " in crush map";
8425 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
8426 get_last_committed() + 1));
8428 } else if (prefix
== "osd crush rm" ||
8429 prefix
== "osd crush remove" ||
8430 prefix
== "osd crush unlink") {
8432 // osd crush rm <id> [ancestor]
8433 CrushWrapper newcrush
;
8434 _get_pending_crush(newcrush
);
8437 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8439 if (!osdmap
.crush
->name_exists(name
)) {
8441 ss
<< "device '" << name
<< "' does not appear in the crush map";
8444 if (!newcrush
.name_exists(name
)) {
8446 ss
<< "device '" << name
<< "' does not appear in the crush map";
8448 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8449 get_last_committed() + 1));
8452 int id
= newcrush
.get_item_id(name
);
8455 bool unlink_only
= prefix
== "osd crush unlink";
8456 string ancestor_str
;
8457 if (cmd_getval(g_ceph_context
, cmdmap
, "ancestor", ancestor_str
)) {
8458 if (!newcrush
.name_exists(ancestor_str
)) {
8460 ss
<< "ancestor item '" << ancestor_str
8461 << "' does not appear in the crush map";
8464 ancestor
= newcrush
.get_item_id(ancestor_str
);
8467 err
= prepare_command_osd_crush_remove(
8470 (ancestor
< 0), unlink_only
);
8472 if (err
== -ENOENT
) {
8473 ss
<< "item " << id
<< " does not appear in that position";
8478 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
8480 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8481 get_last_committed() + 1));
8486 } else if (prefix
== "osd crush reweight-all") {
8487 CrushWrapper newcrush
;
8488 _get_pending_crush(newcrush
);
8490 newcrush
.reweight(g_ceph_context
);
8491 pending_inc
.crush
.clear();
8492 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8493 ss
<< "reweighted crush hierarchy";
8495 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8496 get_last_committed() + 1));
8498 } else if (prefix
== "osd crush reweight") {
8499 // osd crush reweight <name> <weight>
8500 CrushWrapper newcrush
;
8501 _get_pending_crush(newcrush
);
8504 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8505 if (!newcrush
.name_exists(name
)) {
8507 ss
<< "device '" << name
<< "' does not appear in the crush map";
8511 int id
= newcrush
.get_item_id(name
);
8513 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
8518 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
8519 ss
<< "unable to parse weight value '"
8520 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8525 err
= newcrush
.adjust_item_weightf(g_ceph_context
, id
, w
);
8528 pending_inc
.crush
.clear();
8529 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8530 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
8533 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8534 get_last_committed() + 1));
8536 } else if (prefix
== "osd crush reweight-subtree") {
8537 // osd crush reweight <name> <weight>
8538 CrushWrapper newcrush
;
8539 _get_pending_crush(newcrush
);
8542 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8543 if (!newcrush
.name_exists(name
)) {
8545 ss
<< "device '" << name
<< "' does not appear in the crush map";
8549 int id
= newcrush
.get_item_id(name
);
8551 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
8556 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
8557 ss
<< "unable to parse weight value '"
8558 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8563 err
= newcrush
.adjust_subtree_weightf(g_ceph_context
, id
, w
);
8566 pending_inc
.crush
.clear();
8567 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8568 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
8571 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8572 get_last_committed() + 1));
8574 } else if (prefix
== "osd crush tunables") {
8575 CrushWrapper newcrush
;
8576 _get_pending_crush(newcrush
);
8580 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
8581 if (profile
== "legacy" || profile
== "argonaut") {
8582 newcrush
.set_tunables_legacy();
8583 } else if (profile
== "bobtail") {
8584 newcrush
.set_tunables_bobtail();
8585 } else if (profile
== "firefly") {
8586 newcrush
.set_tunables_firefly();
8587 } else if (profile
== "hammer") {
8588 newcrush
.set_tunables_hammer();
8589 } else if (profile
== "jewel") {
8590 newcrush
.set_tunables_jewel();
8591 } else if (profile
== "optimal") {
8592 newcrush
.set_tunables_optimal();
8593 } else if (profile
== "default") {
8594 newcrush
.set_tunables_default();
8596 ss
<< "unrecognized profile '" << profile
<< "'";
8601 if (!validate_crush_against_features(&newcrush
, ss
)) {
8606 pending_inc
.crush
.clear();
8607 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8608 ss
<< "adjusted tunables profile to " << profile
;
8610 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8611 get_last_committed() + 1));
8613 } else if (prefix
== "osd crush set-tunable") {
8614 CrushWrapper newcrush
;
8615 _get_pending_crush(newcrush
);
8619 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
8622 if (!cmd_getval(g_ceph_context
, cmdmap
, "value", value
)) {
8624 ss
<< "failed to parse integer value " << cmd_vartype_stringify(cmdmap
["value"]);
8628 if (tunable
== "straw_calc_version") {
8629 if (value
!= 0 && value
!= 1) {
8630 ss
<< "value must be 0 or 1; got " << value
;
8634 newcrush
.set_straw_calc_version(value
);
8636 ss
<< "unrecognized tunable '" << tunable
<< "'";
8641 if (!validate_crush_against_features(&newcrush
, ss
)) {
8646 pending_inc
.crush
.clear();
8647 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8648 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
8650 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8651 get_last_committed() + 1));
8654 } else if (prefix
== "osd crush rule create-simple") {
8655 string name
, root
, type
, mode
;
8656 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8657 cmd_getval(g_ceph_context
, cmdmap
, "root", root
);
8658 cmd_getval(g_ceph_context
, cmdmap
, "type", type
);
8659 cmd_getval(g_ceph_context
, cmdmap
, "mode", mode
);
8663 if (osdmap
.crush
->rule_exists(name
)) {
8664 // The name is uniquely associated to a ruleid and the rule it contains
8665 // From the user point of view, the rule is more meaningfull.
8666 ss
<< "rule " << name
<< " already exists";
8671 CrushWrapper newcrush
;
8672 _get_pending_crush(newcrush
);
8674 if (newcrush
.rule_exists(name
)) {
8675 // The name is uniquely associated to a ruleid and the rule it contains
8676 // From the user point of view, the rule is more meaningfull.
8677 ss
<< "rule " << name
<< " already exists";
8680 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
8681 pg_pool_t::TYPE_REPLICATED
, &ss
);
8687 pending_inc
.crush
.clear();
8688 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8691 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8692 get_last_committed() + 1));
8695 } else if (prefix
== "osd crush rule create-replicated") {
8696 string name
, root
, type
, device_class
;
8697 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8698 cmd_getval(g_ceph_context
, cmdmap
, "root", root
);
8699 cmd_getval(g_ceph_context
, cmdmap
, "type", type
);
8700 cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
);
8702 if (!device_class
.empty()) {
8703 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8704 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8705 << "luminous' before using crush device classes";
8711 if (osdmap
.crush
->rule_exists(name
)) {
8712 // The name is uniquely associated to a ruleid and the rule it contains
8713 // From the user point of view, the rule is more meaningfull.
8714 ss
<< "rule " << name
<< " already exists";
8719 CrushWrapper newcrush
;
8720 _get_pending_crush(newcrush
);
8722 if (newcrush
.rule_exists(name
)) {
8723 // The name is uniquely associated to a ruleid and the rule it contains
8724 // From the user point of view, the rule is more meaningfull.
8725 ss
<< "rule " << name
<< " already exists";
8728 int ruleno
= newcrush
.add_simple_rule(
8729 name
, root
, type
, device_class
,
8730 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
8736 pending_inc
.crush
.clear();
8737 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8740 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8741 get_last_committed() + 1));
8744 } else if (prefix
== "osd erasure-code-profile rm") {
8746 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8748 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
8751 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
8756 if (osdmap
.has_erasure_code_profile(name
) ||
8757 pending_inc
.new_erasure_code_profiles
.count(name
)) {
8758 if (osdmap
.has_erasure_code_profile(name
)) {
8759 pending_inc
.old_erasure_code_profiles
.push_back(name
);
8761 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
8762 pending_inc
.new_erasure_code_profiles
.erase(name
);
8766 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8767 get_last_committed() + 1));
8770 ss
<< "erasure-code-profile " << name
<< " does not exist";
8775 } else if (prefix
== "osd erasure-code-profile set") {
8777 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8778 vector
<string
> profile
;
8779 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
8781 if (profile
.size() > 0 && profile
.back() == "--force") {
8787 map
<string
,string
> profile_map
;
8788 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
8791 if (profile_map
.find("plugin") == profile_map
.end()) {
8792 ss
<< "erasure-code-profile " << profile_map
8793 << " must contain a plugin entry" << std::endl
;
8797 string plugin
= profile_map
["plugin"];
8799 if (pending_inc
.has_erasure_code_profile(name
)) {
8800 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
8803 if (plugin
== "isa" || plugin
== "lrc") {
8804 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
, ss
);
8809 } else if (plugin
== "shec") {
8810 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
, ss
);
8816 err
= normalize_profile(name
, profile_map
, force
, &ss
);
8820 if (osdmap
.has_erasure_code_profile(name
)) {
8821 ErasureCodeProfile existing_profile_map
=
8822 osdmap
.get_erasure_code_profile(name
);
8823 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
8827 if (existing_profile_map
== profile_map
) {
8833 ss
<< "will not override erasure code profile " << name
8834 << " because the existing profile "
8835 << existing_profile_map
8836 << " is different from the proposed profile "
8842 dout(20) << "erasure code profile set " << name
<< "="
8843 << profile_map
<< dendl
;
8844 pending_inc
.set_erasure_code_profile(name
, profile_map
);
8848 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8849 get_last_committed() + 1));
8852 } else if (prefix
== "osd crush rule create-erasure") {
8853 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
8858 string name
, poolstr
;
8859 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8861 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
8863 profile
= "default";
8864 if (profile
== "default") {
8865 if (!osdmap
.has_erasure_code_profile(profile
)) {
8866 if (pending_inc
.has_erasure_code_profile(profile
)) {
8867 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
8871 map
<string
,string
> profile_map
;
8872 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
8877 err
= normalize_profile(name
, profile_map
, true, &ss
);
8880 dout(20) << "erasure code profile set " << profile
<< "="
8881 << profile_map
<< dendl
;
8882 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
8888 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
8891 case -EEXIST
: // return immediately
8892 ss
<< "rule " << name
<< " already exists";
8896 case -EALREADY
: // wait for pending to be proposed
8897 ss
<< "rule " << name
<< " already exists";
8900 default: // non recoverable error
8905 ss
<< "created rule " << name
<< " at " << rule
;
8909 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8910 get_last_committed() + 1));
8913 } else if (prefix
== "osd crush rule rm") {
8915 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8917 if (!osdmap
.crush
->rule_exists(name
)) {
8918 ss
<< "rule " << name
<< " does not exist";
8923 CrushWrapper newcrush
;
8924 _get_pending_crush(newcrush
);
8926 if (!newcrush
.rule_exists(name
)) {
8927 ss
<< "rule " << name
<< " does not exist";
8930 int ruleno
= newcrush
.get_rule_id(name
);
8931 assert(ruleno
>= 0);
8933 // make sure it is not in use.
8934 // FIXME: this is ok in some situations, but let's not bother with that
8936 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
8937 if (osdmap
.crush_rule_in_use(ruleset
)) {
8938 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
8943 err
= newcrush
.remove_rule(ruleno
);
8948 pending_inc
.crush
.clear();
8949 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8952 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8953 get_last_committed() + 1));
8956 } else if (prefix
== "osd crush rule rename") {
8959 cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
);
8960 cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
);
8961 if (srcname
.empty() || dstname
.empty()) {
8962 ss
<< "must specify both source rule name and destination rule name";
8966 if (srcname
== dstname
) {
8967 ss
<< "destination rule name is equal to source rule name";
8972 CrushWrapper newcrush
;
8973 _get_pending_crush(newcrush
);
8974 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
8975 // srcname does not exist and dstname already exists
8976 // suppose this is a replay and return success
8977 // (so this command is idempotent)
8978 ss
<< "already renamed to '" << dstname
<< "'";
8983 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
8985 // ss has reason for failure
8988 pending_inc
.crush
.clear();
8989 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8991 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8992 get_last_committed() + 1));
8995 } else if (prefix
== "osd setmaxosd") {
8997 if (!cmd_getval(g_ceph_context
, cmdmap
, "newmax", newmax
)) {
8998 ss
<< "unable to parse 'newmax' value '"
8999 << cmd_vartype_stringify(cmdmap
["newmax"]) << "'";
9004 if (newmax
> g_conf
->mon_max_osd
) {
9006 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
9007 << g_conf
->mon_max_osd
<< ")";
9011 // Don't allow shrinking OSD number as this will cause data loss
9012 // and may cause kernel crashes.
9013 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
9014 if (newmax
< osdmap
.get_max_osd()) {
9015 // Check if the OSDs exist between current max and new value.
9016 // If there are any OSDs exist, then don't allow shrinking number
9018 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
9019 if (osdmap
.exists(i
)) {
9021 ss
<< "cannot shrink max_osd to " << newmax
9022 << " because osd." << i
<< " (and possibly others) still in use";
9028 pending_inc
.new_max_osd
= newmax
;
9029 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
9031 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9032 get_last_committed() + 1));
9035 } else if (prefix
== "osd set-full-ratio" ||
9036 prefix
== "osd set-backfillfull-ratio" ||
9037 prefix
== "osd set-nearfull-ratio") {
9038 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
9039 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9040 << "luminous' before using the new interface";
9045 if (!cmd_getval(g_ceph_context
, cmdmap
, "ratio", n
)) {
9046 ss
<< "unable to parse 'ratio' value '"
9047 << cmd_vartype_stringify(cmdmap
["ratio"]) << "'";
9051 if (prefix
== "osd set-full-ratio")
9052 pending_inc
.new_full_ratio
= n
;
9053 else if (prefix
== "osd set-backfillfull-ratio")
9054 pending_inc
.new_backfillfull_ratio
= n
;
9055 else if (prefix
== "osd set-nearfull-ratio")
9056 pending_inc
.new_nearfull_ratio
= n
;
9057 ss
<< prefix
<< " " << n
;
9059 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9060 get_last_committed() + 1));
9062 } else if (prefix
== "osd set-require-min-compat-client") {
9063 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
9064 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9065 << "luminous' before using the new interface";
9070 cmd_getval(g_ceph_context
, cmdmap
, "version", v
);
9071 int vno
= ceph_release_from_name(v
.c_str());
9073 ss
<< "version " << v
<< " is not recognized";
9078 newmap
.deepish_copy_from(osdmap
);
9079 newmap
.apply_incremental(pending_inc
);
9080 newmap
.require_min_compat_client
= vno
;
9081 auto mvno
= newmap
.get_min_compat_client();
9083 ss
<< "osdmap current utilizes features that require "
9084 << ceph_release_name(mvno
)
9085 << "; cannot set require_min_compat_client below that to "
9086 << ceph_release_name(vno
);
9091 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
9092 if (sure
!= "--yes-i-really-mean-it") {
9094 mon
->get_combined_feature_map(&m
);
9095 uint64_t features
= ceph_release_features(vno
);
9099 CEPH_ENTITY_TYPE_CLIENT
,
9100 CEPH_ENTITY_TYPE_MDS
,
9101 CEPH_ENTITY_TYPE_MGR
}) {
9102 auto p
= m
.m
.find(type
);
9103 if (p
== m
.m
.end()) {
9106 for (auto& q
: p
->second
) {
9107 uint64_t missing
= ~q
.first
& features
;
9110 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
9115 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
9116 << "(s) look like " << ceph_release_name(
9117 ceph_release_from_features(q
.first
))
9118 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
9124 ss
<< "; add --yes-i-really-mean-it to do it anyway";
9129 ss
<< "set require_min_compat_client to " << ceph_release_name(vno
);
9130 pending_inc
.new_require_min_compat_client
= vno
;
9132 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9133 get_last_committed() + 1));
9135 } else if (prefix
== "osd pause") {
9136 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9138 } else if (prefix
== "osd unpause") {
9139 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9141 } else if (prefix
== "osd set") {
9143 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
9145 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
9147 return prepare_set_flag(op
, CEPH_OSDMAP_FULL
);
9148 else if (key
== "pause")
9149 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9150 else if (key
== "noup")
9151 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
9152 else if (key
== "nodown")
9153 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
9154 else if (key
== "noout")
9155 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
9156 else if (key
== "noin")
9157 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
9158 else if (key
== "nobackfill")
9159 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
9160 else if (key
== "norebalance")
9161 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
9162 else if (key
== "norecover")
9163 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
9164 else if (key
== "noscrub")
9165 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
9166 else if (key
== "nodeep-scrub")
9167 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
9168 else if (key
== "notieragent")
9169 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
9170 else if (key
== "sortbitwise") {
9171 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9172 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9173 << "--yes-i-really-mean-it if you really wish to continue.";
9177 if ((osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)
9178 || sure
== "--yes-i-really-mean-it") {
9179 return prepare_set_flag(op
, CEPH_OSDMAP_SORTBITWISE
);
9181 ss
<< "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
9185 } else if (key
== "recovery_deletes") {
9186 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9187 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9188 << "--yes-i-really-mean-it if you really wish to continue.";
9192 if (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_RECOVERY_DELETES
)
9193 || sure
== "--yes-i-really-mean-it") {
9194 return prepare_set_flag(op
, CEPH_OSDMAP_RECOVERY_DELETES
);
9196 ss
<< "not all up OSDs have OSD_RECOVERY_DELETES feature";
9200 } else if (key
== "require_jewel_osds") {
9201 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9202 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9203 << "--yes-i-really-mean-it if you really wish to continue.";
9207 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
9208 ss
<< "the sortbitwise flag must be set before require_jewel_osds";
9211 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9212 ss
<< "require_osd_release is already >= jewel";
9215 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
)
9216 || sure
== "--yes-i-really-mean-it") {
9217 return prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_JEWEL
);
9219 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
9222 } else if (key
== "require_kraken_osds") {
9223 if (!osdmap
.get_num_up_osds() && sure
!= "--yes-i-really-mean-it") {
9224 ss
<< "Not advisable to continue since no OSDs are up. Pass "
9225 << "--yes-i-really-mean-it if you really wish to continue.";
9229 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
9230 ss
<< "the sortbitwise flag must be set before require_kraken_osds";
9233 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
9234 ss
<< "require_osd_release is already >= kraken";
9237 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
)
9238 || sure
== "--yes-i-really-mean-it") {
9239 bool r
= prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_KRAKEN
);
9240 // ensure JEWEL is also set
9241 pending_inc
.new_flags
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
9244 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
9248 ss
<< "unrecognized flag '" << key
<< "'";
9252 } else if (prefix
== "osd unset") {
9254 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
9256 return prepare_unset_flag(op
, CEPH_OSDMAP_FULL
);
9257 else if (key
== "pause")
9258 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
9259 else if (key
== "noup")
9260 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
9261 else if (key
== "nodown")
9262 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
9263 else if (key
== "noout")
9264 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
9265 else if (key
== "noin")
9266 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
9267 else if (key
== "nobackfill")
9268 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
9269 else if (key
== "norebalance")
9270 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
9271 else if (key
== "norecover")
9272 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
9273 else if (key
== "noscrub")
9274 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
9275 else if (key
== "nodeep-scrub")
9276 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
9277 else if (key
== "notieragent")
9278 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
9280 ss
<< "unrecognized flag '" << key
<< "'";
9284 } else if (prefix
== "osd require-osd-release") {
9286 cmd_getval(g_ceph_context
, cmdmap
, "release", release
);
9288 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
9289 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
9290 ss
<< "the sortbitwise flag must be set first";
9294 int rel
= ceph_release_from_name(release
.c_str());
9296 ss
<< "unrecognized release " << release
;
9300 if (rel
< CEPH_RELEASE_LUMINOUS
) {
9301 ss
<< "use this command only for luminous and later";
9305 if (rel
== osdmap
.require_osd_release
) {
9310 if (rel
== CEPH_RELEASE_LUMINOUS
) {
9311 if (!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
)) {
9312 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
9317 ss
<< "not supported for this release yet";
9321 if (rel
< osdmap
.require_osd_release
) {
9322 ss
<< "require_osd_release cannot be lowered once it has been set";
9326 pending_inc
.new_require_osd_release
= rel
;
9327 if (rel
>= CEPH_RELEASE_LUMINOUS
&&
9328 !osdmap
.test_flag(CEPH_OSDMAP_RECOVERY_DELETES
)) {
9329 return prepare_set_flag(op
, CEPH_OSDMAP_RECOVERY_DELETES
);
9332 } else if (prefix
== "osd cluster_snap") {
9333 // ** DISABLE THIS FOR NOW **
9334 ss
<< "cluster snapshot currently disabled (broken implementation)";
9335 // ** DISABLE THIS FOR NOW **
9337 } else if (prefix
== "osd down" ||
9338 prefix
== "osd out" ||
9339 prefix
== "osd in" ||
9340 prefix
== "osd rm") {
9344 bool verbose
= true;
9346 vector
<string
> idvec
;
9347 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
9348 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9353 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9354 if (prefix
== "osd in") {
9355 // touch out osds only
9356 osdmap
.get_out_osds(osds
);
9358 osdmap
.get_all_osds(osds
);
9361 verbose
= false; // so the output is less noisy.
9363 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9365 ss
<< "invalid osd id" << osd
;
9368 } else if (!osdmap
.exists(osd
)) {
9369 ss
<< "osd." << osd
<< " does not exist. ";
9376 for (auto &osd
: osds
) {
9377 if (prefix
== "osd down") {
9378 if (osdmap
.is_down(osd
)) {
9380 ss
<< "osd." << osd
<< " is already down. ";
9382 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
9383 ss
<< "marked down osd." << osd
<< ". ";
9386 } else if (prefix
== "osd out") {
9387 if (osdmap
.is_out(osd
)) {
9389 ss
<< "osd." << osd
<< " is already out. ";
9391 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
9392 if (osdmap
.osd_weight
[osd
]) {
9393 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
9394 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
9396 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
9398 ss
<< "marked out osd." << osd
<< ". ";
9399 std::ostringstream msg
;
9400 msg
<< "Client " << op
->get_session()->entity_name
9401 << " marked osd." << osd
<< " out";
9402 if (osdmap
.is_up(osd
)) {
9403 msg
<< ", while it was still marked up";
9405 auto period
= ceph_clock_now() - down_pending_out
[osd
];
9406 msg
<< ", after it was down for " << int(period
.sec())
9410 mon
->clog
->info() << msg
.str();
9413 } else if (prefix
== "osd in") {
9414 if (osdmap
.is_in(osd
)) {
9416 ss
<< "osd." << osd
<< " is already in. ";
9418 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
9419 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
9420 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
9421 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
9423 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
9425 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
9427 ss
<< "marked in osd." << osd
<< ". ";
9430 } else if (prefix
== "osd rm") {
9431 err
= prepare_command_osd_remove(osd
);
9433 if (err
== -EBUSY
) {
9436 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
9440 ss
<< ", osd." << osd
;
9442 ss
<< "removed osd." << osd
;
9451 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9452 get_last_committed() + 1));
9455 } else if (prefix
== "osd add-noup" ||
9456 prefix
== "osd add-nodown" ||
9457 prefix
== "osd add-noin" ||
9458 prefix
== "osd add-noout") {
9467 if (prefix
== "osd add-noup") {
9469 } else if (prefix
== "osd add-nodown") {
9471 } else if (prefix
== "osd add-noin") {
9480 vector
<string
> idvec
;
9481 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
9482 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9488 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9489 osdmap
.get_all_osds(osds
);
9492 // try traditional single osd way
9494 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9496 // ss has reason for failure
9497 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9505 for (auto &osd
: osds
) {
9507 if (!osdmap
.exists(osd
)) {
9508 ss
<< "osd." << osd
<< " does not exist. ";
9514 if (osdmap
.is_up(osd
)) {
9515 ss
<< "osd." << osd
<< " is already up. ";
9519 if (osdmap
.is_noup(osd
)) {
9520 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
))
9523 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
9530 if (osdmap
.is_down(osd
)) {
9531 ss
<< "osd." << osd
<< " is already down. ";
9535 if (osdmap
.is_nodown(osd
)) {
9536 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
))
9539 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
9546 if (osdmap
.is_in(osd
)) {
9547 ss
<< "osd." << osd
<< " is already in. ";
9551 if (osdmap
.is_noin(osd
)) {
9552 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
))
9555 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
9562 if (osdmap
.is_out(osd
)) {
9563 ss
<< "osd." << osd
<< " is already out. ";
9567 if (osdmap
.is_noout(osd
)) {
9568 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
))
9571 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
9578 assert(0 == "invalid option");
9585 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9586 get_last_committed() + 1));
9589 } else if (prefix
== "osd rm-noup" ||
9590 prefix
== "osd rm-nodown" ||
9591 prefix
== "osd rm-noin" ||
9592 prefix
== "osd rm-noout") {
9601 if (prefix
== "osd rm-noup") {
9603 } else if (prefix
== "osd rm-nodown") {
9605 } else if (prefix
== "osd rm-noin") {
9614 vector
<string
> idvec
;
9615 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
9617 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9623 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9625 // touch previous noup/nodown/noin/noout osds only
9628 osdmap
.get_noup_osds(&osds
);
9631 osdmap
.get_nodown_osds(&osds
);
9634 osdmap
.get_noin_osds(&osds
);
9637 osdmap
.get_noout_osds(&osds
);
9640 assert(0 == "invalid option");
9643 // cancel any pending noup/nodown/noin/noout requests too
9644 vector
<int> pending_state_osds
;
9645 (void) pending_inc
.get_pending_state_osds(&pending_state_osds
);
9646 for (auto &p
: pending_state_osds
) {
9650 if (!osdmap
.is_noup(p
) &&
9651 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOUP
)) {
9657 if (!osdmap
.is_nodown(p
) &&
9658 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NODOWN
)) {
9664 if (!osdmap
.is_noin(p
) &&
9665 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOIN
)) {
9671 if (!osdmap
.is_noout(p
) &&
9672 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOOUT
)) {
9678 assert(0 == "invalid option");
9684 // try traditional single osd way
9686 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9688 // ss has reason for failure
9689 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9694 osds
.push_back(osd
);
9697 for (auto &osd
: osds
) {
9699 if (!osdmap
.exists(osd
)) {
9700 ss
<< "osd." << osd
<< " does not exist. ";
9706 if (osdmap
.is_noup(osd
)) {
9707 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
9709 } else if (pending_inc
.pending_osd_state_clear(
9710 osd
, CEPH_OSD_NOUP
)) {
9716 if (osdmap
.is_nodown(osd
)) {
9717 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
9719 } else if (pending_inc
.pending_osd_state_clear(
9720 osd
, CEPH_OSD_NODOWN
)) {
9726 if (osdmap
.is_noin(osd
)) {
9727 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
9729 } else if (pending_inc
.pending_osd_state_clear(
9730 osd
, CEPH_OSD_NOIN
)) {
9736 if (osdmap
.is_noout(osd
)) {
9737 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
9739 } else if (pending_inc
.pending_osd_state_clear(
9740 osd
, CEPH_OSD_NOOUT
)) {
9746 assert(0 == "invalid option");
9753 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9754 get_last_committed() + 1));
9757 } else if (prefix
== "osd pg-temp") {
9759 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
9760 ss
<< "unable to parse 'pgid' value '"
9761 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
9766 if (!pgid
.parse(pgidstr
.c_str())) {
9767 ss
<< "invalid pgid '" << pgidstr
<< "'";
9771 if (!osdmap
.pg_exists(pgid
)) {
9772 ss
<< "pg " << pgid
<< " does not exist";
9776 if (pending_inc
.new_pg_temp
.count(pgid
)) {
9777 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
9778 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9782 vector
<int64_t> id_vec
;
9783 vector
<int32_t> new_pg_temp
;
9784 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9785 ss
<< "unable to parse 'id' value(s) '"
9786 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9790 for (auto osd
: id_vec
) {
9791 if (!osdmap
.exists(osd
)) {
9792 ss
<< "osd." << osd
<< " does not exist";
9796 new_pg_temp
.push_back(osd
);
9799 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
9800 if ((int)new_pg_temp
.size() < pool_min_size
) {
9801 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
9802 << pool_min_size
<< ")";
9807 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
9808 if ((int)new_pg_temp
.size() > pool_size
) {
9809 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
9810 << pool_size
<< ")";
9815 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
9816 new_pg_temp
.begin(), new_pg_temp
.end());
9817 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
9819 } else if (prefix
== "osd primary-temp") {
9821 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
9822 ss
<< "unable to parse 'pgid' value '"
9823 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
9828 if (!pgid
.parse(pgidstr
.c_str())) {
9829 ss
<< "invalid pgid '" << pgidstr
<< "'";
9833 if (!osdmap
.pg_exists(pgid
)) {
9834 ss
<< "pg " << pgid
<< " does not exist";
9840 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
9841 ss
<< "unable to parse 'id' value '"
9842 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9846 if (osd
!= -1 && !osdmap
.exists(osd
)) {
9847 ss
<< "osd." << osd
<< " does not exist";
9852 if (osdmap
.require_min_compat_client
> 0 &&
9853 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
9854 ss
<< "require_min_compat_client "
9855 << ceph_release_name(osdmap
.require_min_compat_client
)
9856 << " < firefly, which is required for primary-temp";
9859 } else if (!g_conf
->mon_osd_allow_primary_temp
) {
9860 ss
<< "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
9865 pending_inc
.new_primary_temp
[pgid
] = osd
;
9866 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
9868 } else if (prefix
== "osd pg-upmap" ||
9869 prefix
== "osd rm-pg-upmap" ||
9870 prefix
== "osd pg-upmap-items" ||
9871 prefix
== "osd rm-pg-upmap-items") {
9872 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
9873 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9874 << "luminous' before using the new interface";
9878 if (osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
9879 ss
<< "min_compat_client "
9880 << ceph_release_name(osdmap
.require_min_compat_client
)
9881 << " < luminous, which is required for pg-upmap. "
9882 << "Try 'ceph osd set-require-min-compat-client luminous' "
9883 << "before using the new interface";
9887 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
9893 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
9894 ss
<< "unable to parse 'pgid' value '"
9895 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
9900 if (!pgid
.parse(pgidstr
.c_str())) {
9901 ss
<< "invalid pgid '" << pgidstr
<< "'";
9905 if (!osdmap
.pg_exists(pgid
)) {
9906 ss
<< "pg " << pgid
<< " does not exist";
9910 if (pending_inc
.old_pools
.count(pgid
.pool())) {
9911 ss
<< "pool of " << pgid
<< " is pending removal";
9914 wait_for_finished_proposal(op
,
9915 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
9923 OP_RM_PG_UPMAP_ITEMS
,
9926 if (prefix
== "osd pg-upmap") {
9927 option
= OP_PG_UPMAP
;
9928 } else if (prefix
== "osd rm-pg-upmap") {
9929 option
= OP_RM_PG_UPMAP
;
9930 } else if (prefix
== "osd pg-upmap-items") {
9931 option
= OP_PG_UPMAP_ITEMS
;
9933 option
= OP_RM_PG_UPMAP_ITEMS
;
9936 // check pending upmap changes
9938 case OP_PG_UPMAP
: // fall through
9939 case OP_RM_PG_UPMAP
:
9940 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
9941 pending_inc
.old_pg_upmap
.count(pgid
)) {
9942 dout(10) << __func__
<< " waiting for pending update on "
9944 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9949 case OP_PG_UPMAP_ITEMS
: // fall through
9950 case OP_RM_PG_UPMAP_ITEMS
:
9951 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
9952 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
9953 dout(10) << __func__
<< " waiting for pending update on "
9955 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9961 assert(0 == "invalid option");
9967 vector
<int64_t> id_vec
;
9968 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9969 ss
<< "unable to parse 'id' value(s) '"
9970 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9975 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
9976 if ((int)id_vec
.size() < pool_min_size
) {
9977 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
9978 << pool_min_size
<< ")";
9983 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
9984 if ((int)id_vec
.size() > pool_size
) {
9985 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
9986 << pool_size
<< ")";
9991 vector
<int32_t> new_pg_upmap
;
9992 for (auto osd
: id_vec
) {
9993 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
9994 ss
<< "osd." << osd
<< " does not exist";
9998 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
9999 if (it
!= new_pg_upmap
.end()) {
10000 ss
<< "osd." << osd
<< " already exists, ";
10003 new_pg_upmap
.push_back(osd
);
10006 if (new_pg_upmap
.empty()) {
10007 ss
<< "no valid upmap items(pairs) is specified";
10012 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
10013 new_pg_upmap
.begin(), new_pg_upmap
.end());
10014 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
10018 case OP_RM_PG_UPMAP
:
10020 pending_inc
.old_pg_upmap
.insert(pgid
);
10021 ss
<< "clear " << pgid
<< " pg_upmap mapping";
10025 case OP_PG_UPMAP_ITEMS
:
10027 vector
<int64_t> id_vec
;
10028 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
10029 ss
<< "unable to parse 'id' value(s) '"
10030 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10035 if (id_vec
.size() % 2) {
10036 ss
<< "you must specify pairs of osd ids to be remapped";
10041 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
10042 if ((int)(id_vec
.size() / 2) > pool_size
) {
10043 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
10044 << pool_size
<< ")";
10049 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
10050 ostringstream items
;
10052 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
10056 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
10059 if (!osdmap
.exists(from
)) {
10060 ss
<< "osd." << from
<< " does not exist";
10064 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
10065 ss
<< "osd." << to
<< " does not exist";
10069 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
10070 auto it
= std::find(new_pg_upmap_items
.begin(),
10071 new_pg_upmap_items
.end(), entry
);
10072 if (it
!= new_pg_upmap_items
.end()) {
10073 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
10076 new_pg_upmap_items
.push_back(entry
);
10077 items
<< from
<< "->" << to
<< ",";
10079 string
out(items
.str());
10080 out
.resize(out
.size() - 1); // drop last ','
10083 if (new_pg_upmap_items
.empty()) {
10084 ss
<< "no valid upmap items(pairs) is specified";
10089 pending_inc
.new_pg_upmap_items
[pgid
] =
10090 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
10091 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
10092 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
10096 case OP_RM_PG_UPMAP_ITEMS
:
10098 pending_inc
.old_pg_upmap_items
.insert(pgid
);
10099 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
10104 assert(0 == "invalid option");
10108 } else if (prefix
== "osd primary-affinity") {
10110 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
10111 ss
<< "invalid osd id value '"
10112 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10117 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
10118 ss
<< "unable to parse 'weight' value '"
10119 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
10123 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
10125 ss
<< "weight must be >= 0";
10129 if (osdmap
.require_min_compat_client
> 0 &&
10130 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
10131 ss
<< "require_min_compat_client "
10132 << ceph_release_name(osdmap
.require_min_compat_client
)
10133 << " < firefly, which is required for primary-affinity";
10136 } else if (!g_conf
->mon_osd_allow_primary_affinity
) {
10137 ss
<< "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
10141 err
= check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY
, ss
);
10142 if (err
== -EAGAIN
)
10146 if (osdmap
.exists(id
)) {
10147 pending_inc
.new_primary_affinity
[id
] = ww
;
10148 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
10150 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10151 get_last_committed() + 1));
10154 ss
<< "osd." << id
<< " does not exist";
10158 } else if (prefix
== "osd reweight") {
10160 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
10161 ss
<< "unable to parse osd id value '"
10162 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10167 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
10168 ss
<< "unable to parse weight value '"
10169 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
10173 long ww
= (int)((double)CEPH_OSD_IN
*w
);
10175 ss
<< "weight must be >= 0";
10179 if (osdmap
.exists(id
)) {
10180 pending_inc
.new_weight
[id
] = ww
;
10181 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
10183 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10184 get_last_committed() + 1));
10187 ss
<< "osd." << id
<< " does not exist";
10191 } else if (prefix
== "osd reweightn") {
10192 map
<int32_t, uint32_t> weights
;
10193 err
= parse_reweights(g_ceph_context
, cmdmap
, osdmap
, &weights
);
10195 ss
<< "unable to parse 'weights' value '"
10196 << cmd_vartype_stringify(cmdmap
["weights"]) << "'";
10199 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
10200 wait_for_finished_proposal(
10202 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
10204 } else if (prefix
== "osd lost") {
10206 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
10207 ss
<< "unable to parse osd id value '"
10208 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
10213 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) || sure
!= "--yes-i-really-mean-it") {
10214 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
10215 "--yes-i-really-mean-it if you really do.";
10218 } else if (!osdmap
.exists(id
)) {
10219 ss
<< "osd." << id
<< " does not exist";
10222 } else if (!osdmap
.is_down(id
)) {
10223 ss
<< "osd." << id
<< " is not down";
10227 epoch_t e
= osdmap
.get_info(id
).down_at
;
10228 pending_inc
.new_lost
[id
] = e
;
10229 ss
<< "marked osd lost in epoch " << e
;
10231 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10232 get_last_committed() + 1));
10236 } else if (prefix
== "osd destroy" || prefix
== "osd purge") {
10237 /* Destroying an OSD means that we don't expect to further make use of
10238 * the OSDs data (which may even become unreadable after this operation),
10239 * and that we are okay with scrubbing all its cephx keys and config-key
10240 * data (which may include lockbox keys, thus rendering the osd's data
10243 * The OSD will not be removed. Instead, we will mark it as destroyed,
10244 * such that a subsequent call to `create` will not reuse the osd id.
10245 * This will play into being able to recreate the OSD, at the same
10246 * crush location, with minimal data movement.
10249 // make sure authmon is writeable.
10250 if (!mon
->authmon()->is_writeable()) {
10251 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
10252 << "osd destroy" << dendl
;
10253 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
10258 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
10259 ss
<< "unable to parse osd id value '"
10260 << cmd_vartype_stringify(cmdmap
["id"]) << "";
10265 bool is_destroy
= (prefix
== "osd destroy");
10267 assert("osd purge" == prefix
);
10271 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) ||
10272 sure
!= "--yes-i-really-mean-it") {
10273 ss
<< "Are you SURE? This will mean real, permanent data loss, as well "
10274 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
10278 } else if (!osdmap
.exists(id
)) {
10279 ss
<< "osd." << id
<< " does not exist";
10280 err
= 0; // idempotent
10282 } else if (osdmap
.is_up(id
)) {
10283 ss
<< "osd." << id
<< " is not `down`.";
10286 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
10287 ss
<< "destroyed osd." << id
;
10292 bool goto_reply
= false;
10296 err
= prepare_command_osd_destroy(id
, ss
);
10297 // we checked above that it should exist.
10298 assert(err
!= -ENOENT
);
10300 err
= prepare_command_osd_purge(id
, ss
);
10301 if (err
== -ENOENT
) {
10303 ss
<< "osd." << id
<< " does not exist.";
10309 if (err
< 0 || goto_reply
) {
10314 ss
<< "destroyed osd." << id
;
10316 ss
<< "purged osd." << id
;
10320 wait_for_finished_proposal(op
,
10321 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
10322 force_immediate_propose();
10325 } else if (prefix
== "osd new") {
10327 // make sure authmon is writeable.
10328 if (!mon
->authmon()->is_writeable()) {
10329 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
10330 << "osd new" << dendl
;
10331 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
10335 map
<string
,string
> param_map
;
10337 bufferlist bl
= m
->get_data();
10338 string param_json
= bl
.to_str();
10339 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
10341 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
10345 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
10348 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
10361 if (err
== EEXIST
) {
10362 // idempotent operation
10367 wait_for_finished_proposal(op
,
10368 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
10369 get_last_committed() + 1));
10370 force_immediate_propose();
10373 } else if (prefix
== "osd create") {
10375 // optional id provided?
10376 int64_t id
= -1, cmd_id
= -1;
10377 if (cmd_getval(g_ceph_context
, cmdmap
, "id", cmd_id
)) {
10379 ss
<< "invalid osd id value '" << cmd_id
<< "'";
10383 dout(10) << " osd create got id " << cmd_id
<< dendl
;
10388 if (cmd_getval(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
10389 if (!uuid
.parse(uuidstr
.c_str())) {
10390 ss
<< "invalid uuid value '" << uuidstr
<< "'";
10394 // we only care about the id if we also have the uuid, to
10395 // ensure the operation's idempotency.
10399 int32_t new_id
= -1;
10400 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
10402 if (err
== -EAGAIN
) {
10403 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10406 // a check has failed; reply to the user.
10409 } else if (err
== EEXIST
) {
10410 // this is an idempotent operation; we can go ahead and reply.
10412 f
->open_object_section("created_osd");
10413 f
->dump_int("osdid", new_id
);
10414 f
->close_section();
10424 string empty_device_class
;
10425 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
10428 f
->open_object_section("created_osd");
10429 f
->dump_int("osdid", new_id
);
10430 f
->close_section();
10436 wait_for_finished_proposal(op
,
10437 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
10438 get_last_committed() + 1));
10441 } else if (prefix
== "osd blacklist clear") {
10442 pending_inc
.new_blacklist
.clear();
10443 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
10444 osdmap
.get_blacklist(&blacklist
);
10445 for (const auto &entry
: blacklist
) {
10446 pending_inc
.old_blacklist
.push_back(entry
.first
);
10448 ss
<< " removed all blacklist entries";
10450 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10451 get_last_committed() + 1));
10453 } else if (prefix
== "osd blacklist") {
10455 cmd_getval(g_ceph_context
, cmdmap
, "addr", addrstr
);
10456 entity_addr_t addr
;
10457 if (!addr
.parse(addrstr
.c_str(), 0)) {
10458 ss
<< "unable to parse address " << addrstr
;
10463 string blacklistop
;
10464 cmd_getval(g_ceph_context
, cmdmap
, "blacklistop", blacklistop
);
10465 if (blacklistop
== "add") {
10466 utime_t expires
= ceph_clock_now();
10468 // default one hour
10469 cmd_getval(g_ceph_context
, cmdmap
, "expire", d
,
10470 g_conf
->mon_osd_blacklist_default_expire
);
10473 pending_inc
.new_blacklist
[addr
] = expires
;
10476 // cancel any pending un-blacklisting request too
10477 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
10478 pending_inc
.old_blacklist
.end(), addr
);
10479 if (it
!= pending_inc
.old_blacklist
.end()) {
10480 pending_inc
.old_blacklist
.erase(it
);
10484 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
10486 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10487 get_last_committed() + 1));
10489 } else if (blacklistop
== "rm") {
10490 if (osdmap
.is_blacklisted(addr
) ||
10491 pending_inc
.new_blacklist
.count(addr
)) {
10492 if (osdmap
.is_blacklisted(addr
))
10493 pending_inc
.old_blacklist
.push_back(addr
);
10495 pending_inc
.new_blacklist
.erase(addr
);
10496 ss
<< "un-blacklisting " << addr
;
10498 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10499 get_last_committed() + 1));
10502 ss
<< addr
<< " isn't blacklisted";
10507 } else if (prefix
== "osd pool mksnap") {
10509 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10510 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10512 ss
<< "unrecognized pool '" << poolstr
<< "'";
10517 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
10518 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
10519 if (p
->is_unmanaged_snaps_mode()) {
10520 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
10523 } else if (p
->snap_exists(snapname
.c_str())) {
10524 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
10527 } else if (p
->is_tier()) {
10528 ss
<< "pool " << poolstr
<< " is a cache tier";
10533 if (pending_inc
.new_pools
.count(pool
))
10534 pp
= &pending_inc
.new_pools
[pool
];
10536 pp
= &pending_inc
.new_pools
[pool
];
10539 if (pp
->snap_exists(snapname
.c_str())) {
10540 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
10542 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
10543 pp
->set_snap_epoch(pending_inc
.epoch
);
10544 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
10547 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10548 get_last_committed() + 1));
10550 } else if (prefix
== "osd pool rmsnap") {
10552 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10553 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10555 ss
<< "unrecognized pool '" << poolstr
<< "'";
10560 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
10561 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
10562 if (p
->is_unmanaged_snaps_mode()) {
10563 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
10566 } else if (!p
->snap_exists(snapname
.c_str())) {
10567 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
10572 if (pending_inc
.new_pools
.count(pool
))
10573 pp
= &pending_inc
.new_pools
[pool
];
10575 pp
= &pending_inc
.new_pools
[pool
];
10578 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
10580 pp
->remove_snap(sn
);
10581 pp
->set_snap_epoch(pending_inc
.epoch
);
10582 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
10584 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
10587 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10588 get_last_committed() + 1));
10590 } else if (prefix
== "osd pool create") {
10593 cmd_getval(g_ceph_context
, cmdmap
, "pg_num", pg_num
, int64_t(0));
10594 cmd_getval(g_ceph_context
, cmdmap
, "pgp_num", pgp_num
, pg_num
);
10596 string pool_type_str
;
10597 cmd_getval(g_ceph_context
, cmdmap
, "pool_type", pool_type_str
);
10598 if (pool_type_str
.empty())
10599 pool_type_str
= g_conf
->osd_pool_default_type
;
10602 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10603 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10604 if (pool_id
>= 0) {
10605 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10606 if (pool_type_str
!= p
->get_type_name()) {
10607 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
10610 ss
<< "pool '" << poolstr
<< "' already exists";
10617 if (pool_type_str
== "replicated") {
10618 pool_type
= pg_pool_t::TYPE_REPLICATED
;
10619 } else if (pool_type_str
== "erasure") {
10620 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
|
10621 CEPH_FEATURE_OSD_ERASURE_CODES
,
10623 if (err
== -EAGAIN
)
10627 pool_type
= pg_pool_t::TYPE_ERASURE
;
10629 ss
<< "unknown pool type '" << pool_type_str
<< "'";
10634 bool implicit_rule_creation
= false;
10635 int64_t expected_num_objects
= 0;
10637 cmd_getval(g_ceph_context
, cmdmap
, "rule", rule_name
);
10638 string erasure_code_profile
;
10639 cmd_getval(g_ceph_context
, cmdmap
, "erasure_code_profile", erasure_code_profile
);
10641 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
10642 if (erasure_code_profile
== "")
10643 erasure_code_profile
= "default";
10644 //handle the erasure code profile
10645 if (erasure_code_profile
== "default") {
10646 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
10647 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
10648 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
10652 map
<string
,string
> profile_map
;
10653 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
10658 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
10659 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
10663 if (rule_name
== "") {
10664 implicit_rule_creation
= true;
10665 if (erasure_code_profile
== "default") {
10666 rule_name
= "erasure-code";
10668 dout(1) << "implicitly use rule named after the pool: "
10669 << poolstr
<< dendl
;
10670 rule_name
= poolstr
;
10673 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects",
10674 expected_num_objects
, int64_t(0));
10676 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
10677 // and put expected_num_objects to rule field
10678 if (erasure_code_profile
!= "") { // cmd is from CLI
10679 if (rule_name
!= "") {
10681 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
10682 if (interr
.length()) {
10683 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
10688 rule_name
= erasure_code_profile
;
10689 } else { // cmd is well-formed
10690 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects",
10691 expected_num_objects
, int64_t(0));
10695 if (!implicit_rule_creation
&& rule_name
!= "") {
10697 err
= get_crush_rule(rule_name
, &rule
, &ss
);
10698 if (err
== -EAGAIN
) {
10699 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10706 if (expected_num_objects
< 0) {
10707 ss
<< "'expected_num_objects' must be non-negative";
10712 int64_t fast_read_param
;
10713 cmd_getval(g_ceph_context
, cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
10714 FastReadType fast_read
= FAST_READ_DEFAULT
;
10715 if (fast_read_param
== 0)
10716 fast_read
= FAST_READ_OFF
;
10717 else if (fast_read_param
> 0)
10718 fast_read
= FAST_READ_ON
;
10720 err
= prepare_new_pool(poolstr
, 0, // auid=0 for admin created pool
10721 -1, // default crush rule
10724 erasure_code_profile
, pool_type
,
10725 (uint64_t)expected_num_objects
,
10731 ss
<< "pool '" << poolstr
<< "' already exists";
10734 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10743 ss
<< "pool '" << poolstr
<< "' created";
10746 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10747 get_last_committed() + 1));
10750 } else if (prefix
== "osd pool delete" ||
10751 prefix
== "osd pool rm") {
10752 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10753 string poolstr
, poolstr2
, sure
;
10754 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10755 cmd_getval(g_ceph_context
, cmdmap
, "pool2", poolstr2
);
10756 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
10757 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10759 ss
<< "pool '" << poolstr
<< "' does not exist";
10764 bool force_no_fake
= sure
== "--yes-i-really-really-mean-it-not-faking";
10765 if (poolstr2
!= poolstr
||
10766 (sure
!= "--yes-i-really-really-mean-it" && !force_no_fake
)) {
10767 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10768 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10769 << "followed by --yes-i-really-really-mean-it.";
10773 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
10774 if (err
== -EAGAIN
) {
10775 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10781 } else if (prefix
== "osd pool rename") {
10782 string srcpoolstr
, destpoolstr
;
10783 cmd_getval(g_ceph_context
, cmdmap
, "srcpool", srcpoolstr
);
10784 cmd_getval(g_ceph_context
, cmdmap
, "destpool", destpoolstr
);
10785 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
10786 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
10788 if (pool_src
< 0) {
10789 if (pool_dst
>= 0) {
10790 // src pool doesn't exist, dst pool does exist: to ensure idempotency
10791 // of operations, assume this rename succeeded, as it is not changing
10792 // the current state. Make sure we output something understandable
10793 // for whoever is issuing the command, if they are paying attention,
10794 // in case it was not intentional; or to avoid a "wtf?" and a bug
10795 // report in case it was intentional, while expecting a failure.
10796 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
10797 << destpoolstr
<< "' does -- assuming successful rename";
10800 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
10804 } else if (pool_dst
>= 0) {
10805 // source pool exists and so does the destination pool
10806 ss
<< "pool '" << destpoolstr
<< "' already exists";
10811 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
10813 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
10815 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
10816 << cpp_strerror(ret
);
10819 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
10820 get_last_committed() + 1));
10823 } else if (prefix
== "osd pool set") {
10824 err
= prepare_command_pool_set(cmdmap
, ss
);
10825 if (err
== -EAGAIN
)
10831 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10832 get_last_committed() + 1));
10834 } else if (prefix
== "osd tier add") {
10835 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10836 if (err
== -EAGAIN
)
10841 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10842 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10844 ss
<< "unrecognized pool '" << poolstr
<< "'";
10848 string tierpoolstr
;
10849 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
10850 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
10851 if (tierpool_id
< 0) {
10852 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
10856 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10858 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
10861 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
10865 // make sure new tier is empty
10866 string force_nonempty
;
10867 cmd_getval(g_ceph_context
, cmdmap
, "force_nonempty", force_nonempty
);
10868 const pool_stat_t
*pstats
= mon
->pgservice
->get_pool_stat(tierpool_id
);
10869 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
10870 force_nonempty
!= "--force-nonempty") {
10871 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
10875 if (tp
->ec_pool()) {
10876 ss
<< "tier pool '" << tierpoolstr
10877 << "' is an ec pool, which cannot be a tier";
10881 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
10882 ((force_nonempty
!= "--force-nonempty") ||
10883 (!g_conf
->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
10884 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
10889 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10890 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
10891 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
10892 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10895 np
->tiers
.insert(tierpool_id
);
10896 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
10897 ntp
->tier_of
= pool_id
;
10898 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
10899 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10900 get_last_committed() + 1));
10902 } else if (prefix
== "osd tier remove" ||
10903 prefix
== "osd tier rm") {
10905 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10906 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10908 ss
<< "unrecognized pool '" << poolstr
<< "'";
10912 string tierpoolstr
;
10913 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
10914 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
10915 if (tierpool_id
< 0) {
10916 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
10920 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10922 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
10925 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
10929 if (p
->tiers
.count(tierpool_id
) == 0) {
10930 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
10934 if (tp
->tier_of
!= pool_id
) {
10935 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
10936 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
10937 // be scary about it; this is an inconsistency and bells must go off
10938 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
10942 if (p
->read_tier
== tierpool_id
) {
10943 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
10948 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10949 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
10950 if (np
->tiers
.count(tierpool_id
) == 0 ||
10951 ntp
->tier_of
!= pool_id
||
10952 np
->read_tier
== tierpool_id
) {
10953 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10956 np
->tiers
.erase(tierpool_id
);
10958 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
10959 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10960 get_last_committed() + 1));
10962 } else if (prefix
== "osd tier set-overlay") {
10963 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10964 if (err
== -EAGAIN
)
10969 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10970 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10972 ss
<< "unrecognized pool '" << poolstr
<< "'";
10976 string overlaypoolstr
;
10977 cmd_getval(g_ceph_context
, cmdmap
, "overlaypool", overlaypoolstr
);
10978 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
10979 if (overlaypool_id
< 0) {
10980 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
10984 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10986 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
10988 if (p
->tiers
.count(overlaypool_id
) == 0) {
10989 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
10993 if (p
->read_tier
== overlaypool_id
) {
10995 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
10998 if (p
->has_read_tier()) {
10999 ss
<< "pool '" << poolstr
<< "' has overlay '"
11000 << osdmap
.get_pool_name(p
->read_tier
)
11001 << "'; please remove-overlay first";
11007 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11008 np
->read_tier
= overlaypool_id
;
11009 np
->write_tier
= overlaypool_id
;
11010 np
->set_last_force_op_resend(pending_inc
.epoch
);
11011 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
11012 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
11013 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
11014 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
11015 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
11016 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11017 get_last_committed() + 1));
11019 } else if (prefix
== "osd tier remove-overlay" ||
11020 prefix
== "osd tier rm-overlay") {
11022 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
11023 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11025 ss
<< "unrecognized pool '" << poolstr
<< "'";
11029 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11031 if (!p
->has_read_tier()) {
11033 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
11037 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
11042 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11043 if (np
->has_read_tier()) {
11044 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
11045 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
11046 nop
->set_last_force_op_resend(pending_inc
.epoch
);
11048 if (np
->has_write_tier()) {
11049 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
11050 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
11051 nop
->set_last_force_op_resend(pending_inc
.epoch
);
11053 np
->clear_read_tier();
11054 np
->clear_write_tier();
11055 np
->set_last_force_op_resend(pending_inc
.epoch
);
11056 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
11057 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11058 get_last_committed() + 1));
11060 } else if (prefix
== "osd tier cache-mode") {
11061 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11062 if (err
== -EAGAIN
)
11067 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
11068 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11070 ss
<< "unrecognized pool '" << poolstr
<< "'";
11074 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11076 if (!p
->is_tier()) {
11077 ss
<< "pool '" << poolstr
<< "' is not a tier";
11082 cmd_getval(g_ceph_context
, cmdmap
, "mode", modestr
);
11083 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
11085 ss
<< "'" << modestr
<< "' is not a valid cache mode";
11091 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
11092 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11093 mode
!= pg_pool_t::CACHEMODE_NONE
&&
11094 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11095 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
11096 sure
!= "--yes-i-really-mean-it") {
11097 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
11098 << "corrupt your data. pass --yes-i-really-mean-it to force.";
11103 // pool already has this cache-mode set and there are no pending changes
11104 if (p
->cache_mode
== mode
&&
11105 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
11106 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
11107 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
11108 << " to " << pg_pool_t::get_cache_mode_name(mode
);
11113 /* Mode description:
11115 * none: No cache-mode defined
11116 * forward: Forward all reads and writes to base pool
11117 * writeback: Cache writes, promote reads from base pool
11118 * readonly: Forward writes to base pool
11119 * readforward: Writes are in writeback mode, Reads are in forward mode
11120 * proxy: Proxy all reads and writes to base pool
11121 * readproxy: Writes are in writeback mode, Reads are in proxy mode
11123 * Hence, these are the allowed transitions:
11126 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11127 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11128 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
11129 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
11130 * writeback -> readforward || readproxy || forward || proxy
11134 // We check if the transition is valid against the current pool mode, as
11135 // it is the only committed state thus far. We will blantly squash
11136 // whatever mode is on the pending state.
11138 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
11139 (mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11140 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11141 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11142 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
11143 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
11144 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
11145 << "' pool; only '"
11146 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD
)
11148 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
11150 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD
)
11152 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
11157 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
11158 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11159 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11160 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11161 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
11163 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
11164 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11165 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11166 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11167 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
11169 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
11170 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11171 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
11172 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11173 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
11175 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
11176 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11177 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
11178 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
11179 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
11181 const pool_stat_t
* pstats
=
11182 mon
->pgservice
->get_pool_stat(pool_id
);
11184 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
11185 ss
<< "unable to set cache-mode '"
11186 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
11187 << "': dirty objects found";
11193 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11194 np
->cache_mode
= mode
;
11195 // set this both when moving to and from cache_mode NONE. this is to
11196 // capture legacy pools that were set up before this flag existed.
11197 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
11198 ss
<< "set cache-mode for pool '" << poolstr
11199 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
11200 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
11201 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
11203 if (base_pool
->read_tier
== pool_id
||
11204 base_pool
->write_tier
== pool_id
)
11205 ss
<<" (WARNING: pool is still configured as read or write tier)";
11207 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11208 get_last_committed() + 1));
11210 } else if (prefix
== "osd tier add-cache") {
11211 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
11212 if (err
== -EAGAIN
)
11217 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
11218 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11220 ss
<< "unrecognized pool '" << poolstr
<< "'";
11224 string tierpoolstr
;
11225 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
11226 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
11227 if (tierpool_id
< 0) {
11228 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
11232 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
11234 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
11237 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
11242 if (!cmd_getval(g_ceph_context
, cmdmap
, "size", size
)) {
11243 ss
<< "unable to parse 'size' value '"
11244 << cmd_vartype_stringify(cmdmap
["size"]) << "'";
11248 // make sure new tier is empty
11249 const pool_stat_t
*pstats
=
11250 mon
->pgservice
->get_pool_stat(tierpool_id
);
11251 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
11252 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
11256 string modestr
= g_conf
->osd_tier_default_cache_mode
;
11257 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
11259 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
11263 HitSet::Params hsp
;
11264 if (g_conf
->osd_tier_default_cache_hit_set_type
== "bloom") {
11265 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
11266 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
11267 hsp
= HitSet::Params(bsp
);
11268 } else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_hash") {
11269 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
11271 else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_object") {
11272 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
11274 ss
<< "osd tier cache default hit set type '" <<
11275 g_conf
->osd_tier_default_cache_hit_set_type
<< "' is not a known type";
11280 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
11281 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
11282 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
11283 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11286 np
->tiers
.insert(tierpool_id
);
11287 np
->read_tier
= np
->write_tier
= tierpool_id
;
11288 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
11289 np
->set_last_force_op_resend(pending_inc
.epoch
);
11290 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
11291 ntp
->tier_of
= pool_id
;
11292 ntp
->cache_mode
= mode
;
11293 ntp
->hit_set_count
= g_conf
->osd_tier_default_cache_hit_set_count
;
11294 ntp
->hit_set_period
= g_conf
->osd_tier_default_cache_hit_set_period
;
11295 ntp
->min_read_recency_for_promote
= g_conf
->osd_tier_default_cache_min_read_recency_for_promote
;
11296 ntp
->min_write_recency_for_promote
= g_conf
->osd_tier_default_cache_min_write_recency_for_promote
;
11297 ntp
->hit_set_grade_decay_rate
= g_conf
->osd_tier_default_cache_hit_set_grade_decay_rate
;
11298 ntp
->hit_set_search_last_n
= g_conf
->osd_tier_default_cache_hit_set_search_last_n
;
11299 ntp
->hit_set_params
= hsp
;
11300 ntp
->target_max_bytes
= size
;
11301 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
11302 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
11303 get_last_committed() + 1));
11305 } else if (prefix
== "osd pool set-quota") {
11307 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
11308 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
11310 ss
<< "unrecognized pool '" << poolstr
<< "'";
11316 cmd_getval(g_ceph_context
, cmdmap
, "field", field
);
11317 if (field
!= "max_objects" && field
!= "max_bytes") {
11318 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
11323 // val could contain unit designations, so we treat as a string
11325 cmd_getval(g_ceph_context
, cmdmap
, "val", val
);
11327 int64_t value
= unit_to_bytesize(val
, &tss
);
11329 ss
<< "error parsing value '" << value
<< "': " << tss
.str();
11334 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
11335 if (field
== "max_objects") {
11336 pi
->quota_max_objects
= value
;
11337 } else if (field
== "max_bytes") {
11338 pi
->quota_max_bytes
= value
;
11340 assert(0 == "unrecognized option");
11342 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
11344 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11345 get_last_committed() + 1));
11347 } else if (prefix
== "osd pool application enable" ||
11348 prefix
== "osd pool application disable" ||
11349 prefix
== "osd pool application set" ||
11350 prefix
== "osd pool application rm") {
11351 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
11352 if (err
== -EAGAIN
)
11358 wait_for_finished_proposal(
11359 op
, new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
11361 } else if (prefix
== "osd reweight-by-pg" ||
11362 prefix
== "osd reweight-by-utilization" ||
11363 prefix
== "osd test-reweight-by-pg" ||
11364 prefix
== "osd test-reweight-by-utilization") {
11366 prefix
== "osd reweight-by-pg" || prefix
== "osd test-reweight-by-pg";
11368 prefix
== "osd test-reweight-by-pg" ||
11369 prefix
== "osd test-reweight-by-utilization";
11371 cmd_getval(g_ceph_context
, cmdmap
, "oload", oload
, int64_t(120));
11372 set
<int64_t> pools
;
11373 vector
<string
> poolnamevec
;
11374 cmd_getval(g_ceph_context
, cmdmap
, "pools", poolnamevec
);
11375 for (unsigned j
= 0; j
< poolnamevec
.size(); j
++) {
11376 int64_t pool
= osdmap
.lookup_pg_pool_name(poolnamevec
[j
]);
11378 ss
<< "pool '" << poolnamevec
[j
] << "' does not exist";
11382 pools
.insert(pool
);
11384 double max_change
= g_conf
->mon_reweight_max_change
;
11385 cmd_getval(g_ceph_context
, cmdmap
, "max_change", max_change
);
11386 if (max_change
<= 0.0) {
11387 ss
<< "max_change " << max_change
<< " must be positive";
11391 int64_t max_osds
= g_conf
->mon_reweight_max_osds
;
11392 cmd_getval(g_ceph_context
, cmdmap
, "max_osds", max_osds
);
11393 if (max_osds
<= 0) {
11394 ss
<< "max_osds " << max_osds
<< " must be positive";
11398 string no_increasing
;
11399 cmd_getval(g_ceph_context
, cmdmap
, "no_increasing", no_increasing
);
11401 mempool::osdmap::map
<int32_t, uint32_t> new_weights
;
11402 err
= mon
->pgservice
->reweight_by_utilization(osdmap
,
11407 pools
.empty() ? NULL
: &pools
,
11408 no_increasing
== "--no-increasing",
11410 &ss
, &out_str
, f
.get());
11412 dout(10) << "reweight::by_utilization: finished with " << out_str
<< dendl
;
11417 rdata
.append(out_str
);
11419 ss
<< "FAILED reweight-by-pg";
11420 } else if (err
== 0 || dry_run
) {
11423 ss
<< "SUCCESSFUL reweight-by-pg";
11424 pending_inc
.new_weight
= std::move(new_weights
);
11425 wait_for_finished_proposal(
11427 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
11430 } else if (prefix
== "osd force-create-pg") {
11433 cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
11434 if (!pgid
.parse(pgidstr
.c_str())) {
11435 ss
<< "invalid pgid '" << pgidstr
<< "'";
11439 if (!osdmap
.pg_exists(pgid
)) {
11440 ss
<< "pg " << pgid
<< " should not exist";
11446 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
11447 auto emplaced
= creating_pgs
.pgs
.emplace(pgid
,
11448 make_pair(osdmap
.get_epoch(),
11449 ceph_clock_now()));
11450 creating_now
= emplaced
.second
;
11452 if (creating_now
) {
11453 ss
<< "pg " << pgidstr
<< " now creating, ok";
11457 ss
<< "pg " << pgid
<< " already creating";
11467 if (err
< 0 && rs
.length() == 0)
11468 rs
= cpp_strerror(err
);
11469 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
11474 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11475 get_last_committed() + 1));
11479 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11483 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
11485 op
->mark_osdmon_event(__func__
);
11486 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11488 if (m
->fsid
!= mon
->monmap
->fsid
) {
11489 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
11490 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
11491 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11495 if (m
->op
== POOL_OP_CREATE
)
11496 return preprocess_pool_op_create(op
);
11498 if (!osdmap
.get_pg_pool(m
->pool
)) {
11499 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
11500 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11504 // check if the snap and snapname exist
11505 bool snap_exists
= false;
11506 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
11507 if (p
->snap_exists(m
->name
.c_str()))
11508 snap_exists
= true;
11511 case POOL_OP_CREATE_SNAP
:
11512 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
11513 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11517 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11521 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11522 if (p
->is_pool_snaps_mode()) {
11523 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11527 case POOL_OP_DELETE_SNAP
:
11528 if (p
->is_unmanaged_snaps_mode()) {
11529 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11532 if (!snap_exists
) {
11533 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11537 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11538 if (p
->is_pool_snaps_mode()) {
11539 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11542 if (p
->is_removed_snap(m
->snapid
)) {
11543 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11547 case POOL_OP_DELETE
:
11548 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
11549 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11553 case POOL_OP_AUID_CHANGE
:
11563 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
11565 op
->mark_osdmon_event(__func__
);
11566 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11567 MonSession
*session
= m
->get_session();
11569 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
11572 if (!session
->is_capable("osd", MON_CAP_W
)) {
11573 dout(5) << "attempt to create new pool without sufficient auid privileges!"
11574 << "message: " << *m
<< std::endl
11575 << "caps: " << session
->caps
<< dendl
;
11576 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
11580 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
11582 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11589 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
11591 op
->mark_osdmon_event(__func__
);
11592 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11593 dout(10) << "prepare_pool_op " << *m
<< dendl
;
11594 if (m
->op
== POOL_OP_CREATE
) {
11595 return prepare_pool_op_create(op
);
11596 } else if (m
->op
== POOL_OP_DELETE
) {
11597 return prepare_pool_op_delete(op
);
11601 bool changed
= false;
11603 if (!osdmap
.have_pg_pool(m
->pool
)) {
11604 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
11608 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
11611 case POOL_OP_CREATE_SNAP
:
11612 if (pool
->is_tier()) {
11614 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
11616 } // else, fall through
11617 case POOL_OP_DELETE_SNAP
:
11618 if (!pool
->is_unmanaged_snaps_mode()) {
11619 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
11620 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
11621 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
11629 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
11632 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11633 // we won't allow removal of an unmanaged snapshot from a pool
11634 // not in unmanaged snaps mode.
11635 if (!pool
->is_unmanaged_snaps_mode()) {
11636 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
11640 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11641 // but we will allow creating an unmanaged snapshot on any pool
11642 // as long as it is not in 'pool' snaps mode.
11643 if (pool
->is_pool_snaps_mode()) {
11644 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11649 // projected pool info
11651 if (pending_inc
.new_pools
.count(m
->pool
))
11652 pp
= pending_inc
.new_pools
[m
->pool
];
11654 pp
= *osdmap
.get_pg_pool(m
->pool
);
11656 bufferlist reply_data
;
11658 // pool snaps vs unmanaged snaps are mutually exclusive
11660 case POOL_OP_CREATE_SNAP
:
11661 case POOL_OP_DELETE_SNAP
:
11662 if (pp
.is_unmanaged_snaps_mode()) {
11668 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11669 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11670 if (pp
.is_pool_snaps_mode()) {
11677 case POOL_OP_CREATE_SNAP
:
11678 if (!pp
.snap_exists(m
->name
.c_str())) {
11679 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
11680 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
<< " seq " << pp
.get_snap_epoch() << dendl
;
11685 case POOL_OP_DELETE_SNAP
:
11687 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
11695 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11698 pp
.add_unmanaged_snap(snapid
);
11699 ::encode(snapid
, reply_data
);
11704 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11705 if (!pp
.is_removed_snap(m
->snapid
)) {
11706 pp
.remove_unmanaged_snap(m
->snapid
);
11711 case POOL_OP_AUID_CHANGE
:
11712 if (pp
.auid
!= m
->auid
) {
11724 pp
.set_snap_epoch(pending_inc
.epoch
);
11725 pending_inc
.new_pools
[m
->pool
] = pp
;
11729 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
11733 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
11735 op
->mark_osdmon_event(__func__
);
11736 int err
= prepare_new_pool(op
);
11737 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
11741 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
11744 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
11746 // If the Pool is in use by CephFS, refuse to delete it
11747 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending();
11748 if (pending_fsmap
.pool_in_use(pool_id
)) {
11749 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
11753 if (pool
.tier_of
>= 0) {
11754 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
11755 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
11758 if (!pool
.tiers
.empty()) {
11759 *ss
<< "pool '" << poolstr
<< "' has tiers";
11760 for(auto tier
: pool
.tiers
) {
11761 *ss
<< " " << osdmap
.get_pool_name(tier
);
11766 if (!g_conf
->mon_allow_pool_delete
) {
11767 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
11771 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
11772 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
11776 *ss
<< "pool '" << poolstr
<< "' removed";
11781 * Check if it is safe to add a tier to a base pool
11784 * True if the operation should proceed, false if we should abort here
11785 * (abort doesn't necessarily mean error, could be idempotency)
11787 bool OSDMonitor::_check_become_tier(
11788 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
11789 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
11793 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
11794 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
11796 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
11797 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
11798 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
11803 if (base_pool
->tiers
.count(tier_pool_id
)) {
11804 assert(tier_pool
->tier_of
== base_pool_id
);
11806 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
11807 << base_pool_name
<< "'";
11811 if (base_pool
->is_tier()) {
11812 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
11813 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
11814 << "multiple tiers are not yet supported.";
11819 if (tier_pool
->has_tiers()) {
11820 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
11821 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
11822 it
!= tier_pool
->tiers
.end(); ++it
)
11823 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
11824 *ss
<< " multiple tiers are not yet supported.";
11829 if (tier_pool
->is_tier()) {
11830 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
11831 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
11842 * Check if it is safe to remove a tier from this base pool
11845 * True if the operation should proceed, false if we should abort here
11846 * (abort doesn't necessarily mean error, could be idempotency)
11848 bool OSDMonitor::_check_remove_tier(
11849 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
11850 const pg_pool_t
*tier_pool
,
11851 int *err
, ostream
*ss
) const
11853 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
11855 // Apply CephFS-specific checks
11856 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
11857 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
11858 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
11859 // If the underlying pool is erasure coded and does not allow EC
11860 // overwrites, we can't permit the removal of the replicated tier that
11861 // CephFS relies on to access it
11862 *ss
<< "pool '" << base_pool_name
<<
11863 "' does not allow EC overwrites and is in use by CephFS"
11869 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
11870 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
11871 "tier is still in use as a writeback cache. Change the cache "
11872 "mode and flush the cache before removing it";
11882 int OSDMonitor::_prepare_remove_pool(
11883 int64_t pool
, ostream
*ss
, bool no_fake
)
11885 dout(10) << __func__
<< " " << pool
<< dendl
;
11886 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
11887 int r
= _check_remove_pool(pool
, *p
, ss
);
11891 auto new_pool
= pending_inc
.new_pools
.find(pool
);
11892 if (new_pool
!= pending_inc
.new_pools
.end()) {
11893 // if there is a problem with the pending info, wait and retry
11895 const auto& p
= new_pool
->second
;
11896 int r
= _check_remove_pool(pool
, p
, ss
);
11901 if (pending_inc
.old_pools
.count(pool
)) {
11902 dout(10) << __func__
<< " " << pool
<< " already pending removal"
11907 if (g_conf
->mon_fake_pool_delete
&& !no_fake
) {
11908 string old_name
= osdmap
.get_pool_name(pool
);
11909 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
11910 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
11911 << old_name
<< " -> " << new_name
<< dendl
;
11912 pending_inc
.new_pool_names
[pool
] = new_name
;
11917 pending_inc
.old_pools
.insert(pool
);
11919 // remove any pg_temp mappings for this pool
11920 for (auto p
= osdmap
.pg_temp
->begin();
11921 p
!= osdmap
.pg_temp
->end();
11923 if (p
->first
.pool() == (uint64_t)pool
) {
11924 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
11925 << p
->first
<< dendl
;
11926 pending_inc
.new_pg_temp
[p
->first
].clear();
11929 // remove any primary_temp mappings for this pool
11930 for (auto p
= osdmap
.primary_temp
->begin();
11931 p
!= osdmap
.primary_temp
->end();
11933 if (p
->first
.pool() == (uint64_t)pool
) {
11934 dout(10) << __func__
<< " " << pool
11935 << " removing obsolete primary_temp" << p
->first
<< dendl
;
11936 pending_inc
.new_primary_temp
[p
->first
] = -1;
11939 // remove any pg_upmap mappings for this pool
11940 for (auto& p
: osdmap
.pg_upmap
) {
11941 if (p
.first
.pool() == (uint64_t)pool
) {
11942 dout(10) << __func__
<< " " << pool
11943 << " removing obsolete pg_upmap "
11944 << p
.first
<< dendl
;
11945 pending_inc
.old_pg_upmap
.insert(p
.first
);
11948 // remove any pending pg_upmap mappings for this pool
11950 auto it
= pending_inc
.new_pg_upmap
.begin();
11951 while (it
!= pending_inc
.new_pg_upmap
.end()) {
11952 if (it
->first
.pool() == (uint64_t)pool
) {
11953 dout(10) << __func__
<< " " << pool
11954 << " removing pending pg_upmap "
11955 << it
->first
<< dendl
;
11956 it
= pending_inc
.new_pg_upmap
.erase(it
);
11962 // remove any pg_upmap_items mappings for this pool
11963 for (auto& p
: osdmap
.pg_upmap_items
) {
11964 if (p
.first
.pool() == (uint64_t)pool
) {
11965 dout(10) << __func__
<< " " << pool
11966 << " removing obsolete pg_upmap_items " << p
.first
11968 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
11971 // remove any pending pg_upmap mappings for this pool
11973 auto it
= pending_inc
.new_pg_upmap_items
.begin();
11974 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
11975 if (it
->first
.pool() == (uint64_t)pool
) {
11976 dout(10) << __func__
<< " " << pool
11977 << " removing pending pg_upmap_items "
11978 << it
->first
<< dendl
;
11979 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
11986 // remove any choose_args for this pool
11987 CrushWrapper newcrush
;
11988 _get_pending_crush(newcrush
);
11989 if (newcrush
.have_choose_args(pool
)) {
11990 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
11991 newcrush
.rm_choose_args(pool
);
11992 pending_inc
.crush
.clear();
11993 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
11998 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
12000 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
12001 if (pending_inc
.old_pools
.count(pool
)) {
12002 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
12005 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
12006 p
!= pending_inc
.new_pool_names
.end();
12008 if (p
->second
== newname
&& p
->first
!= pool
) {
12013 pending_inc
.new_pool_names
[pool
] = newname
;
12017 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
12019 op
->mark_osdmon_event(__func__
);
12020 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12022 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
12023 if (ret
== -EAGAIN
) {
12024 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12028 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
12029 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
12030 pending_inc
.epoch
));
12034 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
12035 int ret
, epoch_t epoch
, bufferlist
*blp
)
12037 op
->mark_osdmon_event(__func__
);
12038 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
12039 dout(20) << "_pool_op_reply " << ret
<< dendl
;
12040 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
12041 ret
, epoch
, get_last_committed(), blp
);
12042 mon
->send_reply(op
, reply
);