// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
#include "OSDMonitor.h"
#include "MDSMonitor.h"
#include "PGMonitor.h"
#include "MonitorDBStore.h"

#include "crush/CrushWrapper.h"
#include "crush/CrushTester.h"
#include "crush/CrushTreeDumper.h"

#include "messages/MOSDBeacon.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDAlive.h"
#include "messages/MPoolOp.h"
#include "messages/MPoolOpReply.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MMonCommand.h"
#include "messages/MRemoveSnaps.h"
#include "messages/MOSDScrub.h"
#include "messages/MRoute.h"

#include "common/TextTable.h"
#include "common/Timer.h"
#include "common/ceph_argparse.h"
#include "common/perf_counters.h"
#include "common/strtol.h"
#include "common/config.h"
#include "common/errno.h"
#include "common/cmdparse.h"
#include "common/Checksummer.h"

#include "erasure-code/ErasureCodePlugin.h"
#include "compressor/Compressor.h"

#include "include/compat.h"
#include "include/assert.h"
#include "include/stringify.h"
#include "include/util.h"
#include "include/str_list.h"
#include "include/str_map.h"

#include "json_spirit/json_spirit_reader.h"
// Logging subsystem for this translation unit, and the MonitorDBStore key
// prefix under which the creating-pgs state is persisted (see
// get_store_prefixes() and update_from_paxos()).
#define dout_subsys ceph_subsys_mon
#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
78 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
80 if (epoch_by_pg
.size() <= ps
) {
81 epoch_by_pg
.resize(ps
+ 1, 0);
83 const auto old_lec
= epoch_by_pg
[ps
];
84 if (old_lec
>= last_epoch_clean
) {
88 epoch_by_pg
[ps
] = last_epoch_clean
;
89 if (last_epoch_clean
< floor
) {
90 floor
= last_epoch_clean
;
91 } else if (last_epoch_clean
> floor
) {
92 if (old_lec
== floor
) {
93 // probably should increase floor?
94 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
95 std::end(epoch_by_pg
));
99 if (ps
!= next_missing
) {
102 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
103 if (epoch_by_pg
[next_missing
] == 0) {
109 void LastEpochClean::remove_pool(uint64_t pool
)
111 report_by_pool
.erase(pool
);
114 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
116 auto& lec
= report_by_pool
[pg
.pool()];
117 return lec
.report(pg
.ps(), last_epoch_clean
);
120 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
122 auto floor
= latest
.get_epoch();
123 for (auto& pool
: latest
.get_pools()) {
124 auto reported
= report_by_pool
.find(pool
.first
);
125 if (reported
== report_by_pool
.end()) {
128 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
131 if (reported
->second
.floor
< floor
) {
132 floor
= reported
->second
.floor
;
139 struct C_UpdateCreatingPGs
: public Context
{
143 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
144 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
145 void finish(int r
) override
{
147 utime_t end
= ceph_clock_now();
148 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
149 << (end
- start
) << " seconds" << dendl
;
150 osdmon
->update_creating_pgs();
151 osdmon
->check_pg_creates_subs();
157 #define dout_prefix _prefix(_dout, mon, osdmap)
158 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
159 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
160 << "(" << mon
->get_state_name()
161 << ").osd e" << osdmap
.get_epoch() << " ";
164 OSDMonitor::OSDMonitor(
168 const string
& service_name
)
169 : PaxosService(mn
, p
, service_name
),
171 inc_osd_cache(g_conf
->mon_osd_cache_size
),
172 full_osd_cache(g_conf
->mon_osd_cache_size
),
173 last_attempted_minwait_time(utime_t()),
174 mapper(mn
->cct
, &mn
->cpu_tp
),
175 op_tracker(cct
, true, 1)
178 bool OSDMonitor::_have_pending_crush()
180 return pending_inc
.crush
.length() > 0;
183 CrushWrapper
&OSDMonitor::_get_stable_crush()
185 return *osdmap
.crush
;
188 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
191 if (pending_inc
.crush
.length())
192 bl
= pending_inc
.crush
;
194 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
196 bufferlist::iterator p
= bl
.begin();
200 void OSDMonitor::create_initial()
202 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
207 mon
->store
->get("mkfs", "osdmap", bl
);
211 newmap
.set_fsid(mon
->monmap
->fsid
);
213 newmap
.build_simple(g_ceph_context
, 0, mon
->monmap
->fsid
, 0,
214 g_conf
->osd_pg_bits
, g_conf
->osd_pgp_bits
);
217 newmap
.created
= newmap
.modified
= ceph_clock_now();
219 // new clusters should sort bitwise by default.
220 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
222 // new cluster should require latest by default
223 newmap
.set_flag(CEPH_OSDMAP_REQUIRE_JEWEL
);
224 newmap
.set_flag(CEPH_OSDMAP_REQUIRE_KRAKEN
);
225 if (!g_conf
->mon_debug_no_require_luminous
) {
226 newmap
.set_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
);
227 newmap
.full_ratio
= g_conf
->mon_osd_full_ratio
;
228 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
229 newmap
.backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
230 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
231 newmap
.nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
232 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
233 newmap
.require_min_compat_client
= g_conf
->mon_osd_initial_require_min_compat_client
;
236 // encode into pending incremental
237 newmap
.encode(pending_inc
.fullmap
,
238 mon
->get_quorum_con_features() | CEPH_FEATURE_RESERVED
);
239 pending_inc
.full_crc
= newmap
.get_crc();
240 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
243 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
)
245 s
.insert(service_name
);
246 s
.insert(OSD_PG_CREATING_PREFIX
);
249 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
251 version_t version
= get_last_committed();
252 if (version
== osdmap
.epoch
)
254 assert(version
> osdmap
.epoch
);
256 dout(15) << "update_from_paxos paxos e " << version
257 << ", my e " << osdmap
.epoch
<< dendl
;
261 * We will possibly have a stashed latest that *we* wrote, and we will
262 * always be sure to have the oldest full map in the first..last range
263 * due to encode_trim_extra(), which includes the oldest full map in the trim
266 * encode_trim_extra() does not however write the full map's
267 * version to 'full_latest'. This is only done when we are building the
268 * full maps from the incremental versions. But don't panic! We make sure
269 * that the following conditions find whichever full map version is newer.
271 version_t latest_full
= get_version_latest_full();
272 if (latest_full
== 0 && get_first_committed() > 1)
273 latest_full
= get_first_committed();
275 if (get_first_committed() > 1 &&
276 latest_full
< get_first_committed()) {
277 // the monitor could be just sync'ed with its peer, and the latest_full key
278 // is not encoded in the paxos commits in encode_pending(), so we need to
279 // make sure we get it pointing to a proper version.
280 version_t lc
= get_last_committed();
281 version_t fc
= get_first_committed();
283 dout(10) << __func__
<< " looking for valid full map in interval"
284 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
287 for (version_t v
= lc
; v
>= fc
; v
--) {
288 string full_key
= "full_" + stringify(v
);
289 if (mon
->store
->exists(get_service_name(), full_key
)) {
290 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
296 assert(latest_full
> 0);
297 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
298 put_version_latest_full(t
, latest_full
);
299 mon
->store
->apply_transaction(t
);
300 dout(10) << __func__
<< " updated the on-disk full map version to "
301 << latest_full
<< dendl
;
304 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
305 bufferlist latest_bl
;
306 get_version_full(latest_full
, latest_bl
);
307 assert(latest_bl
.length() != 0);
308 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
309 osdmap
.decode(latest_bl
);
312 if (mon
->monmap
->get_required_features().contains_all(
313 ceph::features::mon::FEATURE_LUMINOUS
)) {
315 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
317 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
318 creating_pgs
.decode(p
);
319 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
320 << creating_pgs
.last_scan_epoch
321 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
323 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
328 // walk through incrementals
329 MonitorDBStore::TransactionRef t
;
331 while (version
> osdmap
.epoch
) {
333 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
335 assert(inc_bl
.length());
337 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
339 OSDMap::Incremental
inc(inc_bl
);
340 err
= osdmap
.apply_incremental(inc
);
344 t
.reset(new MonitorDBStore::Transaction
);
346 // Write out the full map for all past epochs. Encode the full
347 // map with the same features as the incremental. If we don't
348 // know, use the quorum features. If we don't know those either,
349 // encode with all features.
350 uint64_t f
= inc
.encode_features
;
352 f
= mon
->get_quorum_con_features();
356 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
357 tx_size
+= full_bl
.length();
359 bufferlist orig_full_bl
;
360 get_version_full(osdmap
.epoch
, orig_full_bl
);
361 if (orig_full_bl
.length()) {
362 // the primary provided the full map
363 assert(inc
.have_crc
);
364 if (inc
.full_crc
!= osdmap
.crc
) {
365 // This will happen if the mons were running mixed versions in
366 // the past or some other circumstance made the full encoded
367 // maps divergent. Reloading here will bring us back into
368 // sync with the primary for this and all future maps. OSDs
369 // will also be brought back into sync when they discover the
370 // crc mismatch and request a full map from a mon.
371 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
374 osdmap
.decode(orig_full_bl
);
377 assert(!inc
.have_crc
);
378 put_version_full(t
, osdmap
.epoch
, full_bl
);
380 put_version_latest_full(t
, osdmap
.epoch
);
383 dout(1) << osdmap
<< dendl
;
385 if (osdmap
.epoch
== 1) {
386 t
->erase("mkfs", "osdmap");
389 if (tx_size
> g_conf
->mon_sync_max_payload_size
*2) {
390 mon
->store
->apply_transaction(t
);
391 t
= MonitorDBStore::TransactionRef();
394 if (mon
->monmap
->get_required_features().contains_all(
395 ceph::features::mon::FEATURE_LUMINOUS
)) {
396 creating_pgs
= update_pending_pgs(inc
);
397 for (const auto &osd_state
: inc
.new_state
) {
398 if (osd_state
.second
& CEPH_OSD_UP
) {
399 // could be marked up *or* down, but we're too lazy to check which
400 last_osd_report
.erase(osd_state
.first
);
402 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
403 // could be created *or* destroyed, but we can safely drop it
404 osd_epochs
.erase(osd_state
.first
);
411 mon
->store
->apply_transaction(t
);
414 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
415 if (osdmap
.is_out(o
))
417 auto found
= down_pending_out
.find(o
);
418 if (osdmap
.is_down(o
)) {
419 // populate down -> out map
420 if (found
== down_pending_out
.end()) {
421 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
422 down_pending_out
[o
] = ceph_clock_now();
425 if (found
!= down_pending_out
.end()) {
426 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
427 down_pending_out
.erase(found
);
431 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
433 if (mon
->is_leader()) {
434 // kick pgmon, make sure it's seen the latest map
435 mon
->pgmon()->check_osd_map(osdmap
.epoch
);
439 check_pg_creates_subs();
441 share_map_with_random_osd();
446 // make sure our feature bits reflect the latest map
447 update_msgr_features();
449 if (!mon
->is_leader()) {
450 // will be called by on_active() on the leader, avoid doing so twice
455 void OSDMonitor::start_mapping()
457 // initiate mapping job
459 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
461 mapping_job
->abort();
463 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
464 mapping_job
= mapping
.start_update(osdmap
, mapper
,
465 g_conf
->mon_osd_mapping_pgs_per_chunk
);
466 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
467 << " at " << fin
->start
<< dendl
;
468 mapping_job
->set_finish_event(fin
);
471 void OSDMonitor::update_msgr_features()
474 types
.insert((int)entity_name_t::TYPE_OSD
);
475 types
.insert((int)entity_name_t::TYPE_CLIENT
);
476 types
.insert((int)entity_name_t::TYPE_MDS
);
477 types
.insert((int)entity_name_t::TYPE_MON
);
478 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
480 uint64_t features
= osdmap
.get_features(*q
, &mask
);
481 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
482 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
483 Messenger::Policy p
= mon
->messenger
->get_policy(*q
);
484 p
.features_required
= (p
.features_required
& ~mask
) | features
;
485 mon
->messenger
->set_policy(*q
, p
);
490 void OSDMonitor::on_active()
494 if (mon
->is_leader()) {
495 mon
->clog
->info() << "osdmap " << osdmap
;
497 list
<MonOpRequestRef
> ls
;
498 take_all_failures(ls
);
499 while (!ls
.empty()) {
500 MonOpRequestRef op
= ls
.front();
501 op
->mark_osdmon_event(__func__
);
509 void OSDMonitor::on_restart()
511 last_osd_report
.clear();
514 void OSDMonitor::on_shutdown()
516 dout(10) << __func__
<< dendl
;
518 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
520 mapping_job
->abort();
523 // discard failure info, waiters
524 list
<MonOpRequestRef
> ls
;
525 take_all_failures(ls
);
529 void OSDMonitor::update_logger()
531 dout(10) << "update_logger" << dendl
;
533 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
534 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
535 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
536 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
539 template <typename F
>
540 class OSDUtilizationDumper
: public CrushTreeDumper::Dumper
<F
> {
542 typedef CrushTreeDumper::Dumper
<F
> Parent
;
544 OSDUtilizationDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
545 const PGMap
*pgm_
, bool tree_
) :
550 average_util(average_utilization()),
558 void dump_stray(F
*f
) {
559 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
560 if (osdmap
->exists(i
) && !this->is_touched(i
))
561 dump_item(CrushTreeDumper::Item(i
, 0, 0), f
);
565 void dump_item(const CrushTreeDumper::Item
&qi
, F
*f
) override
{
566 if (!tree
&& qi
.is_bucket())
569 float reweight
= qi
.is_bucket() ? -1 : osdmap
->get_weightf(qi
.id
);
570 int64_t kb
= 0, kb_used
= 0, kb_avail
= 0;
572 if (get_bucket_utilization(qi
.id
, &kb
, &kb_used
, &kb_avail
))
574 util
= 100.0 * (double)kb_used
/ (double)kb
;
578 var
= util
/ average_util
;
580 size_t num_pgs
= qi
.is_bucket() ? 0 : pgm
->get_num_pg_by_osd(qi
.id
);
582 dump_item(qi
, reweight
, kb
, kb_used
, kb_avail
, util
, var
, num_pgs
, f
);
584 if (!qi
.is_bucket() && reweight
> 0) {
585 if (min_var
< 0 || var
< min_var
)
587 if (max_var
< 0 || var
> max_var
)
590 double dev
= util
- average_util
;
592 stddev
+= reweight
* dev
;
597 virtual void dump_item(const CrushTreeDumper::Item
&qi
,
604 const size_t num_pgs
,
608 return sum
> 0 ? sqrt(stddev
/ sum
) : 0;
611 double average_utilization() {
612 int64_t kb
= 0, kb_used
= 0;
613 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
614 if (!osdmap
->exists(i
) || osdmap
->get_weight(i
) == 0)
616 int64_t kb_i
, kb_used_i
, kb_avail_i
;
617 if (get_osd_utilization(i
, &kb_i
, &kb_used_i
, &kb_avail_i
)) {
619 kb_used
+= kb_used_i
;
622 return kb
> 0 ? 100.0 * (double)kb_used
/ (double)kb
: 0;
625 bool get_osd_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
626 int64_t* kb_avail
) const {
627 typedef ceph::unordered_map
<int32_t,osd_stat_t
> OsdStat
;
628 OsdStat::const_iterator p
= pgm
->osd_stat
.find(id
);
629 if (p
== pgm
->osd_stat
.end())
632 *kb_used
= p
->second
.kb_used
;
633 *kb_avail
= p
->second
.kb_avail
;
637 bool get_bucket_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
638 int64_t* kb_avail
) const {
640 if (osdmap
->is_out(id
)) {
646 return get_osd_utilization(id
, kb
, kb_used
, kb_avail
);
653 for (int k
= osdmap
->crush
->get_bucket_size(id
) - 1; k
>= 0; k
--) {
654 int item
= osdmap
->crush
->get_bucket_item(id
, k
);
655 int64_t kb_i
= 0, kb_used_i
= 0, kb_avail_i
= 0;
656 if (!get_bucket_utilization(item
, &kb_i
, &kb_used_i
, &kb_avail_i
))
659 *kb_used
+= kb_used_i
;
660 *kb_avail
+= kb_avail_i
;
666 const OSDMap
*osdmap
;
676 class OSDUtilizationPlainDumper
: public OSDUtilizationDumper
<TextTable
> {
678 typedef OSDUtilizationDumper
<TextTable
> Parent
;
680 OSDUtilizationPlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
681 const PGMap
*pgm
, bool tree
) :
682 Parent(crush
, osdmap
, pgm
, tree
) {}
684 void dump(TextTable
*tbl
) {
685 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
686 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
687 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
688 tbl
->define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
689 tbl
->define_column("USE", TextTable::LEFT
, TextTable::RIGHT
);
690 tbl
->define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
691 tbl
->define_column("%USE", TextTable::LEFT
, TextTable::RIGHT
);
692 tbl
->define_column("VAR", TextTable::LEFT
, TextTable::RIGHT
);
693 tbl
->define_column("PGS", TextTable::LEFT
, TextTable::RIGHT
);
695 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
701 *tbl
<< "" << "" << "TOTAL"
702 << si_t(pgm
->osd_sum
.kb
<< 10)
703 << si_t(pgm
->osd_sum
.kb_used
<< 10)
704 << si_t(pgm
->osd_sum
.kb_avail
<< 10)
705 << lowprecision_t(average_util
)
707 << TextTable::endrow
;
711 struct lowprecision_t
{
713 explicit lowprecision_t(float _v
) : v(_v
) {}
715 friend std::ostream
&operator<<(ostream
& out
, const lowprecision_t
& v
);
717 using OSDUtilizationDumper
<TextTable
>::dump_item
;
718 void dump_item(const CrushTreeDumper::Item
&qi
,
725 const size_t num_pgs
,
726 TextTable
*tbl
) override
{
728 << weightf_t(qi
.weight
)
729 << weightf_t(reweight
)
731 << si_t(kb_used
<< 10)
732 << si_t(kb_avail
<< 10)
733 << lowprecision_t(util
)
734 << lowprecision_t(var
);
736 if (qi
.is_bucket()) {
744 for (int k
= 0; k
< qi
.depth
; k
++)
746 if (qi
.is_bucket()) {
747 int type
= crush
->get_bucket_type(qi
.id
);
748 name
<< crush
->get_type_name(type
) << " "
749 << crush
->get_item_name(qi
.id
);
751 name
<< "osd." << qi
.id
;
756 *tbl
<< TextTable::endrow
;
762 out
<< "MIN/MAX VAR: " << lowprecision_t(min_var
)
763 << "/" << lowprecision_t(max_var
) << " "
764 << "STDDEV: " << lowprecision_t(dev());
769 ostream
& operator<<(ostream
& out
,
770 const OSDUtilizationPlainDumper::lowprecision_t
& v
)
774 } else if (v
.v
< 0.001) {
777 std::streamsize p
= out
.precision();
778 return out
<< std::fixed
<< std::setprecision(2) << v
.v
<< std::setprecision(p
);
782 class OSDUtilizationFormatDumper
: public OSDUtilizationDumper
<Formatter
> {
784 typedef OSDUtilizationDumper
<Formatter
> Parent
;
786 OSDUtilizationFormatDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
787 const PGMap
*pgm
, bool tree
) :
788 Parent(crush
, osdmap
, pgm
, tree
) {}
790 void dump(Formatter
*f
) {
791 f
->open_array_section("nodes");
795 f
->open_array_section("stray");
801 using OSDUtilizationDumper
<Formatter
>::dump_item
;
802 void dump_item(const CrushTreeDumper::Item
&qi
,
809 const size_t num_pgs
,
810 Formatter
*f
) override
{
811 f
->open_object_section("item");
812 CrushTreeDumper::dump_item_fields(crush
, qi
, f
);
813 f
->dump_float("reweight", reweight
);
814 f
->dump_int("kb", kb
);
815 f
->dump_int("kb_used", kb_used
);
816 f
->dump_int("kb_avail", kb_avail
);
817 f
->dump_float("utilization", util
);
818 f
->dump_float("var", var
);
819 f
->dump_unsigned("pgs", num_pgs
);
820 CrushTreeDumper::dump_bucket_children(crush
, qi
, f
);
825 void summary(Formatter
*f
) {
826 f
->open_object_section("summary");
827 f
->dump_int("total_kb", pgm
->osd_sum
.kb
);
828 f
->dump_int("total_kb_used", pgm
->osd_sum
.kb_used
);
829 f
->dump_int("total_kb_avail", pgm
->osd_sum
.kb_avail
);
830 f
->dump_float("average_utilization", average_util
);
831 f
->dump_float("min_var", min_var
);
832 f
->dump_float("max_var", max_var
);
833 f
->dump_float("dev", dev());
838 void OSDMonitor::print_utilization(ostream
&out
, Formatter
*f
, bool tree
) const
840 const PGMap
*pgm
= &mon
->pgmon()->pg_map
;
841 const CrushWrapper
*crush
= osdmap
.crush
.get();
844 f
->open_object_section("df");
845 OSDUtilizationFormatDumper
d(crush
, &osdmap
, pgm
, tree
);
851 OSDUtilizationPlainDumper
d(crush
, &osdmap
, pgm
, tree
);
855 << d
.summary() << "\n";
859 void OSDMonitor::create_pending()
861 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
862 pending_inc
.fsid
= mon
->monmap
->fsid
;
864 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
866 // clean up pg_temp, primary_temp
867 OSDMap::clean_temps(g_ceph_context
, osdmap
, &pending_inc
);
868 dout(10) << "create_pending did clean_temps" << dendl
;
870 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
871 // instead of osd_backfill_full_ratio config
872 if (osdmap
.backfillfull_ratio
<= 0) {
873 pending_inc
.new_backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
874 if (pending_inc
.new_backfillfull_ratio
> 1.0)
875 pending_inc
.new_backfillfull_ratio
/= 100;
876 dout(1) << __func__
<< " setting backfillfull_ratio = "
877 << pending_inc
.new_backfillfull_ratio
<< dendl
;
879 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
880 // transition full ratios from PGMap to OSDMap (on upgrade)
881 PGMap
*pg_map
= &mon
->pgmon()->pg_map
;
882 if (osdmap
.full_ratio
!= pg_map
->full_ratio
) {
883 dout(10) << __func__
<< " full_ratio " << osdmap
.full_ratio
884 << " -> " << pg_map
->full_ratio
<< " (from pgmap)" << dendl
;
885 pending_inc
.new_full_ratio
= pg_map
->full_ratio
;
887 if (osdmap
.nearfull_ratio
!= pg_map
->nearfull_ratio
) {
888 dout(10) << __func__
<< " nearfull_ratio " << osdmap
.nearfull_ratio
889 << " -> " << pg_map
->nearfull_ratio
<< " (from pgmap)" << dendl
;
890 pending_inc
.new_nearfull_ratio
= pg_map
->nearfull_ratio
;
893 // safety check (this shouldn't really happen)
894 if (osdmap
.full_ratio
<= 0) {
895 pending_inc
.new_full_ratio
= g_conf
->mon_osd_full_ratio
;
896 if (pending_inc
.new_full_ratio
> 1.0)
897 pending_inc
.new_full_ratio
/= 100;
898 dout(1) << __func__
<< " setting full_ratio = "
899 << pending_inc
.new_full_ratio
<< dendl
;
901 if (osdmap
.nearfull_ratio
<= 0) {
902 pending_inc
.new_nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
903 if (pending_inc
.new_nearfull_ratio
> 1.0)
904 pending_inc
.new_nearfull_ratio
/= 100;
905 dout(1) << __func__
<< " setting nearfull_ratio = "
906 << pending_inc
.new_nearfull_ratio
<< dendl
;
912 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
)
914 creating_pgs_t pending_creatings
;
916 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
917 pending_creatings
= creating_pgs
;
919 if (pending_creatings
.last_scan_epoch
> inc
.epoch
) {
920 return pending_creatings
;
922 for (auto& pg
: pending_created_pgs
) {
923 pending_creatings
.created_pools
.insert(pg
.pool());
924 pending_creatings
.pgs
.erase(pg
);
926 pending_created_pgs
.clear();
927 // PAXOS_PGMAP is less than PAXOS_OSDMAP, so PGMonitor::update_from_paxos()
928 // should have prepared the latest pgmap if any
929 const auto& pgm
= mon
->pgmon()->pg_map
;
930 if (pgm
.last_pg_scan
>= creating_pgs
.last_scan_epoch
) {
931 // TODO: please stop updating pgmap with pgstats once the upgrade is completed
932 for (auto& pgid
: pgm
.creating_pgs
) {
933 auto st
= pgm
.pg_stat
.find(pgid
);
934 assert(st
!= pgm
.pg_stat
.end());
935 auto created
= make_pair(st
->second
.created
, st
->second
.last_scrub_stamp
);
936 // no need to add the pg, if it already exists in creating_pgs
937 pending_creatings
.pgs
.emplace(pgid
, created
);
940 for (auto old_pool
: inc
.old_pools
) {
941 pending_creatings
.created_pools
.erase(old_pool
);
942 const auto removed_pool
= (uint64_t)old_pool
;
944 pending_creatings
.pgs
.lower_bound(pg_t
{0, removed_pool
});
946 pending_creatings
.pgs
.lower_bound(pg_t
{0, removed_pool
+ 1});
947 pending_creatings
.pgs
.erase(first
, last
);
948 last_epoch_clean
.remove_pool(removed_pool
);
950 scan_for_creating_pgs(osdmap
.get_pools(),
954 scan_for_creating_pgs(inc
.new_pools
,
958 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
959 return pending_creatings
;
962 void OSDMonitor::maybe_prime_pg_temp()
965 if (pending_inc
.crush
.length()) {
966 dout(10) << __func__
<< " new crush map, all" << dendl
;
970 if (!pending_inc
.new_up_client
.empty()) {
971 dout(10) << __func__
<< " new up osds, all" << dendl
;
975 // check for interesting OSDs
977 for (map
<int32_t,uint8_t>::iterator p
= pending_inc
.new_state
.begin();
978 !all
&& p
!= pending_inc
.new_state
.end();
980 if ((p
->second
& CEPH_OSD_UP
) &&
981 osdmap
.is_up(p
->first
)) {
982 osds
.insert(p
->first
);
985 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
986 !all
&& p
!= pending_inc
.new_weight
.end();
988 if (p
->second
< osdmap
.get_weight(p
->first
)) {
990 osds
.insert(p
->first
);
992 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
998 if (!all
&& osds
.empty())
1003 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1004 if (estimate
> mapping
.get_num_pgs() *
1005 g_conf
->mon_osd_prime_pg_temp_max_estimate
) {
1006 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1007 << osds
.size() << " osds >= "
1008 << g_conf
->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1009 << mapping
.get_num_pgs() << " pgs, all"
1013 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1014 << osds
.size() << " osds" << dendl
;
1019 next
.deepish_copy_from(osdmap
);
1020 next
.apply_incremental(pending_inc
);
1023 PrimeTempJob
job(next
, this);
1024 mapper
.queue(&job
, g_conf
->mon_osd_mapping_pgs_per_chunk
);
1025 if (job
.wait_for(g_conf
->mon_osd_prime_pg_temp_max_time
)) {
1026 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1028 dout(10) << __func__
<< " did not finish in "
1029 << g_conf
->mon_osd_prime_pg_temp_max_time
1030 << ", stopping" << dendl
;
1034 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1035 utime_t stop
= ceph_clock_now();
1036 stop
+= g_conf
->mon_osd_prime_pg_temp_max_time
;
1037 const int chunk
= 1000;
1039 std::unordered_set
<pg_t
> did_pgs
;
1040 for (auto osd
: osds
) {
1041 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1042 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1043 for (auto pgid
: pgs
) {
1044 if (!did_pgs
.insert(pgid
).second
) {
1047 prime_pg_temp(next
, pgid
);
1050 if (ceph_clock_now() > stop
) {
1051 dout(10) << __func__
<< " consumed more than "
1052 << g_conf
->mon_osd_prime_pg_temp_max_time
1053 << " seconds, stopping"
1063 void OSDMonitor::prime_pg_temp(
1067 if (mon
->monmap
->get_required_features().contains_all(
1068 ceph::features::mon::FEATURE_LUMINOUS
)) {
1069 if (creating_pgs
.pgs
.count(pgid
)) {
1073 const auto& pg_map
= mon
->pgmon()->pg_map
;
1074 if (pg_map
.creating_pgs
.count(pgid
)) {
1078 if (!osdmap
.pg_exists(pgid
)) {
1082 vector
<int> up
, acting
;
1083 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1085 vector
<int> next_up
, next_acting
;
1086 int next_up_primary
, next_acting_primary
;
1087 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1088 &next_acting
, &next_acting_primary
);
1089 if (acting
== next_acting
)
1090 return; // no change since last epoch
1093 return; // if previously empty now we can be no worse off
1094 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1095 if (pool
&& acting
.size() < pool
->min_size
)
1096 return; // can be no worse off than before
1098 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1099 << " -> " << next_up
<< "/" << next_acting
1100 << ", priming " << acting
1103 Mutex::Locker
l(prime_pg_temp_lock
);
1104 // do not touch a mapping if a change is pending
1105 pending_inc
.new_pg_temp
.emplace(
1107 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1112 * @note receiving a transaction in this function gives a fair amount of
1113 * freedom to the service implementation if it does need it. It shouldn't.
1115 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1117 dout(10) << "encode_pending e " << pending_inc
.epoch
1120 // finalize up pending_inc
1121 pending_inc
.modified
= ceph_clock_now();
1123 int r
= pending_inc
.propagate_snaps_to_tiers(g_ceph_context
, osdmap
);
1127 if (!mapping_job
->is_done()) {
1128 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1129 << mapping_job
.get() << " did not complete, "
1130 << mapping_job
->shards
<< " left" << dendl
;
1131 mapping_job
->abort();
1132 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1133 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1134 << mapping_job
.get() << " is prior epoch "
1135 << mapping
.get_epoch() << dendl
;
1137 if (g_conf
->mon_osd_prime_pg_temp
) {
1138 maybe_prime_pg_temp();
1141 } else if (g_conf
->mon_osd_prime_pg_temp
) {
1142 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1145 mapping_job
.reset();
1151 tmp
.deepish_copy_from(osdmap
);
1152 tmp
.apply_incremental(pending_inc
);
1154 if (tmp
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
1155 // set or clear full/nearfull?
1156 int full
, backfill
, nearfull
;
1157 tmp
.count_full_nearfull_osds(&full
, &backfill
, &nearfull
);
1159 if (!tmp
.test_flag(CEPH_OSDMAP_FULL
)) {
1160 dout(10) << __func__
<< " setting full flag" << dendl
;
1161 add_flag(CEPH_OSDMAP_FULL
);
1162 remove_flag(CEPH_OSDMAP_NEARFULL
);
1165 if (tmp
.test_flag(CEPH_OSDMAP_FULL
)) {
1166 dout(10) << __func__
<< " clearing full flag" << dendl
;
1167 remove_flag(CEPH_OSDMAP_FULL
);
1170 if (!tmp
.test_flag(CEPH_OSDMAP_NEARFULL
)) {
1171 dout(10) << __func__
<< " setting nearfull flag" << dendl
;
1172 add_flag(CEPH_OSDMAP_NEARFULL
);
1175 if (tmp
.test_flag(CEPH_OSDMAP_NEARFULL
)) {
1176 dout(10) << __func__
<< " clearing nearfull flag" << dendl
;
1177 remove_flag(CEPH_OSDMAP_NEARFULL
);
1182 // min_compat_client?
1183 if (tmp
.require_min_compat_client
.empty()) {
1184 auto mv
= tmp
.get_min_compat_client();
1185 dout(1) << __func__
<< " setting require_min_compat_client to current " << mv
1187 mon
->clog
->info() << "setting require_min_compat_client to currently required "
1189 pending_inc
.new_require_min_compat_client
= mv
.first
;
1195 for (map
<int32_t,uint8_t>::iterator i
= pending_inc
.new_state
.begin();
1196 i
!= pending_inc
.new_state
.end();
1198 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1199 if (s
& CEPH_OSD_UP
)
1200 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1201 if (s
& CEPH_OSD_EXISTS
)
1202 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1204 for (map
<int32_t,entity_addr_t
>::iterator i
= pending_inc
.new_up_client
.begin();
1205 i
!= pending_inc
.new_up_client
.end();
1207 //FIXME: insert cluster addresses too
1208 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1210 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1211 i
!= pending_inc
.new_weight
.end();
1213 if (i
->second
== CEPH_OSD_OUT
) {
1214 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1215 } else if (i
->second
== CEPH_OSD_IN
) {
1216 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1218 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1222 // features for osdmap and its incremental
1223 uint64_t features
= mon
->get_quorum_con_features();
1225 // encode full map and determine its crc
1228 tmp
.deepish_copy_from(osdmap
);
1229 tmp
.apply_incremental(pending_inc
);
1231 // determine appropriate features
1232 if (!tmp
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
1233 dout(10) << __func__
<< " encoding without feature SERVER_LUMINOUS"
1235 features
&= ~CEPH_FEATURE_SERVER_LUMINOUS
;
1237 if (!tmp
.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL
)) {
1238 dout(10) << __func__
<< " encoding without feature SERVER_JEWEL" << dendl
;
1239 features
&= ~CEPH_FEATURE_SERVER_JEWEL
;
1241 if (!tmp
.test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
1242 dout(10) << __func__
<< " encoding without feature SERVER_KRAKEN | "
1243 << "MSG_ADDR2" << dendl
;
1244 features
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
1245 CEPH_FEATURE_MSG_ADDR2
);
1247 dout(10) << __func__
<< " encoding full map with " << features
<< dendl
;
1250 ::encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1251 pending_inc
.full_crc
= tmp
.get_crc();
1253 // include full map in the txn. note that old monitors will
1254 // overwrite this. new ones will now skip the local full map
1255 // encode and reload from this.
1256 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1260 assert(get_last_committed() + 1 == pending_inc
.epoch
);
1261 ::encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1263 dout(20) << " full_crc " << tmp
.get_crc()
1264 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1266 /* put everything in the transaction */
1267 put_version(t
, pending_inc
.epoch
, bl
);
1268 put_last_committed(t
, pending_inc
.epoch
);
1271 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1272 p
!= pending_metadata
.end();
1274 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1275 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1276 p
!= pending_metadata_rm
.end();
1278 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1279 pending_metadata
.clear();
1280 pending_metadata_rm
.clear();
1282 // and pg creating, also!
1283 if (mon
->monmap
->get_required_features().contains_all(
1284 ceph::features::mon::FEATURE_LUMINOUS
)) {
1285 auto pending_creatings
= update_pending_pgs(pending_inc
);
1286 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
1287 dout(7) << __func__
<< " in the middle of upgrading, "
1288 << " trimming pending creating_pgs using pgmap" << dendl
;
1289 trim_creating_pgs(&pending_creatings
, mon
->pgmon()->pg_map
);
1291 bufferlist creatings_bl
;
1292 ::encode(pending_creatings
, creatings_bl
);
1293 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1297 void OSDMonitor::trim_creating_pgs(creating_pgs_t
* creating_pgs
,
1300 auto p
= creating_pgs
->pgs
.begin();
1301 while (p
!= creating_pgs
->pgs
.end()) {
1302 auto q
= pgm
.pg_stat
.find(p
->first
);
1303 if (q
!= pgm
.pg_stat
.end() &&
1304 !(q
->second
.state
& PG_STATE_CREATING
)) {
1305 dout(20) << __func__
<< " pgmap shows " << p
->first
<< " is created"
1307 p
= creating_pgs
->pgs
.erase(p
);
1308 creating_pgs
->created_pools
.insert(q
->first
.pool());
1315 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
1318 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
1322 bufferlist::iterator p
= bl
.begin();
1325 catch (buffer::error
& e
) {
1327 *err
<< "osd." << osd
<< " metadata is corrupt";
1333 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
1335 map
<string
, string
> metadata
;
1336 int r
= load_metadata(osd
, metadata
, nullptr);
1340 auto it
= metadata
.find("osd_objectstore");
1341 if (it
== metadata
.end())
1347 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
1348 const pg_pool_t
&pool
,
1351 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1352 // since filestore osds could always join the pool later
1353 set
<int> checked_osds
;
1354 for (unsigned ps
= 0; ps
< MIN(8, pool
.get_pg_num()); ++ps
) {
1355 vector
<int> up
, acting
;
1356 pg_t
pgid(ps
, pool_id
, -1);
1357 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
1358 for (int osd
: up
) {
1359 if (checked_osds
.find(osd
) != checked_osds
.end())
1361 string objectstore_type
;
1362 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
1363 // allow with missing metadata, e.g. due to an osd never booting yet
1364 if (r
< 0 || objectstore_type
== "bluestore") {
1365 checked_osds
.insert(osd
);
1368 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
1375 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
1377 map
<string
,string
> m
;
1378 if (int r
= load_metadata(osd
, m
, err
))
1380 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
1381 f
->dump_string(p
->first
.c_str(), p
->second
);
1385 void OSDMonitor::print_nodes(Formatter
*f
)
1387 // group OSDs by their hosts
1388 map
<string
, list
<int> > osds
; // hostname => osd
1389 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
1390 map
<string
, string
> m
;
1391 if (load_metadata(osd
, m
, NULL
)) {
1394 map
<string
, string
>::iterator hostname
= m
.find("hostname");
1395 if (hostname
== m
.end()) {
1396 // not likely though
1399 osds
[hostname
->second
].push_back(osd
);
1402 dump_services(f
, osds
, "osd");
1405 void OSDMonitor::share_map_with_random_osd()
1407 if (osdmap
.get_num_up_osds() == 0) {
1408 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
1412 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
1414 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
1418 dout(10) << "committed, telling random " << s
->inst
<< " all about it" << dendl
;
1419 // whatev, they'll request more if they need it
1420 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch());
1421 s
->con
->send_message(m
);
1422 // NOTE: do *not* record osd has up to this epoch (as we do
1423 // elsewhere) as they may still need to request older values.
1426 version_t
OSDMonitor::get_trim_to()
1430 if (mon
->monmap
->get_required_features().contains_all(
1431 ceph::features::mon::FEATURE_LUMINOUS
)) {
1433 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1434 if (!creating_pgs
.pgs
.empty()) {
1438 floor
= get_min_last_epoch_clean();
1440 if (!mon
->pgmon()->is_readable())
1442 if (mon
->pgmon()->pg_map
.creating_pgs
.empty()) {
1445 floor
= mon
->pgmon()->pg_map
.get_min_last_epoch_clean();
1448 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
1449 if (g_conf
->mon_osd_force_trim_to
> 0 &&
1450 g_conf
->mon_osd_force_trim_to
< (int)get_last_committed()) {
1451 floor
= g_conf
->mon_osd_force_trim_to
;
1452 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
1454 unsigned min
= g_conf
->mon_min_osdmap_epochs
;
1455 if (floor
+ min
> get_last_committed()) {
1456 if (min
< get_last_committed())
1457 floor
= get_last_committed() - min
;
1461 if (floor
> get_first_committed())
1467 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
1469 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
1470 // also scan osd epochs
1471 // don't trim past the oldest reported osd epoch
1472 for (auto& osd_epoch
: osd_epochs
) {
1473 if (osd_epoch
.second
< floor
) {
1474 floor
= osd_epoch
.second
;
1480 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
1483 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
1485 get_version_full(first
, bl
);
1486 put_version_full(tx
, first
, bl
);
1491 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
1493 op
->mark_osdmon_event(__func__
);
1494 Message
*m
= op
->get_req();
1495 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1497 switch (m
->get_type()) {
1499 case MSG_MON_COMMAND
:
1500 return preprocess_command(op
);
1501 case CEPH_MSG_MON_GET_OSDMAP
:
1502 return preprocess_get_osdmap(op
);
1505 case MSG_OSD_MARK_ME_DOWN
:
1506 return preprocess_mark_me_down(op
);
1508 return preprocess_full(op
);
1509 case MSG_OSD_FAILURE
:
1510 return preprocess_failure(op
);
1512 return preprocess_boot(op
);
1514 return preprocess_alive(op
);
1515 case MSG_OSD_PG_CREATED
:
1516 return preprocess_pg_created(op
);
1517 case MSG_OSD_PGTEMP
:
1518 return preprocess_pgtemp(op
);
1519 case MSG_OSD_BEACON
:
1520 return preprocess_beacon(op
);
1522 case CEPH_MSG_POOLOP
:
1523 return preprocess_pool_op(op
);
1525 case MSG_REMOVE_SNAPS
:
1526 return preprocess_remove_snaps(op
);
1534 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
1536 op
->mark_osdmon_event(__func__
);
1537 Message
*m
= op
->get_req();
1538 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1540 switch (m
->get_type()) {
1542 case MSG_OSD_MARK_ME_DOWN
:
1543 return prepare_mark_me_down(op
);
1545 return prepare_full(op
);
1546 case MSG_OSD_FAILURE
:
1547 return prepare_failure(op
);
1549 return prepare_boot(op
);
1551 return prepare_alive(op
);
1552 case MSG_OSD_PG_CREATED
:
1553 return prepare_pg_created(op
);
1554 case MSG_OSD_PGTEMP
:
1555 return prepare_pgtemp(op
);
1556 case MSG_OSD_BEACON
:
1557 return prepare_beacon(op
);
1559 case MSG_MON_COMMAND
:
1560 return prepare_command(op
);
1562 case CEPH_MSG_POOLOP
:
1563 return prepare_pool_op(op
);
1565 case MSG_REMOVE_SNAPS
:
1566 return prepare_remove_snaps(op
);
1576 bool OSDMonitor::should_propose(double& delay
)
1578 dout(10) << "should_propose" << dendl
;
1580 // if full map, propose immediately! any subsequent changes will be clobbered.
1581 if (pending_inc
.fullmap
.length())
1584 // adjust osd weights?
1585 if (!osd_weight
.empty() &&
1586 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
1587 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
1588 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
1594 // propose as fast as possible if updating up_thru or pg_temp
1595 // want to merge OSDMap changes as much as possible
1596 if ((pending_inc
.new_primary_temp
.size() == 1
1597 || pending_inc
.new_up_thru
.size() == 1)
1598 && pending_inc
.new_state
.size() < 2) {
1599 dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl
;
1601 utime_t now
= ceph_clock_now();
1602 if (now
- last_attempted_minwait_time
> g_conf
->paxos_propose_interval
1603 && now
- paxos
->get_last_commit_time() > g_conf
->paxos_min_wait
) {
1604 delay
= g_conf
->paxos_min_wait
;
1605 last_attempted_minwait_time
= now
;
1610 return PaxosService::should_propose(delay
);
1615 // ---------------------------
1618 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
1620 op
->mark_osdmon_event(__func__
);
1621 MMonGetOSDMap
*m
= static_cast<MMonGetOSDMap
*>(op
->get_req());
1622 dout(10) << __func__
<< " " << *m
<< dendl
;
1623 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
);
1624 epoch_t first
= get_first_committed();
1625 epoch_t last
= osdmap
.get_epoch();
1626 int max
= g_conf
->osd_map_message_max
;
1627 for (epoch_t e
= MAX(first
, m
->get_full_first());
1628 e
<= MIN(last
, m
->get_full_last()) && max
> 0;
1630 int r
= get_version_full(e
, reply
->maps
[e
]);
1633 for (epoch_t e
= MAX(first
, m
->get_inc_first());
1634 e
<= MIN(last
, m
->get_inc_last()) && max
> 0;
1636 int r
= get_version(e
, reply
->incremental_maps
[e
]);
1639 reply
->oldest_map
= first
;
1640 reply
->newest_map
= last
;
1641 mon
->send_reply(op
, reply
);
1646 // ---------------------------
1651 bool OSDMonitor::check_source(PaxosServiceMessage
*m
, uuid_d fsid
) {
1652 // check permissions
1653 MonSession
*session
= m
->get_session();
1656 if (!session
->is_capable("osd", MON_CAP_X
)) {
1657 dout(0) << "got MOSDFailure from entity with insufficient caps "
1658 << session
->caps
<< dendl
;
1661 if (fsid
!= mon
->monmap
->fsid
) {
1662 dout(0) << "check_source: on fsid " << fsid
1663 << " != " << mon
->monmap
->fsid
<< dendl
;
1670 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
1672 op
->mark_osdmon_event(__func__
);
1673 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1674 // who is target_osd
1675 int badboy
= m
->get_target().name
.num();
1677 // check permissions
1678 if (check_source(m
, m
->fsid
))
1681 // first, verify the reporting host is valid
1682 if (m
->get_orig_source().is_osd()) {
1683 int from
= m
->get_orig_source().num();
1684 if (!osdmap
.exists(from
) ||
1685 osdmap
.get_addr(from
) != m
->get_orig_source_inst().addr
||
1686 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
1687 dout(5) << "preprocess_failure from dead osd." << from
<< ", ignoring" << dendl
;
1688 send_incremental(op
, m
->get_epoch()+1);
1695 if (osdmap
.is_down(badboy
)) {
1696 dout(5) << "preprocess_failure dne(/dup?): " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1697 if (m
->get_epoch() < osdmap
.get_epoch())
1698 send_incremental(op
, m
->get_epoch()+1);
1701 if (osdmap
.get_inst(badboy
) != m
->get_target()) {
1702 dout(5) << "preprocess_failure wrong osd: report " << m
->get_target() << " != map's " << osdmap
.get_inst(badboy
)
1703 << ", from " << m
->get_orig_source_inst() << dendl
;
1704 if (m
->get_epoch() < osdmap
.get_epoch())
1705 send_incremental(op
, m
->get_epoch()+1);
1709 // already reported?
1710 if (osdmap
.is_down(badboy
) ||
1711 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
1712 dout(5) << "preprocess_failure dup/old: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1713 if (m
->get_epoch() < osdmap
.get_epoch())
1714 send_incremental(op
, m
->get_epoch()+1);
1718 if (!can_mark_down(badboy
)) {
1719 dout(5) << "preprocess_failure ignoring report of " << m
->get_target() << " from " << m
->get_orig_source_inst() << dendl
;
1723 dout(10) << "preprocess_failure new: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1730 class C_AckMarkedDown
: public C_MonOp
{
1736 : C_MonOp(op
), osdmon(osdmon
) {}
1738 void _finish(int) override
{
1739 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1740 osdmon
->mon
->send_reply(
1746 false)); // ACK itself does not request an ack
1748 ~C_AckMarkedDown() override
{
1752 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
1754 op
->mark_osdmon_event(__func__
);
1755 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1756 int requesting_down
= m
->get_target().name
.num();
1757 int from
= m
->get_orig_source().num();
1759 // check permissions
1760 if (check_source(m
, m
->fsid
))
1763 // first, verify the reporting host is valid
1764 if (!m
->get_orig_source().is_osd())
1767 if (!osdmap
.exists(from
) ||
1768 osdmap
.is_down(from
) ||
1769 osdmap
.get_addr(from
) != m
->get_target().addr
) {
1770 dout(5) << "preprocess_mark_me_down from dead osd."
1771 << from
<< ", ignoring" << dendl
;
1772 send_incremental(op
, m
->get_epoch()+1);
1776 // no down might be set
1777 if (!can_mark_down(requesting_down
))
1780 dout(10) << "MOSDMarkMeDown for: " << m
->get_target() << dendl
;
1784 if (m
->request_ack
) {
1785 Context
*c(new C_AckMarkedDown(this, op
));
1791 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
1793 op
->mark_osdmon_event(__func__
);
1794 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1795 int target_osd
= m
->get_target().name
.num();
1797 assert(osdmap
.is_up(target_osd
));
1798 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
1800 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
1801 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1803 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
1807 bool OSDMonitor::can_mark_down(int i
)
1809 if (osdmap
.test_flag(CEPH_OSDMAP_NODOWN
)) {
1810 dout(5) << "can_mark_down NODOWN flag set, will not mark osd." << i
<< " down" << dendl
;
1813 int num_osds
= osdmap
.get_num_osds();
1814 if (num_osds
== 0) {
1815 dout(5) << "can_mark_down no osds" << dendl
;
1818 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
1819 float up_ratio
= (float)up
/ (float)num_osds
;
1820 if (up_ratio
< g_conf
->mon_osd_min_up_ratio
) {
1821 dout(2) << "can_mark_down current up_ratio " << up_ratio
<< " < min "
1822 << g_conf
->mon_osd_min_up_ratio
1823 << ", will not mark osd." << i
<< " down" << dendl
;
1829 bool OSDMonitor::can_mark_up(int i
)
1831 if (osdmap
.test_flag(CEPH_OSDMAP_NOUP
)) {
1832 dout(5) << "can_mark_up NOUP flag set, will not mark osd." << i
<< " up" << dendl
;
1839 * @note the parameter @p i apparently only exists here so we can output the
1840 * osd's id on messages.
1842 bool OSDMonitor::can_mark_out(int i
)
1844 if (osdmap
.test_flag(CEPH_OSDMAP_NOOUT
)) {
1845 dout(5) << __func__
<< " NOOUT flag set, will not mark osds out" << dendl
;
1848 int num_osds
= osdmap
.get_num_osds();
1849 if (num_osds
== 0) {
1850 dout(5) << __func__
<< " no osds" << dendl
;
1853 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
1854 float in_ratio
= (float)in
/ (float)num_osds
;
1855 if (in_ratio
< g_conf
->mon_osd_min_in_ratio
) {
1857 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
1858 << g_conf
->mon_osd_min_in_ratio
1859 << ", will not mark osd." << i
<< " out" << dendl
;
1861 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
1862 << g_conf
->mon_osd_min_in_ratio
1863 << ", will not mark osds out" << dendl
;
1870 bool OSDMonitor::can_mark_in(int i
)
1872 if (osdmap
.test_flag(CEPH_OSDMAP_NOIN
)) {
1873 dout(5) << "can_mark_in NOIN flag set, will not mark osd." << i
<< " in" << dendl
;
1879 bool OSDMonitor::check_failures(utime_t now
)
1881 bool found_failure
= false;
1882 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
1883 p
!= failure_info
.end();
1885 if (can_mark_down(p
->first
)) {
1886 found_failure
|= check_failure(now
, p
->first
, p
->second
);
1889 return found_failure
;
1892 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
1894 // already pending failure?
1895 if (pending_inc
.new_state
.count(target_osd
) &&
1896 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
1897 dout(10) << " already pending failure" << dendl
;
1901 set
<string
> reporters_by_subtree
;
1902 string reporter_subtree_level
= g_conf
->mon_osd_reporter_subtree_level
;
1903 utime_t
orig_grace(g_conf
->osd_heartbeat_grace
, 0);
1904 utime_t max_failed_since
= fi
.get_failed_since();
1905 utime_t failed_for
= now
- max_failed_since
;
1907 utime_t grace
= orig_grace
;
1908 double my_grace
= 0, peer_grace
= 0;
1910 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1911 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
1912 decay_k
= ::log(.5) / halflife
;
1914 // scale grace period based on historical probability of 'lagginess'
1915 // (false positive failures due to slowness).
1916 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
1917 double decay
= exp((double)failed_for
* decay_k
);
1918 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
1919 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
1920 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
1924 // consider the peers reporting a failure a proxy for a potential
1925 // 'subcluster' over the overall cluster that is similarly
1926 // laggy. this is clearly not true in all cases, but will sometimes
1927 // help us localize the grace correction to a subset of the system
1928 // (say, a rack with a bad switch) that is unhappy.
1929 assert(fi
.reporters
.size());
1930 for (map
<int,failure_reporter_t
>::iterator p
= fi
.reporters
.begin();
1931 p
!= fi
.reporters
.end();
1933 // get the parent bucket whose type matches with "reporter_subtree_level".
1934 // fall back to OSD if the level doesn't exist.
1935 map
<string
, string
> reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
1936 map
<string
, string
>::iterator iter
= reporter_loc
.find(reporter_subtree_level
);
1937 if (iter
== reporter_loc
.end()) {
1938 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
1940 reporters_by_subtree
.insert(iter
->second
);
1942 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1943 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
1944 utime_t elapsed
= now
- xi
.down_stamp
;
1945 double decay
= exp((double)elapsed
* decay_k
);
1946 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
1950 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1951 peer_grace
/= (double)fi
.reporters
.size();
1952 grace
+= peer_grace
;
1955 dout(10) << " osd." << target_osd
<< " has "
1956 << fi
.reporters
.size() << " reporters, "
1957 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
1958 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
1961 if (failed_for
>= grace
&&
1962 (int)reporters_by_subtree
.size() >= g_conf
->mon_osd_min_down_reporters
) {
1963 dout(1) << " we have enough reporters to mark osd." << target_osd
1964 << " down" << dendl
;
1965 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1967 mon
->clog
->info() << osdmap
.get_inst(target_osd
) << " failed ("
1968 << (int)reporters_by_subtree
.size() << " reporters from different "
1969 << reporter_subtree_level
<< " after "
1970 << failed_for
<< " >= grace " << grace
<< ")";
1976 void OSDMonitor::force_failure(utime_t now
, int target_osd
)
1978 // already pending failure?
1979 if (pending_inc
.new_state
.count(target_osd
) &&
1980 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
1981 dout(10) << " already pending failure" << dendl
;
1985 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
1986 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1988 mon
->clog
->info() << osdmap
.get_inst(target_osd
) << " failed (forced)";
1992 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
1994 op
->mark_osdmon_event(__func__
);
1995 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1996 dout(1) << "prepare_failure " << m
->get_target()
1997 << " from " << m
->get_orig_source_inst()
1998 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
2000 int target_osd
= m
->get_target().name
.num();
2001 int reporter
= m
->get_orig_source().num();
2002 assert(osdmap
.is_up(target_osd
));
2003 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
2005 if (m
->if_osd_failed()) {
2006 // calculate failure time
2007 utime_t now
= ceph_clock_now();
2008 utime_t failed_since
=
2009 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
2012 if (m
->is_immediate()) {
2013 mon
->clog
->debug() << m
->get_target() << " reported immediately failed by "
2014 << m
->get_orig_source_inst();
2015 force_failure(now
, target_osd
);
2018 mon
->clog
->debug() << m
->get_target() << " reported failed by "
2019 << m
->get_orig_source_inst();
2021 failure_info_t
& fi
= failure_info
[target_osd
];
2022 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
2024 mon
->no_reply(old_op
);
2027 return check_failure(now
, target_osd
, fi
);
2029 // remove the report
2030 mon
->clog
->debug() << m
->get_target() << " failure report canceled by "
2031 << m
->get_orig_source_inst();
2032 if (failure_info
.count(target_osd
)) {
2033 failure_info_t
& fi
= failure_info
[target_osd
];
2034 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
2036 mon
->no_reply(report_op
);
2038 if (fi
.reporters
.empty()) {
2039 dout(10) << " removing last failure_info for osd." << target_osd
2041 failure_info
.erase(target_osd
);
2043 dout(10) << " failure_info for osd." << target_osd
<< " now "
2044 << fi
.reporters
.size() << " reporters" << dendl
;
2047 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
2055 void OSDMonitor::process_failures()
2057 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2058 while (p
!= failure_info
.end()) {
2059 if (osdmap
.is_up(p
->first
)) {
2062 dout(10) << "process_failures osd." << p
->first
<< dendl
;
2063 list
<MonOpRequestRef
> ls
;
2064 p
->second
.take_report_messages(ls
);
2065 failure_info
.erase(p
++);
2067 while (!ls
.empty()) {
2068 MonOpRequestRef o
= ls
.front();
2070 o
->mark_event(__func__
);
2071 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
2072 send_latest(o
, m
->get_epoch());
2080 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
2082 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
2084 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2085 p
!= failure_info
.end();
2087 p
->second
.take_report_messages(ls
);
2089 failure_info
.clear();
2095 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
2097 op
->mark_osdmon_event(__func__
);
2098 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2099 int from
= m
->get_orig_source_inst().name
.num();
2101 // check permissions, ignore if failed (no response expected)
2102 MonSession
*session
= m
->get_session();
2105 if (!session
->is_capable("osd", MON_CAP_X
)) {
2106 dout(0) << "got preprocess_boot message from entity with insufficient caps"
2107 << session
->caps
<< dendl
;
2111 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
2112 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
2113 << " != " << mon
->monmap
->fsid
<< dendl
;
2117 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
2118 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
2122 assert(m
->get_orig_source_inst().name
.is_osd());
2124 // check if osd has required features to boot
2125 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2126 CEPH_FEATURE_OSD_ERASURE_CODES
) &&
2127 !(m
->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES
)) {
2128 dout(0) << __func__
<< " osdmap requires erasure code but osd at "
2129 << m
->get_orig_source_inst()
2130 << " doesn't announce support -- ignore" << dendl
;
2134 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2135 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
) &&
2136 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
)) {
2137 dout(0) << __func__
<< " osdmap requires erasure code plugins v2 but osd at "
2138 << m
->get_orig_source_inst()
2139 << " doesn't announce support -- ignore" << dendl
;
2143 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2144 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
) &&
2145 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
)) {
2146 dout(0) << __func__
<< " osdmap requires erasure code plugins v3 but osd at "
2147 << m
->get_orig_source_inst()
2148 << " doesn't announce support -- ignore" << dendl
;
2152 if (osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
) &&
2153 !HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
2154 mon
->clog
->info() << "disallowing boot of OSD "
2155 << m
->get_orig_source_inst()
2156 << " because the osdmap requires"
2157 << " CEPH_FEATURE_SERVER_LUMINOUS"
2158 << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
2162 if (osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL
) &&
2163 !(m
->osd_features
& CEPH_FEATURE_SERVER_JEWEL
)) {
2164 mon
->clog
->info() << "disallowing boot of OSD "
2165 << m
->get_orig_source_inst()
2166 << " because the osdmap requires"
2167 << " CEPH_FEATURE_SERVER_JEWEL"
2168 << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
2172 if (osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN
) &&
2173 !HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
2174 mon
->clog
->info() << "disallowing boot of OSD "
2175 << m
->get_orig_source_inst()
2176 << " because the osdmap requires"
2177 << " CEPH_FEATURE_SERVER_KRAKEN"
2178 << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
2182 if (osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
2183 !(m
->osd_features
& CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
2184 mon
->clog
->info() << "disallowing boot of OSD "
2185 << m
->get_orig_source_inst()
2186 << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
2190 if (any_of(osdmap
.get_pools().begin(),
2191 osdmap
.get_pools().end(),
2192 [](const std::pair
<int64_t,pg_pool_t
>& pool
)
2193 { return pool
.second
.use_gmt_hitset
; })) {
2194 assert(osdmap
.get_num_up_osds() == 0 ||
2195 osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
);
2196 if (!(m
->osd_features
& CEPH_FEATURE_OSD_HITSET_GMT
)) {
2197 dout(0) << __func__
<< " one or more pools uses GMT hitsets but osd at "
2198 << m
->get_orig_source_inst()
2199 << " doesn't announce support -- ignore" << dendl
;
2204 // make sure upgrades stop at luminous
2205 if (HAVE_FEATURE(m
->osd_features
, SERVER_M
) &&
2206 !osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
2207 mon
->clog
->info() << "disallowing boot of post-luminous OSD "
2208 << m
->get_orig_source_inst()
2209 << " because require_luminous_osds is not set";
2213 // make sure upgrades stop at jewel
2214 if (HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
) &&
2215 !osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL
)) {
2216 mon
->clog
->info() << "disallowing boot of post-jewel OSD "
2217 << m
->get_orig_source_inst()
2218 << " because require_jewel_osds is not set";
2222 // make sure upgrades stop at hammer
2223 // * HAMMER_0_94_4 is the required hammer feature
2224 // * MON_METADATA is the first post-hammer feature
2225 if (osdmap
.get_num_up_osds() > 0) {
2226 if ((m
->osd_features
& CEPH_FEATURE_MON_METADATA
) &&
2227 !(osdmap
.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4
)) {
2228 mon
->clog
->info() << "disallowing boot of post-hammer OSD "
2229 << m
->get_orig_source_inst()
2230 << " because one or more up OSDs is pre-hammer v0.94.4";
2233 if (!(m
->osd_features
& CEPH_FEATURE_HAMMER_0_94_4
) &&
2234 (osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_METADATA
)) {
2235 mon
->clog
->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
2236 << m
->get_orig_source_inst()
2237 << " because all up OSDs are post-hammer";
2243 if (osdmap
.is_up(from
) &&
2244 osdmap
.get_inst(from
) == m
->get_orig_source_inst() &&
2245 osdmap
.get_cluster_addr(from
) == m
->cluster_addr
) {
2247 dout(7) << "preprocess_boot dup from " << m
->get_orig_source_inst()
2248 << " == " << osdmap
.get_inst(from
) << dendl
;
2253 if (osdmap
.exists(from
) &&
2254 !osdmap
.get_uuid(from
).is_zero() &&
2255 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2256 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
2257 << " clashes with existing osd: different fsid"
2258 << " (ours: " << osdmap
.get_uuid(from
)
2259 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
2263 if (osdmap
.exists(from
) &&
2264 osdmap
.get_info(from
).up_from
> m
->version
&&
2265 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) {
2266 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
2267 send_latest(op
, m
->sb
.current_epoch
+1);
2272 if (!can_mark_up(from
)) {
2273 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
2274 send_latest(op
, m
->sb
.current_epoch
+1);
2278 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
2285 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
2287 op
->mark_osdmon_event(__func__
);
2288 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2289 dout(7) << __func__
<< " from " << m
->get_orig_source_inst() << " sb " << m
->sb
2290 << " cluster_addr " << m
->cluster_addr
2291 << " hb_back_addr " << m
->hb_back_addr
2292 << " hb_front_addr " << m
->hb_front_addr
2295 assert(m
->get_orig_source().is_osd());
2296 int from
= m
->get_orig_source().num();
2298 // does this osd exist?
2299 if (from
>= osdmap
.get_max_osd()) {
2300 dout(1) << "boot from osd." << from
<< " >= max_osd "
2301 << osdmap
.get_max_osd() << dendl
;
2305 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
2306 if (pending_inc
.new_state
.count(from
))
2307 oldstate
^= pending_inc
.new_state
[from
];
2309 // already up? mark down first?
2310 if (osdmap
.is_up(from
)) {
2311 dout(7) << __func__
<< " was up, first marking down "
2312 << osdmap
.get_inst(from
) << dendl
;
2313 // preprocess should have caught these; if not, assert.
2314 assert(osdmap
.get_inst(from
) != m
->get_orig_source_inst() ||
2315 osdmap
.get_cluster_addr(from
) != m
->cluster_addr
);
2316 assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
2318 if (pending_inc
.new_state
.count(from
) == 0 ||
2319 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
2320 // mark previous guy down
2321 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
2323 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2324 } else if (pending_inc
.new_up_client
.count(from
)) {
2325 // already prepared, just wait
2326 dout(7) << __func__
<< " already prepared, waiting on "
2327 << m
->get_orig_source_addr() << dendl
;
2328 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2331 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addr();
2332 if (!m
->cluster_addr
.is_blank_ip())
2333 pending_inc
.new_up_cluster
[from
] = m
->cluster_addr
;
2334 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addr
;
2335 if (!m
->hb_front_addr
.is_blank_ip())
2336 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addr
;
2338 down_pending_out
.erase(from
); // if any
2341 osd_weight
[from
] = m
->sb
.weight
;
2344 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
2346 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2347 // preprocess should have caught this; if not, assert.
2348 assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
2349 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
2353 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
2354 const osd_info_t
& i
= osdmap
.get_info(from
);
2355 if (i
.up_from
> i
.lost_at
) {
2356 dout(10) << " fresh osd; marking lost_at too" << dendl
;
2357 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
2362 bufferlist osd_metadata
;
2363 ::encode(m
->metadata
, osd_metadata
);
2364 pending_metadata
[from
] = osd_metadata
;
2366 // adjust last clean unmount epoch?
2367 const osd_info_t
& info
= osdmap
.get_info(from
);
2368 dout(10) << " old osd_info: " << info
<< dendl
;
2369 if (m
->sb
.mounted
> info
.last_clean_begin
||
2370 (m
->sb
.mounted
== info
.last_clean_begin
&&
2371 m
->sb
.clean_thru
> info
.last_clean_end
)) {
2372 epoch_t begin
= m
->sb
.mounted
;
2373 epoch_t end
= m
->sb
.clean_thru
;
2375 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
2376 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
2377 << ") -> [" << begin
<< "-" << end
<< ")"
2379 pending_inc
.new_last_clean_interval
[from
] =
2380 pair
<epoch_t
,epoch_t
>(begin
, end
);
2383 osd_xinfo_t xi
= osdmap
.get_xinfo(from
);
2384 if (m
->boot_epoch
== 0) {
2385 xi
.laggy_probability
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2386 xi
.laggy_interval
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2387 dout(10) << " not laggy, new xi " << xi
<< dendl
;
2389 if (xi
.down_stamp
.sec()) {
2390 int interval
= ceph_clock_now().sec() -
2391 xi
.down_stamp
.sec();
2392 if (g_conf
->mon_osd_laggy_max_interval
&&
2393 (interval
> g_conf
->mon_osd_laggy_max_interval
)) {
2394 interval
= g_conf
->mon_osd_laggy_max_interval
;
2397 interval
* g_conf
->mon_osd_laggy_weight
+
2398 xi
.laggy_interval
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2400 xi
.laggy_probability
=
2401 g_conf
->mon_osd_laggy_weight
+
2402 xi
.laggy_probability
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2403 dout(10) << " laggy, now xi " << xi
<< dendl
;
2406 // set features shared by the osd
2407 if (m
->osd_features
)
2408 xi
.features
= m
->osd_features
;
2410 xi
.features
= m
->get_connection()->get_features();
2413 if ((g_conf
->mon_osd_auto_mark_auto_out_in
&&
2414 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
2415 (g_conf
->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
2416 (g_conf
->mon_osd_auto_mark_in
)) {
2417 if (can_mark_in(from
)) {
2418 if (osdmap
.osd_xinfo
[from
].old_weight
> 0) {
2419 pending_inc
.new_weight
[from
] = osdmap
.osd_xinfo
[from
].old_weight
;
2422 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
2425 dout(7) << __func__
<< " NOIN set, will not mark in "
2426 << m
->get_orig_source_addr() << dendl
;
2430 pending_inc
.new_xinfo
[from
] = xi
;
2433 wait_for_finished_proposal(op
, new C_Booted(this, op
));
2438 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
2440 op
->mark_osdmon_event(__func__
);
2441 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2442 dout(7) << "_booted " << m
->get_orig_source_inst()
2443 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
2446 mon
->clog
->info() << m
->get_orig_source_inst() << " boot";
2449 send_latest(op
, m
->sb
.current_epoch
+1);
2456 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
2458 op
->mark_osdmon_event(__func__
);
2459 MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2460 int from
= m
->get_orig_source().num();
2462 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2464 // check permissions, ignore if failed
2465 MonSession
*session
= m
->get_session();
2468 if (!session
->is_capable("osd", MON_CAP_X
)) {
2469 dout(0) << "MOSDFull from entity with insufficient privileges:"
2470 << session
->caps
<< dendl
;
2474 // ignore a full message from the osd instance that already went down
2475 if (!osdmap
.exists(from
)) {
2476 dout(7) << __func__
<< " ignoring full message from nonexistent "
2477 << m
->get_orig_source_inst() << dendl
;
2480 if ((!osdmap
.is_up(from
) &&
2481 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) ||
2482 (osdmap
.is_up(from
) &&
2483 osdmap
.get_inst(from
) != m
->get_orig_source_inst())) {
2484 dout(7) << __func__
<< " ignoring full message from down "
2485 << m
->get_orig_source_inst() << dendl
;
2489 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
2491 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
2492 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
2493 << " " << m
->get_orig_source_inst() << dendl
;
2494 _reply_map(op
, m
->version
);
2498 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
2499 << " " << m
->get_orig_source_inst() << dendl
;
2506 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
2508 op
->mark_osdmon_event(__func__
);
2509 const MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2510 const int from
= m
->get_orig_source().num();
2512 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2513 const unsigned want_state
= m
->state
& mask
; // safety first
2515 unsigned cur_state
= osdmap
.get_state(from
);
2516 auto p
= pending_inc
.new_state
.find(from
);
2517 if (p
!= pending_inc
.new_state
.end()) {
2518 cur_state
^= p
->second
;
2522 set
<string
> want_state_set
, cur_state_set
;
2523 OSDMap::calc_state_set(want_state
, want_state_set
);
2524 OSDMap::calc_state_set(cur_state
, cur_state_set
);
2526 if (cur_state
!= want_state
) {
2527 if (p
!= pending_inc
.new_state
.end()) {
2530 pending_inc
.new_state
[from
] = 0;
2532 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
2533 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2534 << " -> " << want_state_set
<< dendl
;
2536 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2537 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
2540 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2547 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
2549 op
->mark_osdmon_event(__func__
);
2550 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2551 int from
= m
->get_orig_source().num();
2553 // check permissions, ignore if failed
2554 MonSession
*session
= m
->get_session();
2557 if (!session
->is_capable("osd", MON_CAP_X
)) {
2558 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2559 << session
->caps
<< dendl
;
2563 if (!osdmap
.is_up(from
) ||
2564 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2565 dout(7) << "preprocess_alive ignoring alive message from down " << m
->get_orig_source_inst() << dendl
;
2569 if (osdmap
.get_up_thru(from
) >= m
->want
) {
2571 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
2572 _reply_map(op
, m
->version
);
2576 dout(10) << "preprocess_alive want up_thru " << m
->want
2577 << " from " << m
->get_orig_source_inst() << dendl
;
2584 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
2586 op
->mark_osdmon_event(__func__
);
2587 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2588 int from
= m
->get_orig_source().num();
2590 if (0) { // we probably don't care much about these
2591 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
2594 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
2595 << " from " << m
->get_orig_source_inst() << dendl
;
2597 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
2598 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2602 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
2604 op
->mark_osdmon_event(__func__
);
2605 dout(7) << "_reply_map " << e
2606 << " from " << op
->get_req()->get_orig_source_inst()
2612 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
2614 op
->mark_osdmon_event(__func__
);
2615 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2616 dout(10) << __func__
<< " " << *m
<< dendl
;
2617 auto session
= m
->get_session();
2619 dout(10) << __func__
<< ": no monitor session!" << dendl
;
2622 if (!session
->is_capable("osd", MON_CAP_X
)) {
2623 derr
<< __func__
<< " received from entity "
2624 << "with insufficient privileges " << session
->caps
<< dendl
;
2627 // always forward the "created!" to the leader
2631 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
2633 op
->mark_osdmon_event(__func__
);
2634 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2635 dout(10) << __func__
<< " " << *m
<< dendl
;
2636 auto src
= m
->get_orig_source();
2637 auto from
= src
.num();
2638 if (!src
.is_osd() ||
2639 !mon
->osdmon()->osdmap
.is_up(from
) ||
2640 m
->get_orig_source_inst() != mon
->osdmon()->osdmap
.get_inst(from
)) {
2641 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
2644 pending_created_pgs
.push_back(m
->pgid
);
2651 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
2653 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2654 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
2655 mempool::osdmap::vector
<int> empty
;
2656 int from
= m
->get_orig_source().num();
2657 size_t ignore_cnt
= 0;
2660 MonSession
*session
= m
->get_session();
2663 if (!session
->is_capable("osd", MON_CAP_X
)) {
2664 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
2665 << session
->caps
<< dendl
;
2669 if (!osdmap
.is_up(from
) ||
2670 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2671 dout(7) << "ignoring pgtemp message from down " << m
->get_orig_source_inst() << dendl
;
2675 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2676 dout(20) << " " << p
->first
2677 << (osdmap
.pg_temp
->count(p
->first
) ? (*osdmap
.pg_temp
)[p
->first
] : empty
)
2678 << " -> " << p
->second
<< dendl
;
2680 // does the pool exist?
2681 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
2683 * 1. If the osdmap does not have the pool, it means the pool has been
2684 * removed in-between the osd sending this message and us handling it.
2685 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
2686 * not exist in the pending either, as the osds would not send a
2687 * message about a pool they know nothing about (yet).
2688 * 3. However, if the pool does exist in the pending, then it must be a
2689 * new pool, and not relevant to this message (see 1).
2691 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2692 << ": pool has been removed" << dendl
;
2697 int acting_primary
= -1;
2698 osdmap
.pg_to_up_acting_osds(
2699 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
2700 if (acting_primary
!= from
) {
2701 /* If the source isn't the primary based on the current osdmap, we know
2702 * that the interval changed and that we can discard this message.
2703 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
2704 * which of two pg temp mappings on the same pg is more recent.
2706 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2707 << ": primary has changed" << dendl
;
2713 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
2714 osdmap
.primary_temp
->count(p
->first
)))
2717 // NOTE: we assume that this will clear pg_primary, so consider
2718 // an existing pg_primary field to imply a change
2719 if (p
->second
.size() &&
2720 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
2721 !vectors_equal((*osdmap
.pg_temp
)[p
->first
], p
->second
) ||
2722 osdmap
.primary_temp
->count(p
->first
)))
2726 // should we ignore all the pgs?
2727 if (ignore_cnt
== m
->pg_temp
.size())
2730 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
2731 _reply_map(op
, m
->map_epoch
);
2738 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
2740 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
2741 auto ut
= pending_inc
.new_up_thru
.find(from
);
2742 if (ut
!= pending_inc
.new_up_thru
.end()) {
2743 old_up_thru
= ut
->second
;
2745 if (up_thru
> old_up_thru
) {
2746 // set up_thru too, so the osd doesn't have to ask again
2747 pending_inc
.new_up_thru
[from
] = up_thru
;
2751 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
2753 op
->mark_osdmon_event(__func__
);
2754 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2755 int from
= m
->get_orig_source().num();
2756 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
2757 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2758 uint64_t pool
= p
->first
.pool();
2759 if (pending_inc
.old_pools
.count(pool
)) {
2760 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2761 << ": pool pending removal" << dendl
;
2764 if (!osdmap
.have_pg_pool(pool
)) {
2765 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2766 << ": pool has been removed" << dendl
;
2769 pending_inc
.new_pg_temp
[p
->first
] =
2770 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
2772 // unconditionally clear pg_primary (until this message can encode
2773 // a change for that, too.. at which point we need to also fix
2774 // preprocess_pg_temp)
2775 if (osdmap
.primary_temp
->count(p
->first
) ||
2776 pending_inc
.new_primary_temp
.count(p
->first
))
2777 pending_inc
.new_primary_temp
[p
->first
] = -1;
2780 // set up_thru too, so the osd doesn't have to ask again
2781 update_up_thru(from
, m
->map_epoch
);
2783 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
2790 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
2792 op
->mark_osdmon_event(__func__
);
2793 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
2794 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
2796 // check privilege, ignore if failed
2797 MonSession
*session
= m
->get_session();
2800 if (!session
->caps
.is_capable(
2802 CEPH_ENTITY_TYPE_MON
,
2803 session
->entity_name
,
2804 "osd", "osd pool rmsnap", {}, true, true, false)) {
2805 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
2806 << session
->caps
<< dendl
;
2810 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
2811 q
!= m
->snaps
.end();
2813 if (!osdmap
.have_pg_pool(q
->first
)) {
2814 dout(10) << " ignoring removed_snaps " << q
->second
<< " on non-existent pool " << q
->first
<< dendl
;
2817 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
2818 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
2819 p
!= q
->second
.end();
2821 if (*p
> pi
->get_snap_seq() ||
2822 !pi
->removed_snaps
.contains(*p
))
2831 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
2833 op
->mark_osdmon_event(__func__
);
2834 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
2835 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
2837 for (map
<int, vector
<snapid_t
> >::iterator p
= m
->snaps
.begin();
2838 p
!= m
->snaps
.end();
2841 if (!osdmap
.have_pg_pool(p
->first
)) {
2842 dout(10) << " ignoring removed_snaps " << p
->second
<< " on non-existent pool " << p
->first
<< dendl
;
2846 pg_pool_t
& pi
= osdmap
.pools
[p
->first
];
2847 for (vector
<snapid_t
>::iterator q
= p
->second
.begin();
2848 q
!= p
->second
.end();
2850 if (!pi
.removed_snaps
.contains(*q
) &&
2851 (!pending_inc
.new_pools
.count(p
->first
) ||
2852 !pending_inc
.new_pools
[p
->first
].removed_snaps
.contains(*q
))) {
2853 pg_pool_t
*newpi
= pending_inc
.get_new_pool(p
->first
, &pi
);
2854 newpi
->removed_snaps
.insert(*q
);
2855 dout(10) << " pool " << p
->first
<< " removed_snaps added " << *q
2856 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
2857 if (*q
> newpi
->get_snap_seq()) {
2858 dout(10) << " pool " << p
->first
<< " snap_seq " << newpi
->get_snap_seq() << " -> " << *q
<< dendl
;
2859 newpi
->set_snap_seq(*q
);
2861 newpi
->set_snap_epoch(pending_inc
.epoch
);
2869 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
2871 op
->mark_osdmon_event(__func__
);
2872 auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
2874 auto session
= beacon
->get_session();
2876 dout(10) << __func__
<< " no monitor session!" << dendl
;
2879 if (!session
->is_capable("osd", MON_CAP_X
)) {
2880 derr
<< __func__
<< " received from entity "
2881 << "with insufficient privileges " << session
->caps
<< dendl
;
2884 // Always forward the beacon to the leader, even if they are the same as
2885 // the old one. The leader will mark as down osds that haven't sent
2886 // beacon for a few minutes.
2890 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
2892 op
->mark_osdmon_event(__func__
);
2893 const auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
2894 const auto src
= beacon
->get_orig_source();
2895 dout(10) << __func__
<< " " << *beacon
2896 << " from " << src
<< dendl
;
2897 int from
= src
.num();
2899 if (!src
.is_osd() ||
2900 !osdmap
.is_up(from
) ||
2901 beacon
->get_orig_source_inst() != osdmap
.get_inst(from
)) {
2902 dout(1) << " ignoring beacon from non-active osd." << dendl
;
2906 last_osd_report
[from
] = ceph_clock_now();
2907 osd_epochs
[from
] = beacon
->version
;
2909 for (const auto& pg
: beacon
->pgs
) {
2910 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
2918 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
2920 op
->mark_osdmon_event(__func__
);
2921 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
2922 << " start " << start
<< dendl
;
2926 send_incremental(op
, start
);
2930 MOSDMap
*OSDMonitor::build_latest_full()
2932 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
);
2933 get_version_full(osdmap
.get_epoch(), r
->maps
[osdmap
.get_epoch()]);
2934 r
->oldest_map
= get_first_committed();
2935 r
->newest_map
= osdmap
.get_epoch();
2939 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
)
2941 dout(10) << "build_incremental [" << from
<< ".." << to
<< "]" << dendl
;
2942 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
);
2943 m
->oldest_map
= get_first_committed();
2944 m
->newest_map
= osdmap
.get_epoch();
2946 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
2948 int err
= get_version(e
, bl
);
2950 assert(bl
.length());
2951 // if (get_version(e, bl) > 0) {
2952 dout(20) << "build_incremental inc " << e
<< " "
2953 << bl
.length() << " bytes" << dendl
;
2954 m
->incremental_maps
[e
] = bl
;
2956 assert(err
== -ENOENT
);
2957 assert(!bl
.length());
2958 get_version_full(e
, bl
);
2959 if (bl
.length() > 0) {
2960 //else if (get_version("full", e, bl) > 0) {
2961 dout(20) << "build_incremental full " << e
<< " "
2962 << bl
.length() << " bytes" << dendl
;
2965 ceph_abort(); // we should have all maps.
2972 void OSDMonitor::send_full(MonOpRequestRef op
)
2974 op
->mark_osdmon_event(__func__
);
2975 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
2976 mon
->send_reply(op
, build_latest_full());
2979 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
2981 op
->mark_osdmon_event(__func__
);
2983 MonSession
*s
= op
->get_session();
2987 s
->proxy_con
->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP
)) {
2988 // oh, we can tell the other mon to do it
2989 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
2991 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
2992 r
->send_osdmap_first
= first
;
2993 s
->proxy_con
->send_message(r
);
2994 op
->mark_event("reply: send routed send_osdmap_first reply");
2997 send_incremental(first
, s
, false, op
);
3001 void OSDMonitor::send_incremental(epoch_t first
,
3002 MonSession
*session
,
3004 MonOpRequestRef req
)
3006 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
3007 << " to " << session
->inst
<< dendl
;
3009 if (first
<= session
->osd_epoch
) {
3010 dout(10) << __func__
<< session
->inst
<< " should already have epoch "
3011 << session
->osd_epoch
<< dendl
;
3012 first
= session
->osd_epoch
+ 1;
3015 if (first
< get_first_committed()) {
3016 first
= get_first_committed();
3018 int err
= get_version_full(first
, bl
);
3020 assert(bl
.length());
3022 dout(20) << "send_incremental starting with base full "
3023 << first
<< " " << bl
.length() << " bytes" << dendl
;
3025 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid());
3026 m
->oldest_map
= get_first_committed();
3027 m
->newest_map
= osdmap
.get_epoch();
3028 m
->maps
[first
] = bl
;
3031 mon
->send_reply(req
, m
);
3032 session
->osd_epoch
= first
;
3035 session
->con
->send_message(m
);
3036 session
->osd_epoch
= first
;
3041 while (first
<= osdmap
.get_epoch()) {
3042 epoch_t last
= MIN(first
+ g_conf
->osd_map_message_max
- 1,
3043 osdmap
.get_epoch());
3044 MOSDMap
*m
= build_incremental(first
, last
);
3047 // send some maps. it may not be all of them, but it will get them
3049 mon
->send_reply(req
, m
);
3051 session
->con
->send_message(m
);
3054 session
->osd_epoch
= last
;
3060 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
3062 if (inc_osd_cache
.lookup(ver
, &bl
)) {
3065 int ret
= PaxosService::get_version(ver
, bl
);
3067 inc_osd_cache
.add(ver
, bl
);
3072 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
3074 if (full_osd_cache
.lookup(ver
, &bl
)) {
3077 int ret
= PaxosService::get_version_full(ver
, bl
);
3079 full_osd_cache
.add(ver
, bl
);
3084 epoch_t
OSDMonitor::blacklist(const entity_addr_t
& a
, utime_t until
)
3086 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
3087 pending_inc
.new_blacklist
[a
] = until
;
3088 return pending_inc
.epoch
;
3092 void OSDMonitor::check_osdmap_subs()
3094 dout(10) << __func__
<< dendl
;
3095 if (!osdmap
.get_epoch()) {
3098 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
3099 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
3102 auto p
= osdmap_subs
->second
->begin();
3106 check_osdmap_sub(sub
);
3110 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
3112 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
3113 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
3114 if (sub
->next
<= osdmap
.get_epoch()) {
3116 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
3118 sub
->session
->con
->send_message(build_latest_full());
3120 mon
->session_map
.remove_sub(sub
);
3122 sub
->next
= osdmap
.get_epoch() + 1;
3126 void OSDMonitor::check_pg_creates_subs()
3128 if (!mon
->monmap
->get_required_features().contains_all(
3129 ceph::features::mon::FEATURE_LUMINOUS
)) {
3130 // PGMonitor takes care of this in pre-luminous era.
3133 if (!osdmap
.get_num_up_osds()) {
3136 assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
3137 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
3138 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
3139 if (pg_creates_subs
== session_map
.subs
.end()) {
3142 for (auto sub
: *pg_creates_subs
->second
) {
3143 check_pg_creates_sub(sub
);
3148 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
3150 dout(20) << __func__
<< " .. " << sub
->session
->inst
<< dendl
;
3151 assert(sub
->type
== "osd_pg_creates");
3152 // only send these if the OSD is up. we will check_subs() when they do
3153 // come up so they will get the creates then.
3154 if (sub
->session
->inst
.name
.is_osd() &&
3155 mon
->osdmon()->osdmap
.is_up(sub
->session
->inst
.name
.num())) {
3156 sub
->next
= send_pg_creates(sub
->session
->inst
.name
.num(),
3157 sub
->session
->con
.get(),
3162 void OSDMonitor::scan_for_creating_pgs(
3163 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
3164 const mempool::osdmap::set
<int64_t>& removed_pools
,
3166 creating_pgs_t
* creating_pgs
) const
3168 for (auto& p
: pools
) {
3169 int64_t poolid
= p
.first
;
3170 const pg_pool_t
& pool
= p
.second
;
3171 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_ruleset(),
3172 pool
.get_type(), pool
.get_size());
3173 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
3176 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
3177 const auto created
= pool
.get_last_change();
3178 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
3179 dout(10) << __func__
<< " no change in pool " << poolid
3180 << " " << pool
<< dendl
;
3183 if (removed_pools
.count(poolid
)) {
3184 dout(10) << __func__
<< " pool is being removed: " << poolid
3185 << " " << pool
<< dendl
;
3188 dout(10) << __func__
<< " scanning pool " << poolid
3189 << " " << pool
<< dendl
;
3190 if (creating_pgs
->created_pools
.count(poolid
)) {
3191 // split pgs are skipped by OSD, so drop it early.
3194 // first pgs in this pool
3195 for (ps_t ps
= 0; ps
< pool
.get_pg_num(); ps
++) {
3196 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
3197 if (creating_pgs
->pgs
.count(pgid
)) {
3198 dout(20) << __func__
<< " already have " << pgid
<< dendl
;
3201 creating_pgs
->pgs
.emplace(pgid
, make_pair(created
, modified
));
3202 dout(10) << __func__
<< " adding " << pgid
3203 << " at " << osdmap
.get_epoch() << dendl
;
3208 void OSDMonitor::update_creating_pgs()
3210 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
3211 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3212 for (auto& pg
: creating_pgs
.pgs
) {
3213 int acting_primary
= -1;
3214 auto pgid
= pg
.first
;
3215 auto mapped
= pg
.second
.first
;
3216 mapping
.get(pgid
, nullptr, nullptr, nullptr, &acting_primary
);
3217 // check the previous creating_pgs, look for the target to whom the pg was
3218 // previously mapped
3219 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
3220 const auto last_acting_primary
= pgs_by_epoch
.first
;
3221 for (auto& pgs
: pgs_by_epoch
.second
) {
3222 if (pgs
.second
.count(pgid
)) {
3223 if (last_acting_primary
== acting_primary
) {
3226 dout(20) << __func__
<< " " << pgid
<< " "
3227 << " acting_primary:" << last_acting_primary
3228 << " -> " << acting_primary
<< dendl
;
3229 // note epoch if the target of the create message changed.
3230 mapped
= mapping
.get_epoch();
3236 dout(10) << __func__
<< " will instruct osd." << acting_primary
3237 << " to create " << pgid
<< dendl
;
3238 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(pgid
);
3240 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
3241 creating_pgs_epoch
= mapping
.get_epoch();
3244 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
)
3246 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
3247 << " " << creating_pgs_by_osd_epoch
<< dendl
;
3248 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3249 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
3250 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
3252 assert(!creating_pgs_by_epoch
->second
.empty());
3254 MOSDPGCreate
*m
= nullptr;
3256 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
3257 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
3258 auto epoch
= epoch_pgs
->first
;
3259 auto& pgs
= epoch_pgs
->second
;
3260 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3261 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
3263 for (auto& pg
: pgs
) {
3265 m
= new MOSDPGCreate(creating_pgs_epoch
);
3266 // Need the create time from the monitor using its clock to set
3267 // last_scrub_stamp upon pg creation.
3268 const auto& creation
= creating_pgs
.pgs
[pg
];
3269 m
->mkpg
.emplace(pg
, pg_create_t
{creation
.first
, pg
, 0});
3270 m
->ctimes
.emplace(pg
, creation
.second
);
3271 dout(20) << __func__
<< " will create " << pg
3272 << " at " << creation
.first
<< dendl
;
3276 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3277 << " has nothing to send" << dendl
;
3280 con
->send_message(m
);
3281 // sub is current through last + 1
3288 void OSDMonitor::tick()
3290 if (!is_active()) return;
3292 dout(10) << osdmap
<< dendl
;
3294 if (!mon
->is_leader()) return;
3296 bool do_propose
= false;
3297 utime_t now
= ceph_clock_now();
3299 if (osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
) &&
3300 mon
->monmap
->get_required_features().contains_all(
3301 ceph::features::mon::FEATURE_LUMINOUS
)) {
3302 if (handle_osd_timeouts(now
, last_osd_report
)) {
3308 if (check_failures(now
))
3311 // mark down osds out?
3313 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
3314 * influence at all. The decision is made based on the ratio of "in" osds,
3315 * and the function returns false if this ratio is lower that the minimum
3316 * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
3318 if (can_mark_out(-1)) {
3319 set
<int> down_cache
; // quick cache of down subtrees
3321 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
3322 while (i
!= down_pending_out
.end()) {
3328 if (osdmap
.is_down(o
) &&
3331 utime_t
orig_grace(g_conf
->mon_osd_down_out_interval
, 0);
3332 utime_t grace
= orig_grace
;
3333 double my_grace
= 0.0;
3335 if (g_conf
->mon_osd_adjust_down_out_interval
) {
3336 // scale grace period the same way we do the heartbeat grace.
3337 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
3338 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
3339 double decay_k
= ::log(.5) / halflife
;
3340 double decay
= exp((double)down
* decay_k
);
3341 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
3342 << " down for " << down
<< " decay " << decay
<< dendl
;
3343 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3347 // is this an entire large subtree down?
3348 if (g_conf
->mon_osd_down_out_subtree_limit
.length()) {
3349 int type
= osdmap
.crush
->get_type_id(g_conf
->mon_osd_down_out_subtree_limit
);
3351 if (osdmap
.containing_subtree_is_down(g_ceph_context
, o
, type
, &down_cache
)) {
3352 dout(10) << "tick entire containing " << g_conf
->mon_osd_down_out_subtree_limit
3353 << " subtree for osd." << o
<< " is down; resetting timer" << dendl
;
3354 // reset timer, too.
3355 down_pending_out
[o
] = now
;
3361 if (g_conf
->mon_osd_down_out_interval
> 0 &&
3362 down
.sec() >= grace
) {
3363 dout(10) << "tick marking osd." << o
<< " OUT after " << down
3364 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
3365 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
3367 // set the AUTOOUT bit.
3368 if (pending_inc
.new_state
.count(o
) == 0)
3369 pending_inc
.new_state
[o
] = 0;
3370 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
3372 // remember previous weight
3373 if (pending_inc
.new_xinfo
.count(o
) == 0)
3374 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
3375 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
3379 mon
->clog
->info() << "osd." << o
<< " out (down for " << down
<< ")";
3384 down_pending_out
.erase(o
);
3387 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
3390 // expire blacklisted items?
3391 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
3392 p
!= osdmap
.blacklist
.end();
3394 if (p
->second
< now
) {
3395 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
3396 pending_inc
.old_blacklist
.push_back(p
->first
);
3401 // if map full setting has changed, get that info out there!
3402 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
) &&
3403 mon
->pgmon()->is_readable()) {
3404 // for pre-luminous compat only!
3405 if (!mon
->pgmon()->pg_map
.full_osds
.empty()) {
3406 dout(5) << "There are full osds, setting full flag" << dendl
;
3407 add_flag(CEPH_OSDMAP_FULL
);
3408 } else if (osdmap
.test_flag(CEPH_OSDMAP_FULL
)){
3409 dout(10) << "No full osds, removing full flag" << dendl
;
3410 remove_flag(CEPH_OSDMAP_FULL
);
3413 if (!mon
->pgmon()->pg_map
.nearfull_osds
.empty()) {
3414 dout(5) << "There are near full osds, setting nearfull flag" << dendl
;
3415 add_flag(CEPH_OSDMAP_NEARFULL
);
3416 } else if (osdmap
.test_flag(CEPH_OSDMAP_NEARFULL
)){
3417 dout(10) << "No near full osds, removing nearfull flag" << dendl
;
3418 remove_flag(CEPH_OSDMAP_NEARFULL
);
3420 if (pending_inc
.new_flags
!= -1 &&
3421 (pending_inc
.new_flags
^ osdmap
.flags
) & (CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
3422 dout(1) << "New setting for" <<
3423 (pending_inc
.new_flags
& CEPH_OSDMAP_FULL
? " CEPH_OSDMAP_FULL" : "") <<
3424 (pending_inc
.new_flags
& CEPH_OSDMAP_NEARFULL
? " CEPH_OSDMAP_NEARFULL" : "")
3425 << " -- doing propose" << dendl
;
3430 if (update_pools_status())
3434 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
3438 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
3439 std::map
<int,utime_t
> &last_osd_report
)
3441 utime_t
timeo(g_conf
->mon_osd_report_timeout
, 0);
3442 if (now
- mon
->get_leader_since() < timeo
) {
3443 // We haven't been the leader for long enough to consider OSD timeouts
3447 int max_osd
= osdmap
.get_max_osd();
3448 bool new_down
= false;
3450 for (int i
=0; i
< max_osd
; ++i
) {
3451 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
3452 if (!osdmap
.is_up(i
))
3454 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
3455 if (t
== last_osd_report
.end()) {
3456 // it wasn't in the map; start the timer.
3457 last_osd_report
[i
] = now
;
3458 } else if (can_mark_down(i
)) {
3459 utime_t diff
= now
- t
->second
;
3461 mon
->clog
->info() << "osd." << i
<< " marked down after no pg stats for " << diff
<< "seconds";
3462 derr
<< "no osd or pg stats from osd." << i
<< " since " << t
->second
<< ", " << diff
3463 << " seconds ago. marking down" << dendl
;
3464 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
3472 void OSDMonitor::get_health(list
<pair
<health_status_t
,string
> >& summary
,
3473 list
<pair
<health_status_t
,string
> > *detail
,
3474 CephContext
*cct
) const
3476 int num_osds
= osdmap
.get_num_osds();
3478 if (num_osds
== 0) {
3479 summary
.push_back(make_pair(HEALTH_ERR
, "no osds"));
3481 int num_in_osds
= 0;
3482 int num_down_in_osds
= 0;
3484 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3485 if (!osdmap
.exists(i
)) {
3486 if (osdmap
.crush
->item_exists(i
)) {
3491 if (osdmap
.is_out(i
))
3494 if (!osdmap
.is_up(i
)) {
3497 const osd_info_t
& info
= osdmap
.get_info(i
);
3499 ss
<< "osd." << i
<< " is down since epoch " << info
.down_at
3500 << ", last address " << osdmap
.get_addr(i
);
3501 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3505 assert(num_down_in_osds
<= num_in_osds
);
3506 if (num_down_in_osds
> 0) {
3508 ss
<< num_down_in_osds
<< "/" << num_in_osds
<< " in osds are down";
3509 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3512 if (!osds
.empty()) {
3514 ss
<< "osds were removed from osdmap, but still kept in crushmap";
3515 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3517 ss
<< " osds: [" << osds
<< "]";
3518 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3522 if (osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
3523 // An osd could configure failsafe ratio, to something different
3524 // but for now assume it is the same here.
3525 float fsr
= g_conf
->osd_failsafe_full_ratio
;
3526 if (fsr
> 1.0) fsr
/= 100;
3527 float fr
= osdmap
.get_full_ratio();
3528 float br
= osdmap
.get_backfillfull_ratio();
3529 float nr
= osdmap
.get_nearfull_ratio();
3531 bool out_of_order
= false;
3532 // These checks correspond to how OSDService::check_full_status() in an OSD
3533 // handles the improper setting of these values.
3535 out_of_order
= true;
3538 ss
<< "backfillfull_ratio (" << br
<< ") < nearfull_ratio (" << nr
<< "), increased";
3539 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3544 out_of_order
= true;
3547 ss
<< "full_ratio (" << fr
<< ") < backfillfull_ratio (" << br
<< "), increased";
3548 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3553 out_of_order
= true;
3556 ss
<< "osd_failsafe_full_ratio (" << fsr
<< ") < full_ratio (" << fr
<< "), increased";
3557 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3562 ss
<< "Full ratio(s) out of order";
3563 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
3566 map
<int, float> full
, backfillfull
, nearfull
;
3567 osdmap
.get_full_osd_util(mon
->pgmon()->pg_map
.osd_stat
, &full
, &backfillfull
, &nearfull
);
3570 ss
<< full
.size() << " full osd(s)";
3571 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
3573 if (backfillfull
.size()) {
3575 ss
<< backfillfull
.size() << " backfillfull osd(s)";
3576 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3578 if (nearfull
.size()) {
3580 ss
<< nearfull
.size() << " nearfull osd(s)";
3581 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3584 for (auto& i
: full
) {
3586 ss
<< "osd." << i
.first
<< " is full at " << roundf(i
.second
* 100) << "%";
3587 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3589 for (auto& i
: backfillfull
) {
3591 ss
<< "osd." << i
.first
<< " is backfill full at " << roundf(i
.second
* 100) << "%";
3592 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3594 for (auto& i
: nearfull
) {
3596 ss
<< "osd." << i
.first
<< " is near full at " << roundf(i
.second
* 100) << "%";
3597 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3601 // note: we leave it to ceph-mgr to generate details health warnings
3602 // with actual osd utilizations
3605 uint64_t warn_flags
=
3607 CEPH_OSDMAP_PAUSERD
|
3608 CEPH_OSDMAP_PAUSEWR
|
3609 CEPH_OSDMAP_PAUSEREC
|
3611 CEPH_OSDMAP_NODOWN
|
3614 CEPH_OSDMAP_NOBACKFILL
|
3615 CEPH_OSDMAP_NORECOVER
|
3616 CEPH_OSDMAP_NOSCRUB
|
3617 CEPH_OSDMAP_NODEEP_SCRUB
|
3618 CEPH_OSDMAP_NOTIERAGENT
|
3619 CEPH_OSDMAP_NOREBALANCE
;
3620 if (osdmap
.test_flag(warn_flags
)) {
3622 ss
<< osdmap
.get_flag_string(osdmap
.get_flags() & warn_flags
)
3624 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3626 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3629 // old crush tunables?
3630 if (g_conf
->mon_warn_on_legacy_crush_tunables
) {
3631 string min
= osdmap
.crush
->get_min_required_version();
3632 if (min
< g_conf
->mon_crush_min_required_version
) {
3634 ss
<< "crush map has legacy tunables (require " << min
3635 << ", min is " << g_conf
->mon_crush_min_required_version
<< ")";
3636 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3638 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3639 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3643 if (g_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
3644 if (osdmap
.crush
->get_straw_calc_version() == 0) {
3646 ss
<< "crush map has straw_calc_version=0";
3647 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3649 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3650 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3655 // hit_set-less cache_mode?
3656 if (g_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
3657 int problem_cache_pools
= 0;
3658 for (map
<int64_t, pg_pool_t
>::const_iterator p
= osdmap
.pools
.begin();
3659 p
!= osdmap
.pools
.end();
3661 const pg_pool_t
& info
= p
->second
;
3662 if (info
.cache_mode_requires_hit_set() &&
3663 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
3664 ++problem_cache_pools
;
3667 ss
<< "pool '" << osdmap
.get_pool_name(p
->first
)
3668 << "' with cache_mode " << info
.get_cache_mode_name()
3669 << " needs hit_set_type to be set but it is not";
3670 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3674 if (problem_cache_pools
) {
3676 ss
<< problem_cache_pools
<< " cache pools are missing hit_sets";
3677 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3681 // Not using 'sortbitwise' and should be?
3682 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
3683 (osdmap
.get_up_osd_features() &
3684 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
3686 ss
<< "no legacy OSD present but 'sortbitwise' flag is not set";
3687 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3690 // Warn if 'mon_osd_down_out_interval' is set to zero.
3691 // Having this option set to zero on the leader acts much like the
3692 // 'noout' flag. It's hard to figure out what's going wrong with clusters
3693 // without the 'noout' flag set but acting like that just the same, so
3694 // we report a HEALTH_WARN in case this option is set to zero.
3695 // This is an ugly hack to get the warning out, but until we find a way
3696 // to spread global options throughout the mon cluster and have all mons
3697 // using a base set of the same options, we need to work around this sort
3699 // There's also the obvious drawback that if this is set on a single
3700 // monitor on a 3-monitor cluster, this warning will only be shown every
3701 // third monitor connection.
3702 if (g_conf
->mon_warn_on_osd_down_out_interval_zero
&&
3703 g_conf
->mon_osd_down_out_interval
== 0) {
3705 ss
<< "mon." << mon
->name
<< " has mon_osd_down_out_interval set to 0";
3706 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3708 ss
<< "; this has the same effect as the 'noout' flag";
3709 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3713 // warn about upgrade flags that can be set but are not.
3714 if (g_conf
->mon_debug_no_require_luminous
) {
3715 // ignore these checks
3716 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
) &&
3717 !osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
3718 string msg
= "all OSDs are running luminous or later but the"
3719 " 'require_luminous_osds' osdmap flag is not set";
3720 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3722 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3724 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
) &&
3725 !osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
3726 string msg
= "all OSDs are running kraken or later but the"
3727 " 'require_kraken_osds' osdmap flag is not set";
3728 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3730 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3732 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
) &&
3733 !osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL
)) {
3734 string msg
= "all OSDs are running jewel or later but the"
3735 " 'require_jewel_osds' osdmap flag is not set";
3736 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3738 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3742 get_pools_health(summary
, detail
);
3746 void OSDMonitor::dump_info(Formatter
*f
)
3748 f
->open_object_section("osdmap");
3752 f
->open_array_section("osd_metadata");
3753 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
3754 if (osdmap
.exists(i
)) {
3755 f
->open_object_section("osd");
3756 f
->dump_unsigned("id", i
);
3757 dump_osd_metadata(i
, f
, NULL
);
3763 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
3764 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
3766 f
->open_object_section("crushmap");
3767 osdmap
.crush
->dump(f
);
3772 enum osd_pool_get_choices
{
3773 SIZE
, MIN_SIZE
, CRASH_REPLAY_INTERVAL
,
3774 PG_NUM
, PGP_NUM
, CRUSH_RULE
, CRUSH_RULESET
, HASHPSPOOL
,
3775 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
3776 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
3777 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
3778 USE_GMT_HITSET
, AUID
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
3779 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
3780 CACHE_TARGET_FULL_RATIO
,
3781 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
3782 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
3783 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
3784 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
3785 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
3786 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
3787 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
3788 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
3789 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
};
3791 std::set
<osd_pool_get_choices
>
3792 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
3793 const std::set
<osd_pool_get_choices
>& second
)
3795 std::set
<osd_pool_get_choices
> result
;
3796 std::set_difference(first
.begin(), first
.end(),
3797 second
.begin(), second
.end(),
3798 std::inserter(result
, result
.end()));
3804 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
3806 op
->mark_osdmon_event(__func__
);
3807 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
3810 stringstream ss
, ds
;
3812 map
<string
, cmd_vartype
> cmdmap
;
3813 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
3814 string rs
= ss
.str();
3815 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
3819 MonSession
*session
= m
->get_session();
3821 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
3826 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
3829 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
3830 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
3832 if (prefix
== "osd stat") {
3833 osdmap
.print_summary(f
.get(), ds
);
3839 else if (prefix
== "osd perf" ||
3840 prefix
== "osd blocked-by") {
3841 const PGMap
&pgm
= mon
->pgmon()->pg_map
;
3842 r
= process_pg_map_command(prefix
, cmdmap
, pgm
, osdmap
,
3843 f
.get(), &ss
, &rdata
);
3845 else if (prefix
== "osd dump" ||
3846 prefix
== "osd tree" ||
3847 prefix
== "osd ls" ||
3848 prefix
== "osd getmap" ||
3849 prefix
== "osd getcrushmap") {
3854 cmd_getval(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
3857 bufferlist osdmap_bl
;
3858 int err
= get_version_full(epoch
, osdmap_bl
);
3859 if (err
== -ENOENT
) {
3861 ss
<< "there is no map for epoch " << epoch
;
3865 assert(osdmap_bl
.length());
3868 if (epoch
== osdmap
.get_epoch()) {
3872 p
->decode(osdmap_bl
);
3875 if (prefix
== "osd dump") {
3878 f
->open_object_section("osdmap");
3888 } else if (prefix
== "osd ls") {
3890 f
->open_array_section("osds");
3891 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3892 if (osdmap
.exists(i
)) {
3893 f
->dump_int("osd", i
);
3900 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3901 if (osdmap
.exists(i
)) {
3910 } else if (prefix
== "osd tree") {
3912 f
->open_object_section("tree");
3913 p
->print_tree(f
.get(), NULL
);
3917 p
->print_tree(NULL
, &ds
);
3920 } else if (prefix
== "osd getmap") {
3921 rdata
.append(osdmap_bl
);
3922 ss
<< "got osdmap epoch " << p
->get_epoch();
3923 } else if (prefix
== "osd getcrushmap") {
3924 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
3925 ss
<< "got crush map from osdmap epoch " << p
->get_epoch();
3929 } else if (prefix
== "osd df") {
3931 cmd_getval(g_ceph_context
, cmdmap
, "output_method", method
);
3932 print_utilization(ds
, f
? f
.get() : NULL
, method
== "tree");
3934 } else if (prefix
== "osd getmaxosd") {
3936 f
->open_object_section("getmaxosd");
3937 f
->dump_unsigned("epoch", osdmap
.get_epoch());
3938 f
->dump_int("max_osd", osdmap
.get_max_osd());
3942 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
3945 } else if (prefix
== "osd utilization") {
3947 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
3954 } else if (prefix
== "osd find") {
3956 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
3957 ss
<< "unable to parse osd id value '"
3958 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
3962 if (!osdmap
.exists(osd
)) {
3963 ss
<< "osd." << osd
<< " does not exist";
3968 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
3969 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
3970 f
->open_object_section("osd_location");
3971 f
->dump_int("osd", osd
);
3972 f
->dump_stream("ip") << osdmap
.get_addr(osd
);
3973 f
->open_object_section("crush_location");
3974 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
3975 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
3976 f
->dump_string(p
->first
.c_str(), p
->second
);
3980 } else if (prefix
== "osd metadata") {
3982 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
3983 !cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
3984 ss
<< "unable to parse osd id value '"
3985 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
3989 if (osd
>= 0 && !osdmap
.exists(osd
)) {
3990 ss
<< "osd." << osd
<< " does not exist";
3995 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
3996 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
3998 f
->open_object_section("osd_metadata");
3999 f
->dump_unsigned("id", osd
);
4000 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
4006 f
->open_array_section("osd_metadata");
4007 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4008 if (osdmap
.exists(i
)) {
4009 f
->open_object_section("osd");
4010 f
->dump_unsigned("id", i
);
4011 r
= dump_osd_metadata(i
, f
.get(), NULL
);
4012 if (r
== -EINVAL
|| r
== -ENOENT
) {
4013 // Drop error, continue to get other daemons' metadata
4014 dout(4) << "No metadata for osd." << i
<< dendl
;
4026 } else if (prefix
== "osd map") {
4027 string poolstr
, objstr
, namespacestr
;
4028 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4029 cmd_getval(g_ceph_context
, cmdmap
, "object", objstr
);
4030 cmd_getval(g_ceph_context
, cmdmap
, "nspace", namespacestr
);
4032 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4034 ss
<< "pool " << poolstr
<< " does not exist";
4038 object_locator_t
oloc(pool
, namespacestr
);
4039 object_t
oid(objstr
);
4040 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
4041 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4042 vector
<int> up
, acting
;
4044 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
4047 if (!namespacestr
.empty())
4048 fullobjname
= namespacestr
+ string("/") + oid
.name
;
4050 fullobjname
= oid
.name
;
4052 f
->open_object_section("osd_map");
4053 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4054 f
->dump_string("pool", poolstr
);
4055 f
->dump_int("pool_id", pool
);
4056 f
->dump_stream("objname") << fullobjname
;
4057 f
->dump_stream("raw_pgid") << pgid
;
4058 f
->dump_stream("pgid") << mpgid
;
4059 f
->open_array_section("up");
4060 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
4061 f
->dump_int("osd", *p
);
4063 f
->dump_int("up_primary", up_p
);
4064 f
->open_array_section("acting");
4065 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
4066 f
->dump_int("osd", *p
);
4068 f
->dump_int("acting_primary", acting_p
);
4069 f
->close_section(); // osd_map
4072 ds
<< "osdmap e" << osdmap
.get_epoch()
4073 << " pool '" << poolstr
<< "' (" << pool
<< ")"
4074 << " object '" << fullobjname
<< "' ->"
4075 << " pg " << pgid
<< " (" << mpgid
<< ")"
4076 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
4077 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
4081 } else if (prefix
== "pg map") {
4084 cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
4085 if (!pgid
.parse(pgidstr
.c_str())) {
4086 ss
<< "invalid pgid '" << pgidstr
<< "'";
4090 vector
<int> up
, acting
;
4091 if (!osdmap
.have_pg_pool(pgid
.pool())) {
4092 ss
<< "pg '" << pgidstr
<< "' does not exist";
4096 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4097 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
4099 f
->open_object_section("pg_map");
4100 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4101 f
->dump_stream("raw_pgid") << pgid
;
4102 f
->dump_stream("pgid") << mpgid
;
4103 f
->open_array_section("up");
4104 for (auto osd
: up
) {
4105 f
->dump_int("up_osd", osd
);
4108 f
->open_array_section("acting");
4109 for (auto osd
: acting
) {
4110 f
->dump_int("acting_osd", osd
);
4116 ds
<< "osdmap e" << osdmap
.get_epoch()
4117 << " pg " << pgid
<< " (" << mpgid
<< ")"
4118 << " -> up " << up
<< " acting " << acting
;
4123 } else if ((prefix
== "osd scrub" ||
4124 prefix
== "osd deep-scrub" ||
4125 prefix
== "osd repair")) {
4127 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
4128 vector
<string
> pvec
;
4129 get_str_vec(prefix
, pvec
);
4131 if (whostr
== "*") {
4134 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++)
4135 if (osdmap
.is_up(i
)) {
4136 ss
<< (c
++ ? "," : "") << i
;
4137 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4138 pvec
.back() == "repair",
4139 pvec
.back() == "deep-scrub"),
4140 osdmap
.get_inst(i
));
4143 ss
<< " instructed to " << pvec
.back();
4145 long osd
= parse_osd_id(whostr
.c_str(), &ss
);
4148 } else if (osdmap
.is_up(osd
)) {
4149 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4150 pvec
.back() == "repair",
4151 pvec
.back() == "deep-scrub"),
4152 osdmap
.get_inst(osd
));
4153 ss
<< "osd." << osd
<< " instructed to " << pvec
.back();
4155 ss
<< "osd." << osd
<< " is not up";
4159 } else if (prefix
== "osd lspools") {
4161 cmd_getval(g_ceph_context
, cmdmap
, "auid", auid
, int64_t(0));
4163 f
->open_array_section("pools");
4164 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
4165 p
!= osdmap
.pools
.end();
4167 if (!auid
|| p
->second
.auid
== (uint64_t)auid
) {
4169 f
->open_object_section("pool");
4170 f
->dump_int("poolnum", p
->first
);
4171 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
4174 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
] << ',';
4183 } else if (prefix
== "osd blacklist ls") {
4185 f
->open_array_section("blacklist");
4187 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
4188 p
!= osdmap
.blacklist
.end();
4191 f
->open_object_section("entry");
4192 f
->dump_stream("addr") << p
->first
;
4193 f
->dump_stream("until") << p
->second
;
4198 ss
<< p
->first
<< " " << p
->second
;
4208 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
4210 } else if (prefix
== "osd pool ls") {
4212 cmd_getval(g_ceph_context
, cmdmap
, "detail", detail
);
4213 if (!f
&& detail
== "detail") {
4215 osdmap
.print_pools(ss
);
4216 rdata
.append(ss
.str());
4219 f
->open_array_section("pools");
4220 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
4221 it
!= osdmap
.get_pools().end();
4224 if (detail
== "detail") {
4225 f
->open_object_section("pool");
4226 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4227 it
->second
.dump(f
.get());
4230 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4233 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
4242 } else if (prefix
== "osd crush get-tunable") {
4244 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
4247 f
->open_object_section("tunable");
4248 if (tunable
== "straw_calc_version") {
4250 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
4252 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
4261 rdata
.append(rss
.str());
4265 } else if (prefix
== "osd pool get") {
4267 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4268 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4270 ss
<< "unrecognized pool '" << poolstr
<< "'";
4275 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
4277 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
4279 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
4280 const choices_map_t ALL_CHOICES
= {
4282 {"min_size", MIN_SIZE
},
4283 {"crash_replay_interval", CRASH_REPLAY_INTERVAL
},
4284 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
4285 {"crush_rule", CRUSH_RULE
},
4286 {"crush_ruleset", CRUSH_RULESET
},
4287 {"hashpspool", HASHPSPOOL
}, {"nodelete", NODELETE
},
4288 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
4289 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
4290 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
4291 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
4292 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
4293 {"use_gmt_hitset", USE_GMT_HITSET
},
4294 {"auid", AUID
}, {"target_max_objects", TARGET_MAX_OBJECTS
},
4295 {"target_max_bytes", TARGET_MAX_BYTES
},
4296 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
4297 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
4298 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
4299 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
4300 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
4301 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
4302 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
4303 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
4304 {"fast_read", FAST_READ
},
4305 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
4306 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
4307 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
4308 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
4309 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
4310 {"recovery_priority", RECOVERY_PRIORITY
},
4311 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
4312 {"scrub_priority", SCRUB_PRIORITY
},
4313 {"compression_mode", COMPRESSION_MODE
},
4314 {"compression_algorithm", COMPRESSION_ALGORITHM
},
4315 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
4316 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
4317 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
4318 {"csum_type", CSUM_TYPE
},
4319 {"csum_max_block", CSUM_MAX_BLOCK
},
4320 {"csum_min_block", CSUM_MIN_BLOCK
},
4323 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
4325 const choices_set_t ONLY_TIER_CHOICES
= {
4326 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
4327 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
4328 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
4329 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
4330 MIN_READ_RECENCY_FOR_PROMOTE
,
4331 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
4333 const choices_set_t ONLY_ERASURE_CHOICES
= {
4334 ERASURE_CODE_PROFILE
4337 choices_set_t selected_choices
;
4339 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
4340 it
!= ALL_CHOICES
.end(); ++it
) {
4341 selected_choices
.insert(it
->second
);
4345 selected_choices
= subtract_second_from_first(selected_choices
,
4349 if(!p
->is_erasure()) {
4350 selected_choices
= subtract_second_from_first(selected_choices
,
4351 ONLY_ERASURE_CHOICES
);
4353 } else /* var != "all" */ {
4354 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
4355 osd_pool_get_choices selected
= found
->second
;
4357 if (!p
->is_tier() &&
4358 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
4359 ss
<< "pool '" << poolstr
4360 << "' is not a tier pool: variable not applicable";
4365 if (!p
->is_erasure() &&
4366 ONLY_ERASURE_CHOICES
.find(selected
)
4367 != ONLY_ERASURE_CHOICES
.end()) {
4368 ss
<< "pool '" << poolstr
4369 << "' is not a erasure pool: variable not applicable";
4374 selected_choices
.insert(selected
);
4378 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4379 it
!= selected_choices
.end(); ++it
) {
4380 choices_map_t::const_iterator i
;
4381 f
->open_object_section("pool");
4382 f
->dump_string("pool", poolstr
);
4383 f
->dump_int("pool_id", pool
);
4386 f
->dump_int("pg_num", p
->get_pg_num());
4389 f
->dump_int("pgp_num", p
->get_pgp_num());
4392 f
->dump_int("auid", p
->get_auid());
4395 f
->dump_int("size", p
->get_size());
4398 f
->dump_int("min_size", p
->get_min_size());
4400 case CRASH_REPLAY_INTERVAL
:
4401 f
->dump_int("crash_replay_interval",
4402 p
->get_crash_replay_interval());
4405 if (osdmap
.crush
->rule_exists(p
->get_crush_ruleset())) {
4406 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
4407 p
->get_crush_ruleset()));
4409 f
->dump_string("crush_rule", stringify(p
->get_crush_ruleset()));
4413 f
->dump_int("crush_ruleset", p
->get_crush_ruleset());
4419 case WRITE_FADVISE_DONTNEED
:
4422 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4423 if (i
->second
== *it
)
4426 assert(i
!= ALL_CHOICES
.end());
4427 f
->dump_string(i
->first
.c_str(),
4428 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
4431 case HIT_SET_PERIOD
:
4432 f
->dump_int("hit_set_period", p
->hit_set_period
);
4435 f
->dump_int("hit_set_count", p
->hit_set_count
);
4438 f
->dump_string("hit_set_type",
4439 HitSet::get_type_name(p
->hit_set_params
.get_type()));
4443 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4444 BloomHitSet::Params
*bloomp
=
4445 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4446 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
4447 } else if(var
!= "all") {
4449 ss
<< "hit set is not of type Bloom; " <<
4450 "invalid to get a false positive rate!";
4456 case USE_GMT_HITSET
:
4457 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
4459 case TARGET_MAX_OBJECTS
:
4460 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
4462 case TARGET_MAX_BYTES
:
4463 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
4465 case CACHE_TARGET_DIRTY_RATIO
:
4466 f
->dump_unsigned("cache_target_dirty_ratio_micro",
4467 p
->cache_target_dirty_ratio_micro
);
4468 f
->dump_float("cache_target_dirty_ratio",
4469 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
4471 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4472 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
4473 p
->cache_target_dirty_high_ratio_micro
);
4474 f
->dump_float("cache_target_dirty_high_ratio",
4475 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
4477 case CACHE_TARGET_FULL_RATIO
:
4478 f
->dump_unsigned("cache_target_full_ratio_micro",
4479 p
->cache_target_full_ratio_micro
);
4480 f
->dump_float("cache_target_full_ratio",
4481 ((float)p
->cache_target_full_ratio_micro
/1000000));
4483 case CACHE_MIN_FLUSH_AGE
:
4484 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
4486 case CACHE_MIN_EVICT_AGE
:
4487 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
4489 case ERASURE_CODE_PROFILE
:
4490 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
4492 case MIN_READ_RECENCY_FOR_PROMOTE
:
4493 f
->dump_int("min_read_recency_for_promote",
4494 p
->min_read_recency_for_promote
);
4496 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
4497 f
->dump_int("min_write_recency_for_promote",
4498 p
->min_write_recency_for_promote
);
4501 f
->dump_int("fast_read", p
->fast_read
);
4503 case HIT_SET_GRADE_DECAY_RATE
:
4504 f
->dump_int("hit_set_grade_decay_rate",
4505 p
->hit_set_grade_decay_rate
);
4507 case HIT_SET_SEARCH_LAST_N
:
4508 f
->dump_int("hit_set_search_last_n",
4509 p
->hit_set_search_last_n
);
4511 case SCRUB_MIN_INTERVAL
:
4512 case SCRUB_MAX_INTERVAL
:
4513 case DEEP_SCRUB_INTERVAL
:
4514 case RECOVERY_PRIORITY
:
4515 case RECOVERY_OP_PRIORITY
:
4516 case SCRUB_PRIORITY
:
4517 case COMPRESSION_MODE
:
4518 case COMPRESSION_ALGORITHM
:
4519 case COMPRESSION_REQUIRED_RATIO
:
4520 case COMPRESSION_MAX_BLOB_SIZE
:
4521 case COMPRESSION_MIN_BLOB_SIZE
:
4523 case CSUM_MAX_BLOCK
:
4524 case CSUM_MIN_BLOCK
:
4525 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4526 if (i
->second
== *it
)
4529 assert(i
!= ALL_CHOICES
.end());
4530 if(*it
== CSUM_TYPE
) {
4532 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
4533 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
4536 p
->opts
.dump(i
->first
, f
.get());
4545 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4546 it
!= selected_choices
.end(); ++it
) {
4547 choices_map_t::const_iterator i
;
4550 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
4553 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
4556 ss
<< "auid: " << p
->get_auid() << "\n";
4559 ss
<< "size: " << p
->get_size() << "\n";
4562 ss
<< "min_size: " << p
->get_min_size() << "\n";
4564 case CRASH_REPLAY_INTERVAL
:
4565 ss
<< "crash_replay_interval: " <<
4566 p
->get_crash_replay_interval() << "\n";
4569 if (osdmap
.crush
->rule_exists(p
->get_crush_ruleset())) {
4570 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
4571 p
->get_crush_ruleset()) << "\n";
4573 ss
<< "crush_rule: " << p
->get_crush_ruleset() << "\n";
4577 ss
<< "crush_ruleset: " << p
->get_crush_ruleset() << "\n";
4579 case HIT_SET_PERIOD
:
4580 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
4583 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
4586 ss
<< "hit_set_type: " <<
4587 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
4591 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4592 BloomHitSet::Params
*bloomp
=
4593 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4594 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
4595 } else if(var
!= "all") {
4596 ss
<< "hit set is not of type Bloom; " <<
4597 "invalid to get a false positive rate!";
4603 case USE_GMT_HITSET
:
4604 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
4606 case TARGET_MAX_OBJECTS
:
4607 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
4609 case TARGET_MAX_BYTES
:
4610 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
4612 case CACHE_TARGET_DIRTY_RATIO
:
4613 ss
<< "cache_target_dirty_ratio: "
4614 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
4616 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4617 ss
<< "cache_target_dirty_high_ratio: "
4618 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
4620 case CACHE_TARGET_FULL_RATIO
:
4621 ss
<< "cache_target_full_ratio: "
4622 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
4624 case CACHE_MIN_FLUSH_AGE
:
4625 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
4627 case CACHE_MIN_EVICT_AGE
:
4628 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
4630 case ERASURE_CODE_PROFILE
:
4631 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
4633 case MIN_READ_RECENCY_FOR_PROMOTE
:
4634 ss
<< "min_read_recency_for_promote: " <<
4635 p
->min_read_recency_for_promote
<< "\n";
4637 case HIT_SET_GRADE_DECAY_RATE
:
4638 ss
<< "hit_set_grade_decay_rate: " <<
4639 p
->hit_set_grade_decay_rate
<< "\n";
4641 case HIT_SET_SEARCH_LAST_N
:
4642 ss
<< "hit_set_search_last_n: " <<
4643 p
->hit_set_search_last_n
<< "\n";
4649 case WRITE_FADVISE_DONTNEED
:
4652 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4653 if (i
->second
== *it
)
4656 assert(i
!= ALL_CHOICES
.end());
4657 ss
<< i
->first
<< ": " <<
4658 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
4659 "true" : "false") << "\n";
4661 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
4662 ss
<< "min_write_recency_for_promote: " <<
4663 p
->min_write_recency_for_promote
<< "\n";
4666 ss
<< "fast_read: " << p
->fast_read
<< "\n";
4668 case SCRUB_MIN_INTERVAL
:
4669 case SCRUB_MAX_INTERVAL
:
4670 case DEEP_SCRUB_INTERVAL
:
4671 case RECOVERY_PRIORITY
:
4672 case RECOVERY_OP_PRIORITY
:
4673 case SCRUB_PRIORITY
:
4674 case COMPRESSION_MODE
:
4675 case COMPRESSION_ALGORITHM
:
4676 case COMPRESSION_REQUIRED_RATIO
:
4677 case COMPRESSION_MAX_BLOB_SIZE
:
4678 case COMPRESSION_MIN_BLOB_SIZE
:
4680 case CSUM_MAX_BLOCK
:
4681 case CSUM_MIN_BLOCK
:
4682 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4683 if (i
->second
== *it
)
4686 assert(i
!= ALL_CHOICES
.end());
4688 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
4689 if (p
->opts
.is_set(key
)) {
4690 if(key
== pool_opts_t::CSUM_TYPE
) {
4692 p
->opts
.get(key
, &val
);
4693 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
4695 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
4701 rdata
.append(ss
.str());
4706 } else if (prefix
== "osd pool stats") {
4707 const auto &pgm
= mon
->pgmon()->pg_map
;
4708 r
= process_pg_map_command(prefix
, cmdmap
, pgm
, osdmap
,
4709 f
.get(), &ss
, &rdata
);
4710 } else if (prefix
== "osd pool get-quota") {
4712 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
4714 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
4716 assert(poolid
== -ENOENT
);
4717 ss
<< "unrecognized pool '" << pool_name
<< "'";
4721 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
4724 f
->open_object_section("pool_quotas");
4725 f
->dump_string("pool_name", pool_name
);
4726 f
->dump_unsigned("pool_id", poolid
);
4727 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
4728 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
4733 rs
<< "quotas for pool '" << pool_name
<< "':\n"
4734 << " max objects: ";
4735 if (p
->quota_max_objects
== 0)
4738 rs
<< si_t(p
->quota_max_objects
) << " objects";
4741 if (p
->quota_max_bytes
== 0)
4744 rs
<< si_t(p
->quota_max_bytes
) << "B";
4745 rdata
.append(rs
.str());
4749 } else if (prefix
== "osd crush rule list" ||
4750 prefix
== "osd crush rule ls") {
4752 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4753 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4754 f
->open_array_section("rules");
4755 osdmap
.crush
->list_rules(f
.get());
4760 rdata
.append(rs
.str());
4761 } else if (prefix
== "osd crush rule dump") {
4763 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
4765 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4766 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4768 f
->open_array_section("rules");
4769 osdmap
.crush
->dump_rules(f
.get());
4772 int ruleno
= osdmap
.crush
->get_rule_id(name
);
4774 ss
<< "unknown crush ruleset '" << name
<< "'";
4778 osdmap
.crush
->dump_rule(ruleno
, f
.get());
4783 rdata
.append(rs
.str());
4784 } else if (prefix
== "osd crush dump") {
4786 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4787 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4788 f
->open_object_section("crush_map");
4789 osdmap
.crush
->dump(f
.get());
4794 rdata
.append(rs
.str());
4795 } else if (prefix
== "osd crush show-tunables") {
4797 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4798 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4799 f
->open_object_section("crush_map_tunables");
4800 osdmap
.crush
->dump_tunables(f
.get());
4805 rdata
.append(rs
.str());
4806 } else if (prefix
== "osd crush tree") {
4807 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4808 f
->open_array_section("crush_map_roots");
4809 osdmap
.crush
->dump_tree(f
.get());
4812 } else if (prefix
== "osd crush class ls") {
4813 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4814 f
->open_array_section("crush_classes");
4815 for (auto i
: osdmap
.crush
->class_name
)
4816 f
->dump_string("class", i
.second
);
4819 } else if (prefix
== "osd erasure-code-profile ls") {
4820 const auto &profiles
= osdmap
.get_erasure_code_profiles();
4822 f
->open_array_section("erasure-code-profiles");
4823 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
4825 f
->dump_string("profile", i
->first
.c_str());
4827 rdata
.append(i
->first
+ "\n");
4834 rdata
.append(rs
.str());
4836 } else if (prefix
== "osd erasure-code-profile get") {
4838 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
4839 if (!osdmap
.has_erasure_code_profile(name
)) {
4840 ss
<< "unknown erasure code profile '" << name
<< "'";
4844 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
4846 f
->open_object_section("profile");
4847 for (map
<string
,string
>::const_iterator i
= profile
.begin();
4851 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
4853 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
4860 rdata
.append(rs
.str());
4863 // try prepare update
4870 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
4874 void OSDMonitor::update_pool_flags(int64_t pool_id
, uint64_t flags
)
4876 const pg_pool_t
*pool
= osdmap
.get_pg_pool(pool_id
);
4877 pending_inc
.get_new_pool(pool_id
, pool
)->flags
= flags
;
4880 bool OSDMonitor::update_pools_status()
4882 if (!mon
->pgmon()->is_readable())
4887 auto& pools
= osdmap
.get_pools();
4888 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
4889 if (!mon
->pgmon()->pg_map
.pg_pool_sum
.count(it
->first
))
4891 pool_stat_t
& stats
= mon
->pgmon()->pg_map
.pg_pool_sum
[it
->first
];
4892 object_stat_sum_t
& sum
= stats
.stats
.sum
;
4893 const pg_pool_t
&pool
= it
->second
;
4894 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
4897 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
4898 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
4900 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
4904 mon
->clog
->info() << "pool '" << pool_name
4905 << "' no longer full; removing FULL flag";
4907 update_pool_flags(it
->first
, pool
.get_flags() & ~pg_pool_t::FLAG_FULL
);
4913 if (pool
.quota_max_bytes
> 0 &&
4914 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
4915 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
4916 << " (reached quota's max_bytes: "
4917 << si_t(pool
.quota_max_bytes
) << ")";
4919 if (pool
.quota_max_objects
> 0 &&
4920 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
4921 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
4922 << " (reached quota's max_objects: "
4923 << pool
.quota_max_objects
<< ")";
4925 update_pool_flags(it
->first
, pool
.get_flags() | pg_pool_t::FLAG_FULL
);
4932 void OSDMonitor::get_pools_health(
4933 list
<pair
<health_status_t
,string
> >& summary
,
4934 list
<pair
<health_status_t
,string
> > *detail
) const
4936 auto& pools
= osdmap
.get_pools();
4937 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
4938 if (!mon
->pgmon()->pg_map
.pg_pool_sum
.count(it
->first
))
4940 pool_stat_t
& stats
= mon
->pgmon()->pg_map
.pg_pool_sum
[it
->first
];
4941 object_stat_sum_t
& sum
= stats
.stats
.sum
;
4942 const pg_pool_t
&pool
= it
->second
;
4943 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
4945 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
4946 // uncomment these asserts if/when we update the FULL flag on pg_stat update
4947 //assert((pool.quota_max_objects > 0) || (pool.quota_max_bytes > 0));
4950 ss
<< "pool '" << pool_name
<< "' is full";
4951 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
4953 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
4956 float warn_threshold
= (float)g_conf
->mon_pool_quota_warn_threshold
/100;
4957 float crit_threshold
= (float)g_conf
->mon_pool_quota_crit_threshold
/100;
4959 if (pool
.quota_max_objects
> 0) {
4961 health_status_t status
= HEALTH_OK
;
4962 if ((uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
4963 // uncomment these asserts if/when we update the FULL flag on pg_stat update
4964 //assert(pool.has_flag(pg_pool_t::FLAG_FULL));
4965 } else if (crit_threshold
> 0 &&
4966 sum
.num_objects
>= pool
.quota_max_objects
*crit_threshold
) {
4967 ss
<< "pool '" << pool_name
4968 << "' has " << sum
.num_objects
<< " objects"
4969 << " (max " << pool
.quota_max_objects
<< ")";
4970 status
= HEALTH_ERR
;
4971 } else if (warn_threshold
> 0 &&
4972 sum
.num_objects
>= pool
.quota_max_objects
*warn_threshold
) {
4973 ss
<< "pool '" << pool_name
4974 << "' has " << sum
.num_objects
<< " objects"
4975 << " (max " << pool
.quota_max_objects
<< ")";
4976 status
= HEALTH_WARN
;
4978 if (status
!= HEALTH_OK
) {
4979 pair
<health_status_t
,string
> s(status
, ss
.str());
4980 summary
.push_back(s
);
4982 detail
->push_back(s
);
4986 if (pool
.quota_max_bytes
> 0) {
4987 health_status_t status
= HEALTH_OK
;
4989 if ((uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
4990 // uncomment these asserts if/when we update the FULL flag on pg_stat update
4991 //assert(pool.has_flag(pg_pool_t::FLAG_FULL));
4992 } else if (crit_threshold
> 0 &&
4993 sum
.num_bytes
>= pool
.quota_max_bytes
*crit_threshold
) {
4994 ss
<< "pool '" << pool_name
4995 << "' has " << si_t(sum
.num_bytes
) << " bytes"
4996 << " (max " << si_t(pool
.quota_max_bytes
) << ")";
4997 status
= HEALTH_ERR
;
4998 } else if (warn_threshold
> 0 &&
4999 sum
.num_bytes
>= pool
.quota_max_bytes
*warn_threshold
) {
5000 ss
<< "pool '" << pool_name
5001 << "' has " << si_t(sum
.num_bytes
) << " bytes"
5002 << " (max " << si_t(pool
.quota_max_bytes
) << ")";
5003 status
= HEALTH_WARN
;
5005 if (status
!= HEALTH_OK
) {
5006 pair
<health_status_t
,string
> s(status
, ss
.str());
5007 summary
.push_back(s
);
5009 detail
->push_back(s
);
5016 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
5018 op
->mark_osdmon_event(__func__
);
5019 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
5020 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
5021 MonSession
*session
= m
->get_session();
5024 string erasure_code_profile
;
5026 string ruleset_name
;
5028 return prepare_new_pool(m
->name
, m
->auid
, m
->crush_rule
, ruleset_name
,
5030 erasure_code_profile
,
5031 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5033 return prepare_new_pool(m
->name
, session
->auid
, m
->crush_rule
, ruleset_name
,
5035 erasure_code_profile
,
5036 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5039 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
5040 const string
& dstname
,
5045 // Avoid creating a pending crush if it does not already exists and
5046 // the rename would fail.
5048 if (!_have_pending_crush()) {
5049 ret
= _get_stable_crush().can_rename_bucket(srcname
,
5056 CrushWrapper newcrush
;
5057 _get_pending_crush(newcrush
);
5059 ret
= newcrush
.rename_bucket(srcname
,
5065 pending_inc
.crush
.clear();
5066 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5067 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
5071 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
5073 string replacement
= "";
5075 if (plugin
== "jerasure_generic" ||
5076 plugin
== "jerasure_sse3" ||
5077 plugin
== "jerasure_sse4" ||
5078 plugin
== "jerasure_neon") {
5079 replacement
= "jerasure";
5080 } else if (plugin
== "shec_generic" ||
5081 plugin
== "shec_sse3" ||
5082 plugin
== "shec_sse4" ||
5083 plugin
== "shec_neon") {
5084 replacement
= "shec";
5087 if (replacement
!= "") {
5088 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
5089 << plugin
<< " that has been deprecated. Please use "
5090 << replacement
<< " instead." << dendl
;
5094 int OSDMonitor::normalize_profile(const string
& profilename
,
5095 ErasureCodeProfile
&profile
,
5099 ErasureCodeInterfaceRef erasure_code
;
5100 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5101 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
5102 check_legacy_ec_plugin(plugin
->second
, profilename
);
5103 int err
= instance
.factory(plugin
->second
,
5104 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5105 profile
, &erasure_code
, ss
);
5110 err
= erasure_code
->init(profile
, ss
);
5115 auto it
= profile
.find("stripe_unit");
5116 if (it
!= profile
.end()) {
5118 uint32_t stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5119 if (!err_str
.empty()) {
5120 *ss
<< "could not parse stripe_unit '" << it
->second
5121 << "': " << err_str
<< std::endl
;
5124 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5125 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5126 if (chunk_size
!= stripe_unit
) {
5127 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
5128 << "alignment. Would be padded to " << chunk_size
5132 if ((stripe_unit
% 4096) != 0 && !force
) {
5133 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
5134 << "use --force to override this check" << std::endl
;
5141 int OSDMonitor::crush_ruleset_create_erasure(const string
&name
,
5142 const string
&profile
,
5146 int ruleid
= osdmap
.crush
->get_rule_id(name
);
5147 if (ruleid
!= -ENOENT
) {
5148 *ruleset
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
5152 CrushWrapper newcrush
;
5153 _get_pending_crush(newcrush
);
5155 ruleid
= newcrush
.get_rule_id(name
);
5156 if (ruleid
!= -ENOENT
) {
5157 *ruleset
= newcrush
.get_rule_mask_ruleset(ruleid
);
5160 ErasureCodeInterfaceRef erasure_code
;
5161 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
5163 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
5167 err
= erasure_code
->create_ruleset(name
, newcrush
, ss
);
5168 erasure_code
.reset();
5172 pending_inc
.crush
.clear();
5173 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5178 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
5179 ErasureCodeInterfaceRef
*erasure_code
,
5182 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
5184 ErasureCodeProfile profile
=
5185 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5186 ErasureCodeProfile::const_iterator plugin
=
5187 profile
.find("plugin");
5188 if (plugin
== profile
.end()) {
5189 *ss
<< "cannot determine the erasure code plugin"
5190 << " because there is no 'plugin' entry in the erasure_code_profile "
5191 << profile
<< std::endl
;
5194 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
5195 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5196 return instance
.factory(plugin
->second
,
5197 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5198 profile
, erasure_code
, ss
);
5201 int OSDMonitor::check_cluster_features(uint64_t features
,
5204 stringstream unsupported_ss
;
5205 int unsupported_count
= 0;
5206 if ((mon
->get_quorum_con_features() & features
) != features
) {
5207 unsupported_ss
<< "the monitor cluster";
5208 ++unsupported_count
;
5211 set
<int32_t> up_osds
;
5212 osdmap
.get_up_osds(up_osds
);
5213 for (set
<int32_t>::iterator it
= up_osds
.begin();
5214 it
!= up_osds
.end(); ++it
) {
5215 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
5216 if ((xi
.features
& features
) != features
) {
5217 if (unsupported_count
> 0)
5218 unsupported_ss
<< ", ";
5219 unsupported_ss
<< "osd." << *it
;
5220 unsupported_count
++;
5224 if (unsupported_count
> 0) {
5225 ss
<< "features " << features
<< " unsupported by: "
5226 << unsupported_ss
.str();
5230 // check pending osd state, too!
5231 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
5232 pending_inc
.new_xinfo
.begin();
5233 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
5234 const osd_xinfo_t
&xi
= p
->second
;
5235 if ((xi
.features
& features
) != features
) {
5236 dout(10) << __func__
<< " pending osd." << p
->first
5237 << " features are insufficient; retry" << dendl
;
5245 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
5248 OSDMap::Incremental new_pending
= pending_inc
;
5249 ::encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
5251 newmap
.deepish_copy_from(osdmap
);
5252 newmap
.apply_incremental(new_pending
);
5255 if (newmap
.require_min_compat_client
.length()) {
5256 auto mv
= newmap
.get_min_compat_client();
5257 if (mv
.first
> newmap
.require_min_compat_client
) {
5258 ss
<< "new crush map requires client version " << mv
5259 << " but require_min_compat_client is "
5260 << newmap
.require_min_compat_client
;
5267 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
5268 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
5269 stringstream features_ss
;
5270 int r
= check_cluster_features(features
, features_ss
);
5272 ss
<< "Could not change CRUSH: " << features_ss
.str();
5279 bool OSDMonitor::erasure_code_profile_in_use(
5280 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
5281 const string
&profile
,
5285 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
5288 if (p
->second
.erasure_code_profile
== profile
) {
5289 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
5294 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
5299 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
5300 map
<string
,string
> *erasure_code_profile_map
,
5303 int r
= get_json_str_map(g_conf
->osd_pool_default_erasure_code_profile
,
5305 erasure_code_profile_map
);
5308 assert((*erasure_code_profile_map
).count("plugin"));
5309 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
5310 map
<string
,string
> user_map
;
5311 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
5312 i
!= erasure_code_profile
.end();
5314 size_t equal
= i
->find('=');
5315 if (equal
== string::npos
) {
5316 user_map
[*i
] = string();
5317 (*erasure_code_profile_map
)[*i
] = string();
5319 const string key
= i
->substr(0, equal
);
5321 const string value
= i
->substr(equal
);
5322 user_map
[key
] = value
;
5323 (*erasure_code_profile_map
)[key
] = value
;
5327 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
5328 (*erasure_code_profile_map
) = user_map
;
5333 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
5334 const string
&erasure_code_profile
,
5335 unsigned *size
, unsigned *min_size
,
5339 switch (pool_type
) {
5340 case pg_pool_t::TYPE_REPLICATED
:
5341 *size
= g_conf
->osd_pool_default_size
;
5342 *min_size
= g_conf
->get_osd_pool_default_min_size();
5344 case pg_pool_t::TYPE_ERASURE
:
5346 ErasureCodeInterfaceRef erasure_code
;
5347 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5349 *size
= erasure_code
->get_chunk_count();
5350 *min_size
= MIN(erasure_code
->get_data_chunk_count() + 1, *size
);
5355 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
5362 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
5363 const string
&erasure_code_profile
,
5364 uint32_t *stripe_width
,
5368 switch (pool_type
) {
5369 case pg_pool_t::TYPE_REPLICATED
:
5372 case pg_pool_t::TYPE_ERASURE
:
5374 ErasureCodeProfile profile
=
5375 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5376 ErasureCodeInterfaceRef erasure_code
;
5377 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5380 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5381 uint32_t stripe_unit
= g_conf
->osd_pool_erasure_code_stripe_unit
;
5382 auto it
= profile
.find("stripe_unit");
5383 if (it
!= profile
.end()) {
5385 stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5386 assert(err_str
.empty());
5388 *stripe_width
= data_chunks
*
5389 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5393 *ss
<< "prepare_pool_stripe_width: "
5394 << pool_type
<< " is not a known pool type";
5401 int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type
,
5402 const string
&erasure_code_profile
,
5403 const string
&ruleset_name
,
5408 if (*crush_ruleset
< 0) {
5409 switch (pool_type
) {
5410 case pg_pool_t::TYPE_REPLICATED
:
5412 if (ruleset_name
== "") {
5413 //Use default ruleset
5414 *crush_ruleset
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context
);
5415 if (*crush_ruleset
< 0) {
5416 // Errors may happen e.g. if no valid ruleset is available
5417 *ss
<< "No suitable CRUSH ruleset exists, check "
5418 << "'osd pool default crush *' config options";
5422 return get_crush_ruleset(ruleset_name
, crush_ruleset
, ss
);
5426 case pg_pool_t::TYPE_ERASURE
:
5428 int err
= crush_ruleset_create_erasure(ruleset_name
,
5429 erasure_code_profile
,
5433 dout(20) << "prepare_pool_crush_ruleset: ruleset "
5434 << ruleset_name
<< " try again" << dendl
;
5437 // need to wait for the crush rule to be proposed before proceeding
5448 *ss
<< "prepare_pool_crush_ruleset: " << pool_type
5449 << " is not a known pool type";
5454 if (!osdmap
.crush
->ruleset_exists(*crush_ruleset
)) {
5455 *ss
<< "CRUSH ruleset " << *crush_ruleset
<< " not found";
5463 int OSDMonitor::get_crush_ruleset(const string
&ruleset_name
,
5468 ret
= osdmap
.crush
->get_rule_id(ruleset_name
);
5469 if (ret
!= -ENOENT
) {
5471 *crush_ruleset
= ret
;
5473 CrushWrapper newcrush
;
5474 _get_pending_crush(newcrush
);
5476 ret
= newcrush
.get_rule_id(ruleset_name
);
5477 if (ret
!= -ENOENT
) {
5478 // found it, wait for it to be proposed
5479 dout(20) << __func__
<< ": ruleset " << ruleset_name
5480 << " try again" << dendl
;
5483 //Cannot find it , return error
5484 *ss
<< "specified ruleset " << ruleset_name
<< " doesn't exist";
5492 * @param name The name of the new pool
5493 * @param auid The auid of the pool owner. Can be -1
5494 * @param crush_ruleset The crush rule to use. If <0, will use the system default
 5495 * @param crush_ruleset_name The crush rule to use, if crush_ruleset <0
5496 * @param pg_num The pg_num to use. If set to 0, will use the system default
5497 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
5498 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
5499 * @param pool_type TYPE_ERASURE, or TYPE_REP
5500 * @param expected_num_objects expected number of objects on the pool
5501 * @param fast_read fast read type.
5502 * @param ss human readable error message, if any.
5504 * @return 0 on success, negative errno on failure.
5506 int OSDMonitor::prepare_new_pool(string
& name
, uint64_t auid
,
5508 const string
&crush_ruleset_name
,
5509 unsigned pg_num
, unsigned pgp_num
,
5510 const string
&erasure_code_profile
,
5511 const unsigned pool_type
,
5512 const uint64_t expected_num_objects
,
5513 FastReadType fast_read
,
5516 if (name
.length() == 0)
5519 pg_num
= g_conf
->osd_pool_default_pg_num
;
5521 pgp_num
= g_conf
->osd_pool_default_pgp_num
;
5522 if (pg_num
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
5523 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
5524 << g_conf
->mon_max_pool_pg_num
5525 << " (you may adjust 'mon max pool pg num' for higher values)";
5528 if (pgp_num
> pg_num
) {
5529 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
5530 << ", which in this case is " << pg_num
;
5533 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
5534 *ss
<< "'fast_read' can only apply to erasure coding pool";
5538 r
= prepare_pool_crush_ruleset(pool_type
, erasure_code_profile
,
5539 crush_ruleset_name
, &crush_ruleset
, ss
);
5541 dout(10) << " prepare_pool_crush_ruleset returns " << r
<< dendl
;
5544 CrushWrapper newcrush
;
5545 _get_pending_crush(newcrush
);
5547 CrushTester
tester(newcrush
, err
);
5548 // use the internal crush tester if crushtool config is empty
5549 if (g_conf
->crushtool
.empty()) {
5552 r
= tester
.test_with_crushtool(g_conf
->crushtool
.c_str(),
5553 osdmap
.get_max_osd(),
5558 dout(10) << " tester.test_with_crushtool returns " << r
5559 << ": " << err
.str() << dendl
;
5560 *ss
<< "crushtool check failed with " << r
<< ": " << err
.str();
5563 unsigned size
, min_size
;
5564 r
= prepare_pool_size(pool_type
, erasure_code_profile
, &size
, &min_size
, ss
);
5566 dout(10) << " prepare_pool_size returns " << r
<< dendl
;
5570 if (!osdmap
.crush
->check_crush_rule(crush_ruleset
, pool_type
, size
, *ss
)) {
5574 uint32_t stripe_width
= 0;
5575 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
5577 dout(10) << " prepare_pool_stripe_width returns " << r
<< dendl
;
5582 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
5583 switch (fast_read
) {
5590 case FAST_READ_DEFAULT
:
5591 fread
= g_conf
->mon_osd_pool_ec_fast_read
;
5594 *ss
<< "invalid fast_read setting: " << fast_read
;
5599 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
5600 p
!= pending_inc
.new_pool_names
.end();
5602 if (p
->second
== name
)
5606 if (-1 == pending_inc
.new_pool_max
)
5607 pending_inc
.new_pool_max
= osdmap
.pool_max
;
5608 int64_t pool
= ++pending_inc
.new_pool_max
;
5610 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
5611 pi
->type
= pool_type
;
5612 pi
->fast_read
= fread
;
5613 pi
->flags
= g_conf
->osd_pool_default_flags
;
5614 if (g_conf
->osd_pool_default_flag_hashpspool
)
5615 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
5616 if (g_conf
->osd_pool_default_flag_nodelete
)
5617 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
5618 if (g_conf
->osd_pool_default_flag_nopgchange
)
5619 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
5620 if (g_conf
->osd_pool_default_flag_nosizechange
)
5621 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
5622 if (g_conf
->osd_pool_use_gmt_hitset
&&
5623 (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
))
5624 pi
->use_gmt_hitset
= true;
5626 pi
->use_gmt_hitset
= false;
5629 pi
->min_size
= min_size
;
5630 pi
->crush_ruleset
= crush_ruleset
;
5631 pi
->expected_num_objects
= expected_num_objects
;
5632 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
5633 pi
->set_pg_num(pg_num
);
5634 pi
->set_pgp_num(pgp_num
);
5635 pi
->last_change
= pending_inc
.epoch
;
5637 pi
->erasure_code_profile
= erasure_code_profile
;
5638 pi
->stripe_width
= stripe_width
;
5639 pi
->cache_target_dirty_ratio_micro
=
5640 g_conf
->osd_pool_default_cache_target_dirty_ratio
* 1000000;
5641 pi
->cache_target_dirty_high_ratio_micro
=
5642 g_conf
->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
5643 pi
->cache_target_full_ratio_micro
=
5644 g_conf
->osd_pool_default_cache_target_full_ratio
* 1000000;
5645 pi
->cache_min_flush_age
= g_conf
->osd_pool_default_cache_min_flush_age
;
5646 pi
->cache_min_evict_age
= g_conf
->osd_pool_default_cache_min_evict_age
;
5647 pending_inc
.new_pool_names
[pool
] = name
;
5651 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
5653 op
->mark_osdmon_event(__func__
);
5655 if (pending_inc
.new_flags
< 0)
5656 pending_inc
.new_flags
= osdmap
.get_flags();
5657 pending_inc
.new_flags
|= flag
;
5658 ss
<< OSDMap::get_flag_string(flag
) << " is set";
5659 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
5660 get_last_committed() + 1));
5664 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
5666 op
->mark_osdmon_event(__func__
);
5668 if (pending_inc
.new_flags
< 0)
5669 pending_inc
.new_flags
= osdmap
.get_flags();
5670 pending_inc
.new_flags
&= ~flag
;
5671 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
5672 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
5673 get_last_committed() + 1));
5677 int OSDMonitor::parse_osd_id(const char *s
, stringstream
*pss
)
5680 if (strncmp(s
, "osd.", 4) == 0) {
5686 long id
= parse_pos_long(s
, &ss
);
5692 *pss
<< "osd id " << id
<< " is too large";
5699 int OSDMonitor::prepare_command_pool_set(map
<string
,cmd_vartype
> &cmdmap
,
5703 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
5704 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5706 ss
<< "unrecognized pool '" << poolstr
<< "'";
5710 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
5712 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
5713 if (pending_inc
.new_pools
.count(pool
))
5714 p
= pending_inc
.new_pools
[pool
];
5716 // accept val as a json string in the normal case (current
5717 // generation monitor). parse out int or float values from the
5718 // string as needed. however, if it is not a string, try to pull
5719 // out an int, in case an older monitor with an older json schema is
5720 // forwarding a request.
5722 string interr
, floaterr
;
5725 int64_t uf
= 0; // micro-f
5726 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", val
)) {
5727 // wasn't a string; maybe an older mon forwarded json with an int?
5728 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", n
))
5729 return -EINVAL
; // no value!
5731 // we got a string. see if it contains an int.
5732 n
= strict_strtoll(val
.c_str(), 10, &interr
);
5734 f
= strict_strtod(val
.c_str(), &floaterr
);
5735 uf
= llrintl(f
* (double)1000000.0);
5739 (var
== "hit_set_type" || var
== "hit_set_period" ||
5740 var
== "hit_set_count" || var
== "hit_set_fpp" ||
5741 var
== "target_max_objects" || var
== "target_max_bytes" ||
5742 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
5743 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
5744 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
5745 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
5746 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
5750 if (var
== "size") {
5751 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
5752 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
5755 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
5756 ss
<< "can not change the size of an erasure-coded pool";
5759 if (interr
.length()) {
5760 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5763 if (n
<= 0 || n
> 10) {
5764 ss
<< "pool size must be between 1 and 10";
5770 } else if (var
== "min_size") {
5771 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
5772 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
5775 if (interr
.length()) {
5776 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5780 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
5781 if (n
< 1 || n
> p
.size
) {
5782 ss
<< "pool min_size must be between 1 and " << (int)p
.size
;
5786 ErasureCodeInterfaceRef erasure_code
;
5789 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
5791 k
= erasure_code
->get_data_chunk_count();
5793 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.rdbuf();
5797 if (n
< k
|| n
> p
.size
) {
5798 ss
<< "pool min_size must be between " << k
<< " and " << (int)p
.size
;
5803 } else if (var
== "auid") {
5804 if (interr
.length()) {
5805 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5809 } else if (var
== "crash_replay_interval") {
5810 if (interr
.length()) {
5811 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5814 p
.crash_replay_interval
= n
;
5815 } else if (var
== "pg_num") {
5816 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
5817 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
5820 if (interr
.length()) {
5821 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5824 if (n
<= (int)p
.get_pg_num()) {
5825 ss
<< "specified pg_num " << n
<< " <= current " << p
.get_pg_num();
5826 if (n
< (int)p
.get_pg_num())
5831 cmd_getval(g_ceph_context
,cmdmap
, "force", force
);
5832 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&&
5833 force
!= "--yes-i-really-mean-it") {
5834 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
5837 int expected_osds
= MIN(p
.get_pg_num(), osdmap
.get_num_osds());
5838 int64_t new_pgs
= n
- p
.get_pg_num();
5839 if (new_pgs
> g_conf
->mon_osd_max_split_count
* expected_osds
) {
5840 ss
<< "specified pg_num " << n
<< " is too large (creating "
5841 << new_pgs
<< " new PGs on ~" << expected_osds
5842 << " OSDs exceeds per-OSD max of " << g_conf
->mon_osd_max_split_count
5847 // force pre-luminous clients to resend their ops, since they
5848 // don't understand that split PGs now form a new interval.
5849 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
5850 } else if (var
== "pgp_num") {
5851 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
5852 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
5855 if (interr
.length()) {
5856 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5860 ss
<< "specified pgp_num must > 0, but you set to " << n
;
5863 if (n
> (int)p
.get_pg_num()) {
5864 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
5868 } else if (var
== "crush_rule") {
5869 int id
= osdmap
.crush
->get_rule_id(val
);
5870 if (id
== -ENOENT
) {
5871 ss
<< "crush rule " << val
<< " does not exist";
5875 ss
<< cpp_strerror(id
);
5878 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
5881 p
.crush_ruleset
= id
;
5882 } else if (var
== "crush_ruleset") {
5883 if (interr
.length()) {
5884 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5887 if (!osdmap
.crush
->ruleset_exists(n
)) {
5888 ss
<< "crush ruleset " << n
<< " does not exist";
5892 if (!osdmap
.crush
->check_crush_rule(n
, p
.get_type(), p
.get_size(), ss
)) {
5895 p
.crush_ruleset
= n
;
5896 } else if (var
== "nodelete" || var
== "nopgchange" ||
5897 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
5898 var
== "noscrub" || var
== "nodeep-scrub") {
5899 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
5900 // make sure we only compare against 'n' if we didn't receive a string
5901 if (val
== "true" || (interr
.empty() && n
== 1)) {
5903 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
5906 ss
<< "expecting value 'true', 'false', '0', or '1'";
5909 } else if (var
== "hashpspool") {
5910 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
5912 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
5913 if (force
!= "--yes-i-really-mean-it") {
5914 ss
<< "are you SURE? this will remap all placement groups in this pool,"
5915 " this triggers large data movement,"
5916 " pass --yes-i-really-mean-it if you really do.";
5919 // make sure we only compare against 'n' if we didn't receive a string
5920 if (val
== "true" || (interr
.empty() && n
== 1)) {
5922 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
5925 ss
<< "expecting value 'true', 'false', '0', or '1'";
5928 } else if (var
== "hit_set_type") {
5930 p
.hit_set_params
= HitSet::Params();
5932 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
5935 if (val
== "bloom") {
5936 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
5937 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
5938 p
.hit_set_params
= HitSet::Params(bsp
);
5939 } else if (val
== "explicit_hash")
5940 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
5941 else if (val
== "explicit_object")
5942 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
5944 ss
<< "unrecognized hit_set type '" << val
<< "'";
5948 } else if (var
== "hit_set_period") {
5949 if (interr
.length()) {
5950 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5953 p
.hit_set_period
= n
;
5954 } else if (var
== "hit_set_count") {
5955 if (interr
.length()) {
5956 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
5959 p
.hit_set_count
= n
;
5960 } else if (var
== "hit_set_fpp") {
5961 if (floaterr
.length()) {
5962 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
5965 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
5966 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
5969 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
5971 } else if (var
== "use_gmt_hitset") {
5972 if (val
== "true" || (interr
.empty() && n
== 1)) {
5973 if (!(osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
)) {
5974 ss
<< "not all OSDs support GMT hit set.";
5977 p
.use_gmt_hitset
= true;
5979 ss
<< "expecting value 'true' or '1'";
5982 } else if (var
== "allow_ec_overwrites") {
5983 if (!p
.is_erasure()) {
5984 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
5987 if (val
== "true" || (interr
.empty() && n
== 1)) {
5988 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
5989 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
5990 ss
<< "ec overwrites cannot be disabled once enabled";
5993 ss
<< "expecting value 'true', 'false', '0', or '1'";
5997 if (!is_pool_currently_all_bluestore(pool
, p
, &err
)) {
5998 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
6001 } else if (var
== "target_max_objects") {
6002 if (interr
.length()) {
6003 ss
<< "error parsing int '" << val
<< "': " << interr
;
6006 p
.target_max_objects
= n
;
6007 } else if (var
== "target_max_bytes") {
6008 if (interr
.length()) {
6009 ss
<< "error parsing int '" << val
<< "': " << interr
;
6012 p
.target_max_bytes
= n
;
6013 } else if (var
== "cache_target_dirty_ratio") {
6014 if (floaterr
.length()) {
6015 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6018 if (f
< 0 || f
> 1.0) {
6019 ss
<< "value must be in the range 0..1";
6022 p
.cache_target_dirty_ratio_micro
= uf
;
6023 } else if (var
== "cache_target_dirty_high_ratio") {
6024 if (floaterr
.length()) {
6025 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6028 if (f
< 0 || f
> 1.0) {
6029 ss
<< "value must be in the range 0..1";
6032 p
.cache_target_dirty_high_ratio_micro
= uf
;
6033 } else if (var
== "cache_target_full_ratio") {
6034 if (floaterr
.length()) {
6035 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6038 if (f
< 0 || f
> 1.0) {
6039 ss
<< "value must be in the range 0..1";
6042 p
.cache_target_full_ratio_micro
= uf
;
6043 } else if (var
== "cache_min_flush_age") {
6044 if (interr
.length()) {
6045 ss
<< "error parsing int '" << val
<< "': " << interr
;
6048 p
.cache_min_flush_age
= n
;
6049 } else if (var
== "cache_min_evict_age") {
6050 if (interr
.length()) {
6051 ss
<< "error parsing int '" << val
<< "': " << interr
;
6054 p
.cache_min_evict_age
= n
;
6055 } else if (var
== "min_read_recency_for_promote") {
6056 if (interr
.length()) {
6057 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6060 p
.min_read_recency_for_promote
= n
;
6061 } else if (var
== "hit_set_grade_decay_rate") {
6062 if (interr
.length()) {
6063 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6066 if (n
> 100 || n
< 0) {
6067 ss
<< "value out of range,valid range is 0 - 100";
6070 p
.hit_set_grade_decay_rate
= n
;
6071 } else if (var
== "hit_set_search_last_n") {
6072 if (interr
.length()) {
6073 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6076 if (n
> p
.hit_set_count
|| n
< 0) {
6077 ss
<< "value out of range,valid range is 0 - hit_set_count";
6080 p
.hit_set_search_last_n
= n
;
6081 } else if (var
== "min_write_recency_for_promote") {
6082 if (interr
.length()) {
6083 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6086 p
.min_write_recency_for_promote
= n
;
6087 } else if (var
== "fast_read") {
6088 if (p
.is_replicated()) {
6089 ss
<< "fast read is not supported in replication pool";
6092 if (val
== "true" || (interr
.empty() && n
== 1)) {
6094 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6095 p
.fast_read
= false;
6097 ss
<< "expecting value 'true', 'false', '0', or '1'";
6100 } else if (pool_opts_t::is_opt_name(var
)) {
6101 if (var
== "compression_mode") {
6102 auto cmode
= Compressor::get_comp_mode_type(val
);
6104 ss
<< "unrecognized compression mode '" << val
<< "'";
6107 } else if (var
== "compression_algorithm") {
6108 auto alg
= Compressor::get_comp_alg_type(val
);
6110 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
6113 } else if (var
== "compression_required_ratio") {
6114 if (floaterr
.length()) {
6115 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
6119 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
6122 } else if (var
== "csum_type") {
6123 auto t
= val
!= "unset" ? Checksummer::get_csum_string_type(val
) : 0;
6125 ss
<< "unrecognized csum_type '" << val
<< "'";
6128 //preserve csum_type numeric value
6131 } else if (var
== "compression_max_blob_size" ||
6132 var
== "compression_min_blob_size" ||
6133 var
== "csum_max_block" ||
6134 var
== "csum_min_block") {
6135 if (interr
.length()) {
6136 ss
<< "error parsing int value '" << val
<< "': " << interr
;
6141 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
6142 switch (desc
.type
) {
6143 case pool_opts_t::STR
:
6145 p
.opts
.unset(desc
.key
);
6147 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
6150 case pool_opts_t::INT
:
6151 if (interr
.length()) {
6152 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6156 p
.opts
.unset(desc
.key
);
6158 p
.opts
.set(desc
.key
, static_cast<int>(n
));
6161 case pool_opts_t::DOUBLE
:
6162 if (floaterr
.length()) {
6163 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6167 p
.opts
.unset(desc
.key
);
6169 p
.opts
.set(desc
.key
, static_cast<double>(f
));
6173 assert(!"unknown type");
6176 ss
<< "unrecognized variable '" << var
<< "'";
6179 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
6180 p
.last_change
= pending_inc
.epoch
;
6181 pending_inc
.new_pools
[pool
] = p
;
6185 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
6187 op
->mark_osdmon_event(__func__
);
6188 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
6190 map
<string
, cmd_vartype
> cmdmap
;
6191 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
6192 string rs
= ss
.str();
6193 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
6197 MonSession
*session
= m
->get_session();
6199 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
6203 return prepare_command_impl(op
, cmdmap
);
6206 static int parse_reweights(CephContext
*cct
,
6207 const map
<string
,cmd_vartype
> &cmdmap
,
6208 const OSDMap
& osdmap
,
6209 map
<int32_t, uint32_t>* weights
)
6212 if (!cmd_getval(g_ceph_context
, cmdmap
, "weights", weights_str
)) {
6215 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
6216 json_spirit::mValue json_value
;
6217 if (!json_spirit::read(weights_str
, json_value
)) {
6220 if (json_value
.type() != json_spirit::obj_type
) {
6223 const auto obj
= json_value
.get_obj();
6225 for (auto& osd_weight
: obj
) {
6226 auto osd_id
= std::stoi(osd_weight
.first
);
6227 if (!osdmap
.exists(osd_id
)) {
6230 if (osd_weight
.second
.type() != json_spirit::str_type
) {
6233 auto weight
= std::stoul(osd_weight
.second
.get_str());
6234 weights
->insert({osd_id
, weight
});
6236 } catch (const std::logic_error
& e
) {
6242 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
6243 map
<string
,cmd_vartype
> &cmdmap
)
6245 op
->mark_osdmon_event(__func__
);
6246 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
6254 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
6255 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6258 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
6262 bool osdid_present
= cmd_getval(g_ceph_context
, cmdmap
, "id", osdid
);
6263 if (osdid_present
) {
6265 oss
<< "osd." << osdid
;
6269 // Even if there's a pending state with changes that could affect
6270 // a command, considering that said state isn't yet committed, we
6271 // just don't care about those changes if the command currently being
6272 // handled acts as a no-op against the current committed state.
6273 // In a nutshell, we assume this command happens *before*.
6275 // Let me make this clearer:
6277 // - If we have only one client, and that client issues some
6278 // operation that would conflict with this operation but is
6279 // still on the pending state, then we would be sure that said
6280 // operation wouldn't have returned yet, so the client wouldn't
6281 // issue this operation (unless the client didn't wait for the
6282 // operation to finish, and that would be the client's own fault).
6284 // - If we have more than one client, each client will observe
6285 // whatever is the state at the moment of the commit. So, if we
6286 // have two clients, one issuing an unlink and another issuing a
6287 // link, and if the link happens while the unlink is still on the
6288 // pending state, from the link's point-of-view this is a no-op.
6289 // If different clients are issuing conflicting operations and
6290 // they care about that, then the clients should make sure they
6291 // enforce some kind of concurrency mechanism -- from our
6292 // perspective that's what Douglas Adams would call an SEP.
6294 // This should be used as a general guideline for most commands handled
6295 // in this function. Adapt as you see fit, but please bear in mind that
6296 // this is the expected behavior.
6299 if (prefix
== "osd setcrushmap" ||
6300 (prefix
== "osd crush set" && !osdid_present
)) {
6301 dout(10) << "prepare_command setting new crush map" << dendl
;
6302 bufferlist
data(m
->get_data());
6305 bufferlist::iterator
bl(data
.begin());
6308 catch (const std::exception
&e
) {
6310 ss
<< "Failed to parse crushmap: " << e
.what();
6314 if (!validate_crush_against_features(&crush
, ss
)) {
6319 const auto& osdmap_pools
= osdmap
.get_pools();
6320 for (auto pit
= osdmap_pools
.begin(); pit
!= osdmap_pools
.end(); ++pit
) {
6321 const int64_t pool_id
= pit
->first
;
6322 const pg_pool_t
&pool
= pit
->second
;
6323 int ruleno
= pool
.get_crush_ruleset();
6324 if (!crush
.rule_exists(ruleno
)) {
6325 ss
<< " the crush rule no "<< ruleno
<< " for pool id " << pool_id
<< " is in use";
6331 // sanity check: test some inputs to make sure this map isn't totally broken
6332 dout(10) << " testing map" << dendl
;
6334 CrushTester
tester(crush
, ess
);
6335 // XXX: Use mon_lease as a timeout value for crushtool.
6336 // If the crushtool consistently takes longer than 'mon_lease' seconds,
6337 // then we would consistently trigger an election before the command
6338 // finishes, having a flapping monitor unable to hold quorum.
6339 int r
= tester
.test_with_crushtool(g_conf
->crushtool
.c_str(),
6340 osdmap
.get_max_osd(),
6343 derr
<< "error on crush map: " << ess
.str() << dendl
;
6344 ss
<< "Failed crushmap test: " << ess
.str();
6349 dout(10) << " result " << ess
.str() << dendl
;
6351 pending_inc
.crush
= data
;
6352 ss
<< "set crush map";
6355 } else if (prefix
== "osd crush set-device-class") {
6356 if (!osdmap
.exists(osdid
)) {
6358 ss
<< name
<< " does not exist. create it before updating the crush map";
6362 string device_class
;
6363 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
6364 err
= -EINVAL
; // no value!
6368 CrushWrapper newcrush
;
6369 _get_pending_crush(newcrush
);
6372 if (newcrush
.item_exists(osdid
)) {
6373 action
= "updating";
6375 action
= "creating";
6376 newcrush
.set_item_name(osdid
, name
);
6379 dout(5) << action
<< " crush item id " << osdid
<< " name '"
6380 << name
<< "' device_class " << device_class
<< dendl
;
6381 err
= newcrush
.update_device_class(g_ceph_context
, osdid
, device_class
, name
);
6386 if (err
== 0 && !_have_pending_crush()) {
6387 ss
<< "set-device-class item id " << osdid
<< " name '" << name
<< "' device_class "
6388 << device_class
<< " : no change";
6392 pending_inc
.crush
.clear();
6393 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6394 ss
<< "set-device-class item id " << osdid
<< " name '" << name
<< "' device_class "
6397 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6398 get_last_committed() + 1));
6401 } else if (prefix
== "osd crush add-bucket") {
6402 // os crush add-bucket <name> <type>
6403 string name
, typestr
;
6404 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
6405 cmd_getval(g_ceph_context
, cmdmap
, "type", typestr
);
6407 if (!_have_pending_crush() &&
6408 _get_stable_crush().name_exists(name
)) {
6409 ss
<< "bucket '" << name
<< "' already exists";
6413 CrushWrapper newcrush
;
6414 _get_pending_crush(newcrush
);
6416 if (newcrush
.name_exists(name
)) {
6417 ss
<< "bucket '" << name
<< "' already exists";
6420 int type
= newcrush
.get_type_id(typestr
);
6422 ss
<< "type '" << typestr
<< "' does not exist";
6427 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
6432 err
= newcrush
.add_bucket(0, 0,
6433 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
6436 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
6439 err
= newcrush
.set_item_name(bucketno
, name
);
6441 ss
<< "error setting bucket name to '" << name
<< "'";
6445 pending_inc
.crush
.clear();
6446 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6447 ss
<< "added bucket " << name
<< " type " << typestr
6450 } else if (prefix
== "osd crush rename-bucket") {
6451 string srcname
, dstname
;
6452 cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
);
6453 cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
);
6455 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
6456 if (err
== -EALREADY
) // equivalent to success for idempotency
6462 } else if (prefix
== "osd crush class create") {
6463 string device_class
;
6464 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
6465 err
= -EINVAL
; // no value!
6469 if (!_have_pending_crush() &&
6470 _get_stable_crush().class_exists(device_class
)) {
6471 ss
<< "class '" << device_class
<< "' already exists";
6475 CrushWrapper newcrush
;
6476 _get_pending_crush(newcrush
);
6478 if (newcrush
.class_exists(name
)) {
6479 ss
<< "class '" << device_class
<< "' already exists";
6483 int class_id
= newcrush
.get_or_create_class_id(device_class
);
6485 pending_inc
.crush
.clear();
6486 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6487 ss
<< "created class " << device_class
<< " with id " << class_id
6491 } else if (prefix
== "osd crush class rm") {
6492 string device_class
;
6493 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
6494 err
= -EINVAL
; // no value!
6498 CrushWrapper newcrush
;
6499 _get_pending_crush(newcrush
);
6501 if (!newcrush
.class_exists(device_class
)) {
6503 ss
<< "class '" << device_class
<< "' does not exist";
6507 int class_id
= newcrush
.get_class_id(device_class
);
6509 if (newcrush
.class_is_in_use(class_id
)) {
6511 ss
<< "class '" << device_class
<< "' is in use";
6515 err
= newcrush
.remove_class_name(device_class
);
6517 ss
<< "class '" << device_class
<< "' cannot be removed '"
6518 << cpp_strerror(err
) << "'";
6522 pending_inc
.crush
.clear();
6523 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6524 ss
<< "removed class " << device_class
<< " with id " << class_id
6525 << " from crush map";
6528 } else if (osdid_present
&&
6529 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
6530 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
6531 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
6532 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
6534 if (!osdmap
.exists(osdid
)) {
6536 ss
<< name
<< " does not exist. create it before updating the crush map";
6541 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
6542 ss
<< "unable to parse weight value '"
6543 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
6549 vector
<string
> argvec
;
6550 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
6551 map
<string
,string
> loc
;
6552 CrushWrapper::parse_loc_map(argvec
, &loc
);
6554 if (prefix
== "osd crush set"
6555 && !_get_stable_crush().item_exists(osdid
)) {
6557 ss
<< "unable to set item id " << osdid
<< " name '" << name
6558 << "' weight " << weight
<< " at location " << loc
6559 << ": does not exist";
6563 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
6564 << name
<< "' weight " << weight
<< " at location "
6566 CrushWrapper newcrush
;
6567 _get_pending_crush(newcrush
);
6570 if (prefix
== "osd crush set" ||
6571 newcrush
.check_item_loc(g_ceph_context
, osdid
, loc
, (int *)NULL
)) {
6573 err
= newcrush
.update_item(g_ceph_context
, osdid
, weight
, name
, loc
);
6576 err
= newcrush
.insert_item(g_ceph_context
, osdid
, weight
, name
, loc
);
6584 if (err
== 0 && !_have_pending_crush()) {
6585 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
6586 << weight
<< " at location " << loc
<< ": no change";
6590 pending_inc
.crush
.clear();
6591 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6592 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
6593 << weight
<< " at location " << loc
<< " to crush map";
6595 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6596 get_last_committed() + 1));
6599 } else if (prefix
== "osd crush create-or-move") {
6601 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
6602 if (!osdmap
.exists(osdid
)) {
6604 ss
<< name
<< " does not exist. create it before updating the crush map";
6609 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
6610 ss
<< "unable to parse weight value '"
6611 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
6617 vector
<string
> argvec
;
6618 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
6619 map
<string
,string
> loc
;
6620 CrushWrapper::parse_loc_map(argvec
, &loc
);
6622 dout(0) << "create-or-move crush item name '" << name
<< "' initial_weight " << weight
6623 << " at location " << loc
<< dendl
;
6625 CrushWrapper newcrush
;
6626 _get_pending_crush(newcrush
);
6628 err
= newcrush
.create_or_move_item(g_ceph_context
, osdid
, weight
, name
, loc
);
6630 ss
<< "create-or-move updated item name '" << name
<< "' weight " << weight
6631 << " at location " << loc
<< " to crush map";
6635 pending_inc
.crush
.clear();
6636 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6637 ss
<< "create-or-move updating item name '" << name
<< "' weight " << weight
6638 << " at location " << loc
<< " to crush map";
6640 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6641 get_last_committed() + 1));
6646 } else if (prefix
== "osd crush move") {
6648 // osd crush move <name> <loc1> [<loc2> ...]
6651 vector
<string
> argvec
;
6652 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
6653 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
6654 map
<string
,string
> loc
;
6655 CrushWrapper::parse_loc_map(argvec
, &loc
);
6657 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
6658 CrushWrapper newcrush
;
6659 _get_pending_crush(newcrush
);
6661 if (!newcrush
.name_exists(name
)) {
6663 ss
<< "item " << name
<< " does not exist";
6666 int id
= newcrush
.get_item_id(name
);
6668 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
6670 err
= newcrush
.create_or_move_item(g_ceph_context
, id
, 0, name
, loc
);
6672 err
= newcrush
.move_bucket(g_ceph_context
, id
, loc
);
6675 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
6676 pending_inc
.crush
.clear();
6677 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6679 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6680 get_last_committed() + 1));
6684 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
6689 } else if (prefix
== "osd crush link") {
6690 // osd crush link <name> <loc1> [<loc2> ...]
6692 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
6693 vector
<string
> argvec
;
6694 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
6695 map
<string
,string
> loc
;
6696 CrushWrapper::parse_loc_map(argvec
, &loc
);
6698 // Need an explicit check for name_exists because get_item_id returns
6700 int id
= osdmap
.crush
->get_item_id(name
);
6701 if (!osdmap
.crush
->name_exists(name
)) {
6703 ss
<< "item " << name
<< " does not exist";
6706 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
6708 if (osdmap
.crush
->check_item_loc(g_ceph_context
, id
, loc
, (int*) NULL
)) {
6709 ss
<< "no need to move item id " << id
<< " name '" << name
6710 << "' to location " << loc
<< " in crush map";
6715 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
6716 CrushWrapper newcrush
;
6717 _get_pending_crush(newcrush
);
6719 if (!newcrush
.name_exists(name
)) {
6721 ss
<< "item " << name
<< " does not exist";
6724 int id
= newcrush
.get_item_id(name
);
6725 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
6726 err
= newcrush
.link_bucket(g_ceph_context
, id
, loc
);
6728 ss
<< "linked item id " << id
<< " name '" << name
6729 << "' to location " << loc
<< " in crush map";
6730 pending_inc
.crush
.clear();
6731 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6733 ss
<< "cannot link item id " << id
<< " name '" << name
6734 << "' to location " << loc
;
6738 ss
<< "no need to move item id " << id
<< " name '" << name
6739 << "' to location " << loc
<< " in crush map";
6743 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
6744 get_last_committed() + 1));
6746 } else if (prefix
== "osd crush rm" ||
6747 prefix
== "osd crush remove" ||
6748 prefix
== "osd crush unlink") {
6750 // osd crush rm <id> [ancestor]
6751 CrushWrapper newcrush
;
6752 _get_pending_crush(newcrush
);
6755 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
6757 if (!osdmap
.crush
->name_exists(name
)) {
6759 ss
<< "device '" << name
<< "' does not appear in the crush map";
6762 if (!newcrush
.name_exists(name
)) {
6764 ss
<< "device '" << name
<< "' does not appear in the crush map";
6766 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6767 get_last_committed() + 1));
6770 int id
= newcrush
.get_item_id(name
);
6771 bool unlink_only
= prefix
== "osd crush unlink";
6772 string ancestor_str
;
6773 if (cmd_getval(g_ceph_context
, cmdmap
, "ancestor", ancestor_str
)) {
6774 if (!newcrush
.name_exists(ancestor_str
)) {
6776 ss
<< "ancestor item '" << ancestor_str
6777 << "' does not appear in the crush map";
6780 int ancestor
= newcrush
.get_item_id(ancestor_str
);
6781 err
= newcrush
.remove_item_under(g_ceph_context
, id
, ancestor
,
6784 err
= newcrush
.remove_item(g_ceph_context
, id
, unlink_only
);
6786 if (err
== -ENOENT
) {
6787 ss
<< "item " << id
<< " does not appear in that position";
6792 pending_inc
.crush
.clear();
6793 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6794 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
6796 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6797 get_last_committed() + 1));
6802 } else if (prefix
== "osd crush reweight-all") {
6803 // osd crush reweight <name> <weight>
6804 CrushWrapper newcrush
;
6805 _get_pending_crush(newcrush
);
6807 newcrush
.reweight(g_ceph_context
);
6808 pending_inc
.crush
.clear();
6809 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6810 ss
<< "reweighted crush hierarchy";
6812 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6813 get_last_committed() + 1));
6815 } else if (prefix
== "osd crush reweight") {
6816 // osd crush reweight <name> <weight>
6817 CrushWrapper newcrush
;
6818 _get_pending_crush(newcrush
);
6821 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
6822 if (!newcrush
.name_exists(name
)) {
6824 ss
<< "device '" << name
<< "' does not appear in the crush map";
6828 int id
= newcrush
.get_item_id(name
);
6830 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
6835 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
6836 ss
<< "unable to parse weight value '"
6837 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
6842 err
= newcrush
.adjust_item_weightf(g_ceph_context
, id
, w
);
6845 pending_inc
.crush
.clear();
6846 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6847 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
6850 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6851 get_last_committed() + 1));
6853 } else if (prefix
== "osd crush reweight-subtree") {
6854 // osd crush reweight <name> <weight>
6855 CrushWrapper newcrush
;
6856 _get_pending_crush(newcrush
);
6859 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
6860 if (!newcrush
.name_exists(name
)) {
6862 ss
<< "device '" << name
<< "' does not appear in the crush map";
6866 int id
= newcrush
.get_item_id(name
);
6868 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
6873 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
6874 ss
<< "unable to parse weight value '"
6875 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
6880 err
= newcrush
.adjust_subtree_weightf(g_ceph_context
, id
, w
);
6883 pending_inc
.crush
.clear();
6884 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6885 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
6888 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6889 get_last_committed() + 1));
6891 } else if (prefix
== "osd crush tunables") {
6892 CrushWrapper newcrush
;
6893 _get_pending_crush(newcrush
);
6897 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
6898 if (profile
== "legacy" || profile
== "argonaut") {
6899 newcrush
.set_tunables_legacy();
6900 } else if (profile
== "bobtail") {
6901 newcrush
.set_tunables_bobtail();
6902 } else if (profile
== "firefly") {
6903 newcrush
.set_tunables_firefly();
6904 } else if (profile
== "hammer") {
6905 newcrush
.set_tunables_hammer();
6906 } else if (profile
== "jewel") {
6907 newcrush
.set_tunables_jewel();
6908 } else if (profile
== "optimal") {
6909 newcrush
.set_tunables_optimal();
6910 } else if (profile
== "default") {
6911 newcrush
.set_tunables_default();
6913 ss
<< "unrecognized profile '" << profile
<< "'";
6918 if (!validate_crush_against_features(&newcrush
, ss
)) {
6923 pending_inc
.crush
.clear();
6924 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6925 ss
<< "adjusted tunables profile to " << profile
;
6927 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6928 get_last_committed() + 1));
6930 } else if (prefix
== "osd crush set-tunable") {
6931 CrushWrapper newcrush
;
6932 _get_pending_crush(newcrush
);
6936 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
6939 if (!cmd_getval(g_ceph_context
, cmdmap
, "value", value
)) {
6941 ss
<< "failed to parse integer value " << cmd_vartype_stringify(cmdmap
["value"]);
6945 if (tunable
== "straw_calc_version") {
6946 if (value
< 0 || value
> 1) {
6947 ss
<< "value must be 0 or 1; got " << value
;
6951 newcrush
.set_straw_calc_version(value
);
6953 ss
<< "unrecognized tunable '" << tunable
<< "'";
6958 if (!validate_crush_against_features(&newcrush
, ss
)) {
6963 pending_inc
.crush
.clear();
6964 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6965 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
6967 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
6968 get_last_committed() + 1));
6971 } else if (prefix
== "osd crush rule create-simple") {
6972 string name
, root
, type
, mode
;
6973 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
6974 cmd_getval(g_ceph_context
, cmdmap
, "root", root
);
6975 cmd_getval(g_ceph_context
, cmdmap
, "type", type
);
6976 cmd_getval(g_ceph_context
, cmdmap
, "mode", mode
);
6980 if (osdmap
.crush
->rule_exists(name
)) {
6981 // The name is uniquely associated to a ruleid and the ruleset it contains
6982 // From the user point of view, the ruleset is more meaningfull.
6983 ss
<< "ruleset " << name
<< " already exists";
6988 CrushWrapper newcrush
;
6989 _get_pending_crush(newcrush
);
6991 if (newcrush
.rule_exists(name
)) {
6992 // The name is uniquely associated to a ruleid and the ruleset it contains
6993 // From the user point of view, the ruleset is more meaningfull.
6994 ss
<< "ruleset " << name
<< " already exists";
6997 int ruleno
= newcrush
.add_simple_ruleset(name
, root
, type
, mode
,
6998 pg_pool_t::TYPE_REPLICATED
, &ss
);
7004 pending_inc
.crush
.clear();
7005 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7008 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7009 get_last_committed() + 1));
7012 } else if (prefix
== "osd erasure-code-profile rm") {
7014 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7016 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
7019 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
7024 if (osdmap
.has_erasure_code_profile(name
) ||
7025 pending_inc
.new_erasure_code_profiles
.count(name
)) {
7026 if (osdmap
.has_erasure_code_profile(name
)) {
7027 pending_inc
.old_erasure_code_profiles
.push_back(name
);
7029 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
7030 pending_inc
.new_erasure_code_profiles
.erase(name
);
7034 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7035 get_last_committed() + 1));
7038 ss
<< "erasure-code-profile " << name
<< " does not exist";
7043 } else if (prefix
== "osd erasure-code-profile set") {
7045 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7046 vector
<string
> profile
;
7047 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
7049 if (profile
.size() > 0 && profile
.back() == "--force") {
7055 map
<string
,string
> profile_map
;
7056 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
7059 if (profile_map
.find("plugin") == profile_map
.end()) {
7060 ss
<< "erasure-code-profile " << profile_map
7061 << " must contain a plugin entry" << std::endl
;
7065 string plugin
= profile_map
["plugin"];
7067 if (pending_inc
.has_erasure_code_profile(name
)) {
7068 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
7071 if (plugin
== "isa" || plugin
== "lrc") {
7072 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
, ss
);
7077 } else if (plugin
== "shec") {
7078 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
, ss
);
7084 err
= normalize_profile(name
, profile_map
, force
, &ss
);
7088 if (osdmap
.has_erasure_code_profile(name
)) {
7089 ErasureCodeProfile existing_profile_map
=
7090 osdmap
.get_erasure_code_profile(name
);
7091 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
7095 if (existing_profile_map
== profile_map
) {
7101 ss
<< "will not override erasure code profile " << name
7102 << " because the existing profile "
7103 << existing_profile_map
7104 << " is different from the proposed profile "
7110 dout(20) << "erasure code profile set " << name
<< "="
7111 << profile_map
<< dendl
;
7112 pending_inc
.set_erasure_code_profile(name
, profile_map
);
7116 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7117 get_last_committed() + 1));
7120 } else if (prefix
== "osd crush rule create-erasure") {
7121 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
7126 string name
, poolstr
;
7127 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7129 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
7131 profile
= "default";
7132 if (profile
== "default") {
7133 if (!osdmap
.has_erasure_code_profile(profile
)) {
7134 if (pending_inc
.has_erasure_code_profile(profile
)) {
7135 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
7139 map
<string
,string
> profile_map
;
7140 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
7145 err
= normalize_profile(name
, profile_map
, true, &ss
);
7148 dout(20) << "erasure code profile set " << profile
<< "="
7149 << profile_map
<< dendl
;
7150 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
7156 err
= crush_ruleset_create_erasure(name
, profile
, &ruleset
, &ss
);
7159 case -EEXIST
: // return immediately
7160 ss
<< "rule " << name
<< " already exists";
7164 case -EALREADY
: // wait for pending to be proposed
7165 ss
<< "rule " << name
<< " already exists";
7168 default: // non recoverable error
7173 ss
<< "created ruleset " << name
<< " at " << ruleset
;
7177 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7178 get_last_committed() + 1));
7181 } else if (prefix
== "osd crush rule rm") {
7183 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7185 if (!osdmap
.crush
->rule_exists(name
)) {
7186 ss
<< "rule " << name
<< " does not exist";
7191 CrushWrapper newcrush
;
7192 _get_pending_crush(newcrush
);
7194 if (!newcrush
.rule_exists(name
)) {
7195 ss
<< "rule " << name
<< " does not exist";
7198 int ruleno
= newcrush
.get_rule_id(name
);
7199 assert(ruleno
>= 0);
7201 // make sure it is not in use.
7202 // FIXME: this is ok in some situations, but let's not bother with that
7204 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
7205 if (osdmap
.crush_ruleset_in_use(ruleset
)) {
7206 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
7211 err
= newcrush
.remove_rule(ruleno
);
7216 pending_inc
.crush
.clear();
7217 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7220 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7221 get_last_committed() + 1));
7224 } else if (prefix
== "osd setmaxosd") {
7226 if (!cmd_getval(g_ceph_context
, cmdmap
, "newmax", newmax
)) {
7227 ss
<< "unable to parse 'newmax' value '"
7228 << cmd_vartype_stringify(cmdmap
["newmax"]) << "'";
7233 if (newmax
> g_conf
->mon_max_osd
) {
7235 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
7236 << g_conf
->mon_max_osd
<< ")";
7240 // Don't allow shrinking OSD number as this will cause data loss
7241 // and may cause kernel crashes.
7242 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
7243 if (newmax
< osdmap
.get_max_osd()) {
7244 // Check if the OSDs exist between current max and new value.
7245 // If there are any OSDs exist, then don't allow shrinking number
7247 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
7248 if (osdmap
.exists(i
)) {
7250 ss
<< "cannot shrink max_osd to " << newmax
7251 << " because osd." << i
<< " (and possibly others) still in use";
7257 pending_inc
.new_max_osd
= newmax
;
7258 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
7260 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7261 get_last_committed() + 1));
7264 } else if (prefix
== "osd set-full-ratio" ||
7265 prefix
== "osd set-backfillfull-ratio" ||
7266 prefix
== "osd set-nearfull-ratio") {
7267 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
7268 ss
<< "you must complete the upgrade and set require_luminous_osds before"
7269 << " using the new interface";
7274 if (!cmd_getval(g_ceph_context
, cmdmap
, "ratio", n
)) {
7275 ss
<< "unable to parse 'ratio' value '"
7276 << cmd_vartype_stringify(cmdmap
["who"]) << "'";
7280 if (prefix
== "osd set-full-ratio")
7281 pending_inc
.new_full_ratio
= n
;
7282 else if (prefix
== "osd set-backfillfull-ratio")
7283 pending_inc
.new_backfillfull_ratio
= n
;
7284 else if (prefix
== "osd set-nearfull-ratio")
7285 pending_inc
.new_nearfull_ratio
= n
;
7286 ss
<< prefix
<< " " << n
;
7288 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7289 get_last_committed() + 1));
7291 } else if (prefix
== "osd set-require-min-compat-client") {
7292 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
7293 ss
<< "you must complete the upgrade and set require_luminous_osds before"
7294 << " using the new interface";
7299 cmd_getval(g_ceph_context
, cmdmap
, "version", v
);
7300 if (v
!= "luminous" && v
!= "kraken" && v
!= "jewel" && v
!= "infernalis" &&
7301 v
!= "hammer" && v
!= "giant" && v
!= "firefly" && v
!= "emperor" &&
7302 v
!= "dumpling" && v
!= "cuttlefish" && v
!= "bobtail" && v
!= "argonaut") {
7303 ss
<< "version " << v
<< " is not recognized";
7308 newmap
.deepish_copy_from(osdmap
);
7309 newmap
.apply_incremental(pending_inc
);
7310 newmap
.require_min_compat_client
= v
;
7311 auto mv
= newmap
.get_min_compat_client();
7313 ss
<< "osdmap current utilizes features that require " << mv
7314 << "; cannot set require_min_compat_client below that to " << v
;
7318 ss
<< "set require_min_compat_client to " << v
;
7319 pending_inc
.new_require_min_compat_client
= v
;
7321 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7322 get_last_committed() + 1));
7324 } else if (prefix
== "osd pause") {
7325 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
7327 } else if (prefix
== "osd unpause") {
7328 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
7330 } else if (prefix
== "osd set") {
7332 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
7334 return prepare_set_flag(op
, CEPH_OSDMAP_FULL
);
7335 else if (key
== "pause")
7336 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
7337 else if (key
== "noup")
7338 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
7339 else if (key
== "nodown")
7340 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
7341 else if (key
== "noout")
7342 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
7343 else if (key
== "noin")
7344 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
7345 else if (key
== "nobackfill")
7346 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
7347 else if (key
== "norebalance")
7348 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
7349 else if (key
== "norecover")
7350 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
7351 else if (key
== "noscrub")
7352 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
7353 else if (key
== "nodeep-scrub")
7354 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
7355 else if (key
== "notieragent")
7356 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
7357 else if (key
== "sortbitwise") {
7358 if (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
) {
7359 return prepare_set_flag(op
, CEPH_OSDMAP_SORTBITWISE
);
7361 ss
<< "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
7364 } else if (key
== "require_jewel_osds") {
7365 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
7366 ss
<< "the sortbitwise flag must be set before require_jewel_osds";
7368 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
)) {
7369 return prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_JEWEL
);
7371 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
7374 } else if (key
== "require_kraken_osds") {
7375 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
7376 ss
<< "the sortbitwise flag must be set before require_kraken_osds";
7378 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
)) {
7379 bool r
= prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_KRAKEN
);
7380 // ensure JEWEL is also set
7381 pending_inc
.new_flags
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
7384 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
7387 } else if (key
== "require_luminous_osds") {
7388 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
7389 ss
<< "the sortbitwise flag must be set before require_luminous_osds";
7391 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
)) {
7392 bool r
= prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_LUMINOUS
);
7393 // ensure JEWEL and KRAKEN are also set
7394 pending_inc
.new_flags
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
7395 pending_inc
.new_flags
|= CEPH_OSDMAP_REQUIRE_KRAKEN
;
7398 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
7402 ss
<< "unrecognized flag '" << key
<< "'";
7406 } else if (prefix
== "osd unset") {
7408 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
7410 return prepare_unset_flag(op
, CEPH_OSDMAP_FULL
);
7411 else if (key
== "pause")
7412 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
7413 else if (key
== "noup")
7414 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
7415 else if (key
== "nodown")
7416 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
7417 else if (key
== "noout")
7418 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
7419 else if (key
== "noin")
7420 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
7421 else if (key
== "nobackfill")
7422 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
7423 else if (key
== "norebalance")
7424 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
7425 else if (key
== "norecover")
7426 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
7427 else if (key
== "noscrub")
7428 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
7429 else if (key
== "nodeep-scrub")
7430 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
7431 else if (key
== "notieragent")
7432 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
7433 else if (key
== "sortbitwise") {
7434 ss
<< "the sortbitwise flag is required and cannot be unset";
7437 ss
<< "unrecognized flag '" << key
<< "'";
7441 } else if (prefix
== "osd cluster_snap") {
7442 // ** DISABLE THIS FOR NOW **
7443 ss
<< "cluster snapshot currently disabled (broken implementation)";
7444 // ** DISABLE THIS FOR NOW **
7446 } else if (prefix
== "osd down" ||
7447 prefix
== "osd out" ||
7448 prefix
== "osd in" ||
7449 prefix
== "osd rm") {
7453 vector
<string
> idvec
;
7454 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
7455 for (unsigned j
= 0; j
< idvec
.size(); j
++) {
7456 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
7458 ss
<< "invalid osd id" << osd
;
7461 } else if (!osdmap
.exists(osd
)) {
7462 ss
<< "osd." << osd
<< " does not exist. ";
7465 if (prefix
== "osd down") {
7466 if (osdmap
.is_down(osd
)) {
7467 ss
<< "osd." << osd
<< " is already down. ";
7469 pending_inc
.new_state
[osd
] = CEPH_OSD_UP
;
7470 ss
<< "marked down osd." << osd
<< ". ";
7473 } else if (prefix
== "osd out") {
7474 if (osdmap
.is_out(osd
)) {
7475 ss
<< "osd." << osd
<< " is already out. ";
7477 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
7478 if (osdmap
.osd_weight
[osd
]) {
7479 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
7480 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
7482 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
7484 ss
<< "marked out osd." << osd
<< ". ";
7487 } else if (prefix
== "osd in") {
7488 if (osdmap
.is_in(osd
)) {
7489 ss
<< "osd." << osd
<< " is already in. ";
7491 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
7492 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
7493 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
7494 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
7496 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
7498 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
7500 ss
<< "marked in osd." << osd
<< ". ";
7503 } else if (prefix
== "osd rm") {
7504 if (osdmap
.is_up(osd
)) {
7507 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
7510 pending_inc
.new_state
[osd
] = osdmap
.get_state(osd
);
7511 pending_inc
.new_uuid
[osd
] = uuid_d();
7512 pending_metadata_rm
.insert(osd
);
7514 ss
<< ", osd." << osd
;
7516 ss
<< "removed osd." << osd
;
7524 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
7525 get_last_committed() + 1));
7528 } else if (prefix
== "osd pg-temp") {
7530 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
7531 ss
<< "unable to parse 'pgid' value '"
7532 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
7537 if (!pgid
.parse(pgidstr
.c_str())) {
7538 ss
<< "invalid pgid '" << pgidstr
<< "'";
7542 if (!osdmap
.pg_exists(pgid
)) {
7543 ss
<< "pg " << pgid
<< " does not exist";
7547 if (pending_inc
.new_pg_temp
.count(pgid
)) {
7548 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
7549 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7553 vector
<int64_t> id_vec
;
7554 vector
<int32_t> new_pg_temp
;
7555 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
7556 ss
<< "unable to parse 'id' value(s) '"
7557 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
7561 for (auto osd
: id_vec
) {
7562 if (!osdmap
.exists(osd
)) {
7563 ss
<< "osd." << osd
<< " does not exist";
7567 new_pg_temp
.push_back(osd
);
7570 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
7571 new_pg_temp
.begin(), new_pg_temp
.end());
7572 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
7574 } else if (prefix
== "osd primary-temp") {
7576 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
7577 ss
<< "unable to parse 'pgid' value '"
7578 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
7583 if (!pgid
.parse(pgidstr
.c_str())) {
7584 ss
<< "invalid pgid '" << pgidstr
<< "'";
7588 if (!osdmap
.pg_exists(pgid
)) {
7589 ss
<< "pg " << pgid
<< " does not exist";
7595 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
7596 ss
<< "unable to parse 'id' value '"
7597 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
7601 if (osd
!= -1 && !osdmap
.exists(osd
)) {
7602 ss
<< "osd." << osd
<< " does not exist";
7607 if (osdmap
.require_min_compat_client
.length() &&
7608 osdmap
.require_min_compat_client
< "firefly") {
7609 ss
<< "require_min_compat_client " << osdmap
.require_min_compat_client
7610 << " < firefly, which is required for primary-temp";
7613 } else if (!g_conf
->mon_osd_allow_primary_temp
) {
7614 ss
<< "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
7619 pending_inc
.new_primary_temp
[pgid
] = osd
;
7620 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
7622 } else if (prefix
== "osd pg-upmap") {
7623 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
7624 ss
<< "you must set the require_luminous_osds flag to use this feature";
7628 if (osdmap
.require_min_compat_client
< "luminous") {
7629 ss
<< "min_compat_client " << osdmap
.require_min_compat_client
7630 << " < luminous, which is required for pg-upmap";
7634 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
7640 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
7641 ss
<< "unable to parse 'pgid' value '"
7642 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
7647 if (!pgid
.parse(pgidstr
.c_str())) {
7648 ss
<< "invalid pgid '" << pgidstr
<< "'";
7652 if (!osdmap
.pg_exists(pgid
)) {
7653 ss
<< "pg " << pgid
<< " does not exist";
7657 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
7658 pending_inc
.old_pg_upmap
.count(pgid
)) {
7659 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
7660 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7663 vector
<int64_t> id_vec
;
7664 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
7665 ss
<< "unable to parse 'id' value(s) '"
7666 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
7670 vector
<int32_t> new_pg_upmap
;
7671 for (auto osd
: id_vec
) {
7672 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
7673 ss
<< "osd." << osd
<< " does not exist";
7677 new_pg_upmap
.push_back(osd
);
7680 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
7681 new_pg_upmap
.begin(), new_pg_upmap
.end());
7682 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
7684 } else if (prefix
== "osd rm-pg-upmap") {
7685 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
7686 ss
<< "you must set the require_luminous_osds flag to use this feature";
7690 if (osdmap
.require_min_compat_client
< "luminous") {
7691 ss
<< "require_min_compat_client " << osdmap
.require_min_compat_client
7692 << " < luminous, which is required for pg-upmap";
7696 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
7702 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
7703 ss
<< "unable to parse 'pgid' value '"
7704 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
7709 if (!pgid
.parse(pgidstr
.c_str())) {
7710 ss
<< "invalid pgid '" << pgidstr
<< "'";
7714 if (!osdmap
.pg_exists(pgid
)) {
7715 ss
<< "pg " << pgid
<< " does not exist";
7719 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
7720 pending_inc
.old_pg_upmap
.count(pgid
)) {
7721 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
7722 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7726 pending_inc
.old_pg_upmap
.insert(pgid
);
7727 ss
<< "clear " << pgid
<< " pg_upmap mapping";
7729 } else if (prefix
== "osd pg-upmap-items") {
7730 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
7731 ss
<< "you must set the require_luminous_osds flag to use this feature";
7735 if (osdmap
.require_min_compat_client
< "luminous") {
7736 ss
<< "require_min_compat_client " << osdmap
.require_min_compat_client
7737 << " < luminous, which is required for pg-upmap";
7741 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
7747 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
7748 ss
<< "unable to parse 'pgid' value '"
7749 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
7754 if (!pgid
.parse(pgidstr
.c_str())) {
7755 ss
<< "invalid pgid '" << pgidstr
<< "'";
7759 if (!osdmap
.pg_exists(pgid
)) {
7760 ss
<< "pg " << pgid
<< " does not exist";
7764 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
7765 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
7766 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
7767 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7770 vector
<int64_t> id_vec
;
7771 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
7772 ss
<< "unable to parse 'id' value(s) '"
7773 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
7777 if (id_vec
.size() % 2) {
7778 ss
<< "you must specify pairs of osd ids to be remapped";
7782 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
7783 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
7786 if (!osdmap
.exists(from
)) {
7787 ss
<< "osd." << from
<< " does not exist";
7791 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
7792 ss
<< "osd." << to
<< " does not exist";
7796 new_pg_upmap_items
.push_back(make_pair(from
, to
));
7799 pending_inc
.new_pg_upmap_items
[pgid
] =
7800 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
7801 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
7802 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << new_pg_upmap_items
;
7804 } else if (prefix
== "osd rm-pg-upmap-items") {
7805 if (!osdmap
.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
7806 ss
<< "you must set the require_luminous_osds flag to use this feature";
7810 if (osdmap
.require_min_compat_client
< "luminous") {
7811 ss
<< "require_min_compat_client " << osdmap
.require_min_compat_client
7812 << " < luminous, which is required for pg-upmap";
7816 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
7822 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
7823 ss
<< "unable to parse 'pgid' value '"
7824 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
7829 if (!pgid
.parse(pgidstr
.c_str())) {
7830 ss
<< "invalid pgid '" << pgidstr
<< "'";
7834 if (!osdmap
.pg_exists(pgid
)) {
7835 ss
<< "pg " << pgid
<< " does not exist";
7839 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
7840 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
7841 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
7842 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7846 pending_inc
.old_pg_upmap_items
.insert(pgid
);
7847 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
7849 } else if (prefix
== "osd primary-affinity") {
7851 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
7852 ss
<< "invalid osd id value '"
7853 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
7858 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
7859 ss
<< "unable to parse 'weight' value '"
7860 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7864 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
7866 ss
<< "weight must be >= 0";
7870 if (osdmap
.require_min_compat_client
.length() &&
7871 osdmap
.require_min_compat_client
< "firefly") {
7872 ss
<< "require_min_compat_client " << osdmap
.require_min_compat_client
7873 << " < firefly, which is required for primary-affinity";
7876 } else if (!g_conf
->mon_osd_allow_primary_affinity
) {
7877 ss
<< "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
7881 err
= check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY
, ss
);
7886 if (osdmap
.exists(id
)) {
7887 pending_inc
.new_primary_affinity
[id
] = ww
;
7888 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
7890 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7891 get_last_committed() + 1));
7894 ss
<< "osd." << id
<< " does not exist";
7898 } else if (prefix
== "osd reweight") {
7900 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
7901 ss
<< "unable to parse osd id value '"
7902 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
7907 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
7908 ss
<< "unable to parse weight value '"
7909 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7913 long ww
= (int)((double)CEPH_OSD_IN
*w
);
7915 ss
<< "weight must be >= 0";
7919 if (osdmap
.exists(id
)) {
7920 pending_inc
.new_weight
[id
] = ww
;
7921 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
7923 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7924 get_last_committed() + 1));
7927 ss
<< "osd." << id
<< " does not exist";
7931 } else if (prefix
== "osd reweightn") {
7932 map
<int32_t, uint32_t> weights
;
7933 err
= parse_reweights(g_ceph_context
, cmdmap
, osdmap
, &weights
);
7935 ss
<< "unable to parse 'weights' value '"
7936 << cmd_vartype_stringify(cmdmap
["weights"]) << "'";
7939 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
7940 wait_for_finished_proposal(
7942 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
7944 } else if (prefix
== "osd lost") {
7946 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
7947 ss
<< "unable to parse osd id value '"
7948 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
7953 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) || sure
!= "--yes-i-really-mean-it") {
7954 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
7955 "--yes-i-really-mean-it if you really do.";
7958 } else if (!osdmap
.exists(id
)) {
7959 ss
<< "osd." << id
<< " does not exist";
7962 } else if (!osdmap
.is_down(id
)) {
7963 ss
<< "osd." << id
<< " is not down";
7967 epoch_t e
= osdmap
.get_info(id
).down_at
;
7968 pending_inc
.new_lost
[id
] = e
;
7969 ss
<< "marked osd lost in epoch " << e
;
7971 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7972 get_last_committed() + 1));
7976 } else if (prefix
== "osd create") {
7979 // optional id provided?
7981 if (cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
7983 ss
<< "invalid osd id value '" << id
<< "'";
7987 dout(10) << " osd create got id " << id
<< dendl
;
7990 // optional uuid provided?
7993 if (cmd_getval(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
7994 if (!uuid
.parse(uuidstr
.c_str())) {
7995 ss
<< "invalid uuid value '" << uuidstr
<< "'";
7999 dout(10) << " osd create got uuid " << uuid
<< dendl
;
8000 i
= osdmap
.identify_osd(uuid
);
8002 // osd already exists
8003 if (id
>= 0 && i
!= id
) {
8004 ss
<< "uuid " << uuidstr
<< " already in use for different id " << i
;
8010 f
->open_object_section("created_osd");
8011 f
->dump_int("osdid", i
);
8022 if (osdmap
.exists(id
)) {
8023 ss
<< "id " << id
<< " already in use and does not match uuid "
8028 if (pending_inc
.new_state
.count(id
)) {
8029 // osd is about to exist
8030 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8035 if (pending_inc
.identify_osd(uuid
) >= 0) {
8036 // osd is about to exist
8037 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8042 if (osdmap
.get_max_osd() <= i
&& pending_inc
.new_max_osd
<= i
)
8043 pending_inc
.new_max_osd
= i
+ 1;
8048 // allocate a new id
8049 for (i
=0; i
< osdmap
.get_max_osd(); i
++) {
8050 if (!osdmap
.exists(i
) &&
8051 pending_inc
.new_up_client
.count(i
) == 0 &&
8052 (pending_inc
.new_state
.count(i
) == 0 ||
8053 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
8054 pending_inc
.new_weight
[i
] = CEPH_OSD_OUT
;
8060 if (pending_inc
.new_max_osd
< 0)
8061 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
8063 pending_inc
.new_max_osd
++;
8064 i
= pending_inc
.new_max_osd
- 1;
8067 dout(10) << " creating osd." << i
<< dendl
;
8068 pending_inc
.new_state
[i
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
8069 if (!uuid
.is_zero())
8070 pending_inc
.new_uuid
[i
] = uuid
;
8072 f
->open_object_section("created_osd");
8073 f
->dump_int("osdid", i
);
8080 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
8081 get_last_committed() + 1));
8084 } else if (prefix
== "osd blacklist clear") {
8085 pending_inc
.new_blacklist
.clear();
8086 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
8087 osdmap
.get_blacklist(&blacklist
);
8088 for (const auto &entry
: blacklist
) {
8089 pending_inc
.old_blacklist
.push_back(entry
.first
);
8091 ss
<< " removed all blacklist entries";
8093 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8094 get_last_committed() + 1));
8096 } else if (prefix
== "osd blacklist") {
8098 cmd_getval(g_ceph_context
, cmdmap
, "addr", addrstr
);
8100 if (!addr
.parse(addrstr
.c_str(), 0)) {
8101 ss
<< "unable to parse address " << addrstr
;
8107 cmd_getval(g_ceph_context
, cmdmap
, "blacklistop", blacklistop
);
8108 if (blacklistop
== "add") {
8109 utime_t expires
= ceph_clock_now();
8112 cmd_getval(g_ceph_context
, cmdmap
, "expire", d
, double(60*60));
8115 pending_inc
.new_blacklist
[addr
] = expires
;
8116 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
8118 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8119 get_last_committed() + 1));
8121 } else if (blacklistop
== "rm") {
8122 if (osdmap
.is_blacklisted(addr
) ||
8123 pending_inc
.new_blacklist
.count(addr
)) {
8124 if (osdmap
.is_blacklisted(addr
))
8125 pending_inc
.old_blacklist
.push_back(addr
);
8127 pending_inc
.new_blacklist
.erase(addr
);
8128 ss
<< "un-blacklisting " << addr
;
8130 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8131 get_last_committed() + 1));
8134 ss
<< addr
<< " isn't blacklisted";
8139 } else if (prefix
== "osd pool mksnap") {
8141 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8142 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8144 ss
<< "unrecognized pool '" << poolstr
<< "'";
8149 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
8150 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
8151 if (p
->is_unmanaged_snaps_mode()) {
8152 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
8155 } else if (p
->snap_exists(snapname
.c_str())) {
8156 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
8159 } else if (p
->is_tier()) {
8160 ss
<< "pool " << poolstr
<< " is a cache tier";
8165 if (pending_inc
.new_pools
.count(pool
))
8166 pp
= &pending_inc
.new_pools
[pool
];
8168 pp
= &pending_inc
.new_pools
[pool
];
8171 if (pp
->snap_exists(snapname
.c_str())) {
8172 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
8174 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
8175 pp
->set_snap_epoch(pending_inc
.epoch
);
8176 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
8179 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8180 get_last_committed() + 1));
8182 } else if (prefix
== "osd pool rmsnap") {
8184 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8185 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8187 ss
<< "unrecognized pool '" << poolstr
<< "'";
8192 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
8193 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
8194 if (p
->is_unmanaged_snaps_mode()) {
8195 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
8198 } else if (!p
->snap_exists(snapname
.c_str())) {
8199 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
8204 if (pending_inc
.new_pools
.count(pool
))
8205 pp
= &pending_inc
.new_pools
[pool
];
8207 pp
= &pending_inc
.new_pools
[pool
];
8210 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
8212 pp
->remove_snap(sn
);
8213 pp
->set_snap_epoch(pending_inc
.epoch
);
8214 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
8216 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
8219 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8220 get_last_committed() + 1));
8222 } else if (prefix
== "osd pool create") {
8225 cmd_getval(g_ceph_context
, cmdmap
, "pg_num", pg_num
, int64_t(0));
8226 cmd_getval(g_ceph_context
, cmdmap
, "pgp_num", pgp_num
, pg_num
);
8228 string pool_type_str
;
8229 cmd_getval(g_ceph_context
, cmdmap
, "pool_type", pool_type_str
);
8230 if (pool_type_str
.empty())
8231 pool_type_str
= pg_pool_t::get_default_type();
8234 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8235 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8237 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
8238 if (pool_type_str
!= p
->get_type_name()) {
8239 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
8242 ss
<< "pool '" << poolstr
<< "' already exists";
8249 if (pool_type_str
== "replicated") {
8250 pool_type
= pg_pool_t::TYPE_REPLICATED
;
8251 } else if (pool_type_str
== "erasure") {
8252 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
|
8253 CEPH_FEATURE_OSD_ERASURE_CODES
,
8259 pool_type
= pg_pool_t::TYPE_ERASURE
;
8261 ss
<< "unknown pool type '" << pool_type_str
<< "'";
8266 bool implicit_ruleset_creation
= false;
8267 string ruleset_name
;
8268 cmd_getval(g_ceph_context
, cmdmap
, "ruleset", ruleset_name
);
8269 string erasure_code_profile
;
8270 cmd_getval(g_ceph_context
, cmdmap
, "erasure_code_profile", erasure_code_profile
);
8272 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8273 if (erasure_code_profile
== "")
8274 erasure_code_profile
= "default";
8275 //handle the erasure code profile
8276 if (erasure_code_profile
== "default") {
8277 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
8278 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
8279 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
8283 map
<string
,string
> profile_map
;
8284 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
8289 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
8290 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
8294 if (ruleset_name
== "") {
8295 implicit_ruleset_creation
= true;
8296 if (erasure_code_profile
== "default") {
8297 ruleset_name
= "erasure-code";
8299 dout(1) << "implicitly use ruleset named after the pool: "
8300 << poolstr
<< dendl
;
8301 ruleset_name
= poolstr
;
8305 //NOTE:for replicated pool,cmd_map will put ruleset_name to erasure_code_profile field
8306 ruleset_name
= erasure_code_profile
;
8309 if (!implicit_ruleset_creation
&& ruleset_name
!= "") {
8311 err
= get_crush_ruleset(ruleset_name
, &ruleset
, &ss
);
8312 if (err
== -EAGAIN
) {
8313 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8320 int64_t expected_num_objects
;
8321 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects", expected_num_objects
, int64_t(0));
8322 if (expected_num_objects
< 0) {
8323 ss
<< "'expected_num_objects' must be non-negative";
8328 int64_t fast_read_param
;
8329 cmd_getval(g_ceph_context
, cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
8330 FastReadType fast_read
= FAST_READ_DEFAULT
;
8331 if (fast_read_param
== 0)
8332 fast_read
= FAST_READ_OFF
;
8333 else if (fast_read_param
> 0)
8334 fast_read
= FAST_READ_ON
;
8336 err
= prepare_new_pool(poolstr
, 0, // auid=0 for admin created pool
8337 -1, // default crush rule
8340 erasure_code_profile
, pool_type
,
8341 (uint64_t)expected_num_objects
,
8347 ss
<< "pool '" << poolstr
<< "' already exists";
8350 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8359 ss
<< "pool '" << poolstr
<< "' created";
8362 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8363 get_last_committed() + 1));
8366 } else if (prefix
== "osd pool delete" ||
8367 prefix
== "osd pool rm") {
8368 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
8369 string poolstr
, poolstr2
, sure
;
8370 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8371 cmd_getval(g_ceph_context
, cmdmap
, "pool2", poolstr2
);
8372 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
8373 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8375 ss
<< "pool '" << poolstr
<< "' does not exist";
8380 bool force_no_fake
= sure
== "--yes-i-really-really-mean-it-not-faking";
8381 if (poolstr2
!= poolstr
||
8382 (sure
!= "--yes-i-really-really-mean-it" && !force_no_fake
)) {
8383 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
8384 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
8385 << "followed by --yes-i-really-really-mean-it.";
8389 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
8390 if (err
== -EAGAIN
) {
8391 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8397 } else if (prefix
== "osd pool rename") {
8398 string srcpoolstr
, destpoolstr
;
8399 cmd_getval(g_ceph_context
, cmdmap
, "srcpool", srcpoolstr
);
8400 cmd_getval(g_ceph_context
, cmdmap
, "destpool", destpoolstr
);
8401 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
8402 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
8405 if (pool_dst
>= 0) {
8406 // src pool doesn't exist, dst pool does exist: to ensure idempotency
8407 // of operations, assume this rename succeeded, as it is not changing
8408 // the current state. Make sure we output something understandable
8409 // for whoever is issuing the command, if they are paying attention,
8410 // in case it was not intentional; or to avoid a "wtf?" and a bug
8411 // report in case it was intentional, while expecting a failure.
8412 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
8413 << destpoolstr
<< "' does -- assuming successful rename";
8416 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
8420 } else if (pool_dst
>= 0) {
8421 // source pool exists and so does the destination pool
8422 ss
<< "pool '" << destpoolstr
<< "' already exists";
8427 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
8429 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
8431 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
8432 << cpp_strerror(ret
);
8435 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
8436 get_last_committed() + 1));
8439 } else if (prefix
== "osd pool set") {
8440 err
= prepare_command_pool_set(cmdmap
, ss
);
8447 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8448 get_last_committed() + 1));
8450 } else if (prefix
== "osd tier add") {
8451 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8457 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8458 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8460 ss
<< "unrecognized pool '" << poolstr
<< "'";
8465 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
8466 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
8467 if (tierpool_id
< 0) {
8468 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
8472 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
8474 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
8477 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
8481 // make sure new tier is empty
8482 string force_nonempty
;
8483 cmd_getval(g_ceph_context
, cmdmap
, "force_nonempty", force_nonempty
);
8484 const pool_stat_t
& tier_stats
=
8485 mon
->pgmon()->pg_map
.get_pg_pool_sum_stat(tierpool_id
);
8486 if (tier_stats
.stats
.sum
.num_objects
!= 0 &&
8487 force_nonempty
!= "--force-nonempty") {
8488 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
8492 if (tp
->ec_pool()) {
8493 ss
<< "tier pool '" << tierpoolstr
8494 << "' is an ec pool, which cannot be a tier";
8498 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
8499 ((force_nonempty
!= "--force-nonempty") ||
8500 (!g_conf
->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
8501 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
8506 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
8507 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
8508 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
8509 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8512 np
->tiers
.insert(tierpool_id
);
8513 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
8514 ntp
->tier_of
= pool_id
;
8515 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
8516 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8517 get_last_committed() + 1));
8519 } else if (prefix
== "osd tier remove" ||
8520 prefix
== "osd tier rm") {
8522 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8523 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8525 ss
<< "unrecognized pool '" << poolstr
<< "'";
8530 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
8531 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
8532 if (tierpool_id
< 0) {
8533 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
8537 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
8539 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
8542 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
8546 if (p
->tiers
.count(tierpool_id
) == 0) {
8547 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
8551 if (tp
->tier_of
!= pool_id
) {
8552 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
8553 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
8554 // be scary about it; this is an inconsistency and bells must go off
8555 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
8559 if (p
->read_tier
== tierpool_id
) {
8560 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
8565 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
8566 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
8567 if (np
->tiers
.count(tierpool_id
) == 0 ||
8568 ntp
->tier_of
!= pool_id
||
8569 np
->read_tier
== tierpool_id
) {
8570 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8573 np
->tiers
.erase(tierpool_id
);
8575 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
8576 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8577 get_last_committed() + 1));
8579 } else if (prefix
== "osd tier set-overlay") {
8580 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8586 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8587 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8589 ss
<< "unrecognized pool '" << poolstr
<< "'";
8593 string overlaypoolstr
;
8594 cmd_getval(g_ceph_context
, cmdmap
, "overlaypool", overlaypoolstr
);
8595 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
8596 if (overlaypool_id
< 0) {
8597 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
8601 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
8603 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
8605 if (p
->tiers
.count(overlaypool_id
) == 0) {
8606 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
8610 if (p
->read_tier
== overlaypool_id
) {
8612 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
8615 if (p
->has_read_tier()) {
8616 ss
<< "pool '" << poolstr
<< "' has overlay '"
8617 << osdmap
.get_pool_name(p
->read_tier
)
8618 << "'; please remove-overlay first";
8624 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
8625 np
->read_tier
= overlaypool_id
;
8626 np
->write_tier
= overlaypool_id
;
8627 np
->set_last_force_op_resend(pending_inc
.epoch
);
8628 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
8629 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
8630 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
8631 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
8632 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
8633 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8634 get_last_committed() + 1));
8636 } else if (prefix
== "osd tier remove-overlay" ||
8637 prefix
== "osd tier rm-overlay") {
8639 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8640 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8642 ss
<< "unrecognized pool '" << poolstr
<< "'";
8646 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
8648 if (!p
->has_read_tier()) {
8650 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
8654 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
8659 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
8660 if (np
->has_read_tier()) {
8661 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
8662 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
8663 nop
->set_last_force_op_resend(pending_inc
.epoch
);
8665 if (np
->has_write_tier()) {
8666 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
8667 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
8668 nop
->set_last_force_op_resend(pending_inc
.epoch
);
8670 np
->clear_read_tier();
8671 np
->clear_write_tier();
8672 np
->set_last_force_op_resend(pending_inc
.epoch
);
8673 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
8674 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8675 get_last_committed() + 1));
8677 } else if (prefix
== "osd tier cache-mode") {
8678 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8684 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8685 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8687 ss
<< "unrecognized pool '" << poolstr
<< "'";
8691 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
8693 if (!p
->is_tier()) {
8694 ss
<< "pool '" << poolstr
<< "' is not a tier";
8699 cmd_getval(g_ceph_context
, cmdmap
, "mode", modestr
);
8700 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
8702 ss
<< "'" << modestr
<< "' is not a valid cache mode";
8708 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
8709 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
8710 mode
!= pg_pool_t::CACHEMODE_NONE
&&
8711 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
8712 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
8713 sure
!= "--yes-i-really-mean-it") {
8714 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
8715 << "corrupt your data. pass --yes-i-really-mean-it to force.";
8720 // pool already has this cache-mode set and there are no pending changes
8721 if (p
->cache_mode
== mode
&&
8722 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
8723 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
8724 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
8725 << " to " << pg_pool_t::get_cache_mode_name(mode
);
8730 /* Mode description:
8732 * none: No cache-mode defined
8733 * forward: Forward all reads and writes to base pool
8734 * writeback: Cache writes, promote reads from base pool
8735 * readonly: Forward writes to base pool
8736 * readforward: Writes are in writeback mode, Reads are in forward mode
8737 * proxy: Proxy all reads and writes to base pool
8738 * readproxy: Writes are in writeback mode, Reads are in proxy mode
8740 * Hence, these are the allowed transitions:
8743 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
8744 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
8745 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
8746 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
8747 * writeback -> readforward || readproxy || forward || proxy
8751 // We check if the transition is valid against the current pool mode, as
8752 // it is the only committed state thus far. We will blantly squash
8753 // whatever mode is on the pending state.
8755 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
8756 (mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
8757 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
8758 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
8759 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
8760 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
8761 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
8763 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD
)
8765 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
8767 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD
)
8769 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
8774 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
8775 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
8776 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
8777 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
8778 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
8780 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
8781 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
8782 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
8783 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
8784 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
8786 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
8787 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
8788 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
8789 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
8790 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
8792 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
8793 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
8794 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
8795 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
8796 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
8798 const pool_stat_t
& tier_stats
=
8799 mon
->pgmon()->pg_map
.get_pg_pool_sum_stat(pool_id
);
8801 if (tier_stats
.stats
.sum
.num_objects_dirty
> 0) {
8802 ss
<< "unable to set cache-mode '"
8803 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
8804 << "': dirty objects found";
8810 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
8811 np
->cache_mode
= mode
;
8812 // set this both when moving to and from cache_mode NONE. this is to
8813 // capture legacy pools that were set up before this flag existed.
8814 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
8815 ss
<< "set cache-mode for pool '" << poolstr
8816 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
8817 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
8818 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
8820 if (base_pool
->read_tier
== pool_id
||
8821 base_pool
->write_tier
== pool_id
)
8822 ss
<<" (WARNING: pool is still configured as read or write tier)";
8824 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8825 get_last_committed() + 1));
8827 } else if (prefix
== "osd tier add-cache") {
8828 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8834 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8835 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8837 ss
<< "unrecognized pool '" << poolstr
<< "'";
8842 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
8843 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
8844 if (tierpool_id
< 0) {
8845 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
8849 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
8851 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
8854 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
8859 if (!cmd_getval(g_ceph_context
, cmdmap
, "size", size
)) {
8860 ss
<< "unable to parse 'size' value '"
8861 << cmd_vartype_stringify(cmdmap
["size"]) << "'";
8865 // make sure new tier is empty
8866 const pool_stat_t
& tier_stats
=
8867 mon
->pgmon()->pg_map
.get_pg_pool_sum_stat(tierpool_id
);
8868 if (tier_stats
.stats
.sum
.num_objects
!= 0) {
8869 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
8873 string modestr
= g_conf
->osd_tier_default_cache_mode
;
8874 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
8876 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
8881 if (g_conf
->osd_tier_default_cache_hit_set_type
== "bloom") {
8882 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8883 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
8884 hsp
= HitSet::Params(bsp
);
8885 } else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_hash") {
8886 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
8888 else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_object") {
8889 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8891 ss
<< "osd tier cache default hit set type '" <<
8892 g_conf
->osd_tier_default_cache_hit_set_type
<< "' is not a known type";
8897 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
8898 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
8899 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
8900 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
8903 np
->tiers
.insert(tierpool_id
);
8904 np
->read_tier
= np
->write_tier
= tierpool_id
;
8905 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
8906 np
->set_last_force_op_resend(pending_inc
.epoch
);
8907 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
8908 ntp
->tier_of
= pool_id
;
8909 ntp
->cache_mode
= mode
;
8910 ntp
->hit_set_count
= g_conf
->osd_tier_default_cache_hit_set_count
;
8911 ntp
->hit_set_period
= g_conf
->osd_tier_default_cache_hit_set_period
;
8912 ntp
->min_read_recency_for_promote
= g_conf
->osd_tier_default_cache_min_read_recency_for_promote
;
8913 ntp
->min_write_recency_for_promote
= g_conf
->osd_tier_default_cache_min_write_recency_for_promote
;
8914 ntp
->hit_set_grade_decay_rate
= g_conf
->osd_tier_default_cache_hit_set_grade_decay_rate
;
8915 ntp
->hit_set_search_last_n
= g_conf
->osd_tier_default_cache_hit_set_search_last_n
;
8916 ntp
->hit_set_params
= hsp
;
8917 ntp
->target_max_bytes
= size
;
8918 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
8919 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8920 get_last_committed() + 1));
8922 } else if (prefix
== "osd pool set-quota") {
8924 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
8925 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
8927 ss
<< "unrecognized pool '" << poolstr
<< "'";
8933 cmd_getval(g_ceph_context
, cmdmap
, "field", field
);
8934 if (field
!= "max_objects" && field
!= "max_bytes") {
8935 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
8940 // val could contain unit designations, so we treat as a string
8942 cmd_getval(g_ceph_context
, cmdmap
, "val", val
);
8944 int64_t value
= unit_to_bytesize(val
, &tss
);
8946 ss
<< "error parsing value '" << value
<< "': " << tss
.str();
8951 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
8952 if (field
== "max_objects") {
8953 pi
->quota_max_objects
= value
;
8954 } else if (field
== "max_bytes") {
8955 pi
->quota_max_bytes
= value
;
8957 assert(0 == "unrecognized option");
8959 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
8961 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8962 get_last_committed() + 1));
8965 } else if (prefix
== "osd reweight-by-pg" ||
8966 prefix
== "osd reweight-by-utilization" ||
8967 prefix
== "osd test-reweight-by-pg" ||
8968 prefix
== "osd test-reweight-by-utilization") {
8970 prefix
== "osd reweight-by-pg" || prefix
== "osd test-reweight-by-pg";
8972 prefix
== "osd test-reweight-by-pg" ||
8973 prefix
== "osd test-reweight-by-utilization";
8975 cmd_getval(g_ceph_context
, cmdmap
, "oload", oload
, int64_t(120));
8977 vector
<string
> poolnamevec
;
8978 cmd_getval(g_ceph_context
, cmdmap
, "pools", poolnamevec
);
8979 for (unsigned j
= 0; j
< poolnamevec
.size(); j
++) {
8980 int64_t pool
= osdmap
.lookup_pg_pool_name(poolnamevec
[j
]);
8982 ss
<< "pool '" << poolnamevec
[j
] << "' does not exist";
8988 double max_change
= g_conf
->mon_reweight_max_change
;
8989 cmd_getval(g_ceph_context
, cmdmap
, "max_change", max_change
);
8990 if (max_change
<= 0.0) {
8991 ss
<< "max_change " << max_change
<< " must be positive";
8995 int64_t max_osds
= g_conf
->mon_reweight_max_osds
;
8996 cmd_getval(g_ceph_context
, cmdmap
, "max_osds", max_osds
);
8997 if (max_osds
<= 0) {
8998 ss
<< "max_osds " << max_osds
<< " must be positive";
9002 string no_increasing
;
9003 cmd_getval(g_ceph_context
, cmdmap
, "no_increasing", no_increasing
);
9005 mempool::osdmap::map
<int32_t, uint32_t> new_weights
;
9006 err
= reweight::by_utilization(osdmap
,
9007 mon
->pgmon()->pg_map
,
9012 pools
.empty() ? NULL
: &pools
,
9013 no_increasing
== "--no-increasing",
9015 &ss
, &out_str
, f
.get());
9017 dout(10) << "reweight::by_utilization: finished with " << out_str
<< dendl
;
9022 rdata
.append(out_str
);
9024 ss
<< "FAILED reweight-by-pg";
9025 } else if (err
== 0 || dry_run
) {
9028 ss
<< "SUCCESSFUL reweight-by-pg";
9029 pending_inc
.new_weight
= std::move(new_weights
);
9030 wait_for_finished_proposal(
9032 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
9041 if (err
< 0 && rs
.length() == 0)
9042 rs
= cpp_strerror(err
);
9043 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
9048 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9049 get_last_committed() + 1));
9053 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// NOTE(review): this chunk has been mangled — statements are split across
// physical lines and interior source lines are missing wherever the embedded
// numbering jumps (e.g. 9060 -> 9062, 9081 -> 9085). Comments below describe
// only what the visible fragments establish; missing control flow (opening
// braces, returns, the switch header over m->op) is hedged, not asserted.
//
// preprocess_pool_op: read-only (pre-paxos) handling of an MPoolOp request.
// Replies immediately via _pool_op_reply() for no-op / invalid requests;
// presumably returns false to fall through to prepare_pool_op() otherwise
// (returns not visible here — TODO confirm against full source).
9057 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
9059 op
->mark_osdmon_event(__func__
);
9060 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
// Drop messages whose fsid does not match this monitor cluster's fsid.
9062 if (m
->fsid
!= mon
->monmap
->fsid
) {
9063 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
9064 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
9065 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Pool creation has its own preprocess path.
9069 if (m
->op
== POOL_OP_CREATE
)
9070 return preprocess_pool_op_create(op
);
// All remaining ops target an existing pool; reply 0 (idempotent no-op)
// if the pool id does not exist.
9072 if (!osdmap
.get_pg_pool(m
->pool
)) {
9073 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
9074 _pool_op_reply(op
, 0, osdmap
.get_epoch());
9078 // check if the snap and snapname exist
9079 bool snap_exists
= false;
9080 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
9081 if (p
->snap_exists(m
->name
.c_str()))
// The cases below are presumably arms of a switch (m->op); the switch
// header is on a missing line — TODO confirm.
// CREATE_SNAP: pool snaps are invalid on unmanaged-snaps pools and tiers;
// creating an already-existing snap is treated as success (idempotent).
9085 case POOL_OP_CREATE_SNAP
:
9086 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
9087 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
9091 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// CREATE_UNMANAGED_SNAP: invalid on a pool already in pool-snaps mode.
9095 case POOL_OP_CREATE_UNMANAGED_SNAP
:
9096 if (p
->is_pool_snaps_mode()) {
9097 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// DELETE_SNAP: invalid on unmanaged-snaps pools; deleting a snap that is
// already gone replies 0 (idempotent).
9101 case POOL_OP_DELETE_SNAP
:
9102 if (p
->is_unmanaged_snaps_mode()) {
9103 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
9107 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// DELETE_UNMANAGED_SNAP: invalid on pool-snaps-mode pools; a snapid that
// is already removed replies 0 (idempotent).
9111 case POOL_OP_DELETE_UNMANAGED_SNAP
:
9112 if (p
->is_pool_snaps_mode()) {
9113 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
9116 if (p
->is_removed_snap(m
->snapid
)) {
9117 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// DELETE (pool): if a pool with this name still exists, the delete has
// not happened yet — reply 0 here and let prepare handle it (visible
// fragment only; surrounding logic on missing lines).
9121 case POOL_OP_DELETE
:
9122 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
9123 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// AUID_CHANGE: body not visible (lines missing after the label).
9127 case POOL_OP_AUID_CHANGE
:
// preprocess_pool_op_create: read-only checks for a POOL_OP_CREATE request.
// Rejects with -EPERM when there is no session / insufficient caps, and
// replies 0 (idempotent success) when a pool with the requested name already
// exists. NOTE(review): interior lines are missing (numbering jumps
// 9141 -> 9143, 9154 -> 9156) — the session-null check and the final return
// are not visible here.
9137 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
9139 op
->mark_osdmon_event(__func__
);
9140 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
9141 MonSession
*session
= m
->get_session();
// Presumably guarded by a !session check on a missing line — TODO confirm.
9143 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Pool creation requires write capability on the "osd" service.
9146 if (!session
->is_capable("osd", MON_CAP_W
)) {
9147 dout(5) << "attempt to create new pool without sufficient auid privileges!"
9148 << "message: " << *m
<< std::endl
9149 << "caps: " << session
->caps
<< dendl
;
9150 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// If the name already maps to a pool, creation is a no-op: reply 0.
9154 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
9156 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// prepare_pool_op: write path for an MPoolOp — stages pool/snapshot changes
// into pending_inc and schedules the client reply for after the proposal
// commits. Dispatches CREATE/DELETE to their dedicated prepare methods.
// NOTE(review): heavily mangled — several switch headers, closing braces and
// return statements fall on missing lines (numbering jumps such as
// 9195 -> 9203, 9254 -> 9259, 9261 -> 9269, 9286 -> 9298); comments are
// limited to the visible fragments.
9163 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
9165 op
->mark_osdmon_event(__func__
);
9166 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
9167 dout(10) << "prepare_pool_op " << *m
<< dendl
;
9168 if (m
->op
== POOL_OP_CREATE
) {
9169 return prepare_pool_op_create(op
);
9170 } else if (m
->op
== POOL_OP_DELETE
) {
9171 return prepare_pool_op_delete(op
);
9175 bool changed
= false;
// Target pool must exist for all remaining op types.
9177 if (!osdmap
.have_pg_pool(m
->pool
)) {
9178 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
9182 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// First validation pass against the *committed* pool state (switch header
// on a missing line). CREATE_SNAP on a tier is rejected; note the
// deliberate fall-through into the DELETE_SNAP idempotency check.
9185 case POOL_OP_CREATE_SNAP
:
9186 if (pool
->is_tier()) {
9188 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
9190 } // else, fall through
9191 case POOL_OP_DELETE_SNAP
:
9192 if (!pool
->is_unmanaged_snaps_mode()) {
9193 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
// Idempotency: creating an existing snap or deleting a missing one is
// answered immediately (reply body on missing lines — TODO confirm).
9194 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
9195 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
9203 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
9206 case POOL_OP_DELETE_UNMANAGED_SNAP
:
9207 // we won't allow removal of an unmanaged snapshot from a pool
9208 // not in unmanaged snaps mode.
9209 if (!pool
->is_unmanaged_snaps_mode()) {
9210 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
9214 case POOL_OP_CREATE_UNMANAGED_SNAP
:
9215 // but we will allow creating an unmanaged snapshot on any pool
9216 // as long as it is not in 'pool' snaps mode.
9217 if (pool
->is_pool_snaps_mode()) {
9218 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
9223 // projected pool info
// Work on a copy 'pp' (declaration on a missing line): prefer the
// already-pending version of the pool if one exists, else the committed one.
9225 if (pending_inc
.new_pools
.count(m
->pool
))
9226 pp
= pending_inc
.new_pools
[m
->pool
];
9228 pp
= *osdmap
.get_pg_pool(m
->pool
);
9230 bufferlist reply_data
;
9232 // pool snaps vs unmanaged snaps are mutually exclusive
// Second validation pass against the *projected* state pp (switch header
// and error-assignment bodies on missing lines).
9234 case POOL_OP_CREATE_SNAP
:
9235 case POOL_OP_DELETE_SNAP
:
9236 if (pp
.is_unmanaged_snaps_mode()) {
9242 case POOL_OP_CREATE_UNMANAGED_SNAP
:
9243 case POOL_OP_DELETE_UNMANAGED_SNAP
:
9244 if (pp
.is_pool_snaps_mode()) {
// Mutation pass: apply the requested change to pp.
9251 case POOL_OP_CREATE_SNAP
:
9252 if (!pp
.snap_exists(m
->name
.c_str())) {
9253 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
9254 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
<< " seq " << pp
.get_snap_epoch() << dendl
;
9259 case POOL_OP_DELETE_SNAP
:
9261 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
// CREATE_UNMANAGED_SNAP: allocate a new snapid (allocation on missing
// lines) and return it to the caller in the reply payload.
9269 case POOL_OP_CREATE_UNMANAGED_SNAP
:
9272 pp
.add_unmanaged_snap(snapid
);
9273 ::encode(snapid
, reply_data
);
9278 case POOL_OP_DELETE_UNMANAGED_SNAP
:
9279 if (!pp
.is_removed_snap(m
->snapid
)) {
9280 pp
.remove_unmanaged_snap(m
->snapid
);
9285 case POOL_OP_AUID_CHANGE
:
9286 if (pp
.auid
!= m
->auid
) {
// Stage the mutated pool into the pending incremental (presumably only
// when 'changed' — guard on a missing line, TODO confirm).
9298 pp
.set_snap_epoch(pending_inc
.epoch
);
9299 pending_inc
.new_pools
[m
->pool
] = pp
;
// Reply (with reply_data, e.g. the new snapid) once the proposal commits.
9303 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// prepare_pool_op_create: stage creation of a new pool (prepare_new_pool
// does the work against pending_inc) and schedule the client reply, carrying
// prepare_new_pool's result code, for after the proposal commits.
// NOTE(review): the return statement falls on a missing line (9311 -> end).
9307 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
9309 op
->mark_osdmon_event(__func__
);
9310 int err
= prepare_new_pool(op
);
9311 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// _check_remove_pool: policy gate for pool deletion. Writes a human-readable
// reason into *ss and (presumably, on missing lines) returns a negative errno
// when deletion must be refused: pool in use by CephFS, pool is a cache tier,
// pool has tiers, mon_allow_pool_delete unset, or FLAG_NODELETE set.
// NOTE(review): the trailing parameter (*ss per call sites) and all return
// statements fall on missing lines — comments describe visible checks only.
9315 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
9318 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
9320 // If the Pool is in use by CephFS, refuse to delete it
9321 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending();
9322 if (pending_fsmap
.pool_in_use(pool_id
)) {
9323 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// A pool that is itself a tier of another pool cannot be deleted directly.
9327 if (pool
.tier_of
>= 0) {
9328 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
9329 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// A base pool with attached tiers cannot be deleted; list them for the user.
9332 if (!pool
.tiers
.empty()) {
9333 *ss
<< "pool '" << poolstr
<< "' has tiers";
9334 for(auto tier
: pool
.tiers
) {
9335 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Operator safety interlocks: global config switch and per-pool flag.
9340 if (!g_conf
->mon_allow_pool_delete
) {
9341 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
9345 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
9346 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// All checks passed: success message for the eventual reply.
9350 *ss
<< "pool '" << poolstr
<< "' removed";
// NOTE(review): the /** ... */ doc-comment delimiters around the three
// fragment lines below fall on missing source lines.
9355 * Check if it is safe to add a tier to a base pool
9358 * True if the operation should proceed, false if we should abort here
9359 * (abort doesn't necessarily mean error, could be idempotency)
// _check_become_tier: validates "osd tier add <base> <tier>". Visible checks:
// tier pool not used by CephFS; idempotent success if the tier is already
// attached to this base (asserting the back-pointer is consistent); neither
// pool may already participate in another tiering relationship (multiple /
// chained tiers unsupported). Trailing parameters (err/ss out-params per
// usage) and return statements fall on missing lines — TODO confirm.
9361 bool OSDMonitor::_check_become_tier(
9362 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
9363 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
9367 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
9368 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
// CephFS pools may not be repurposed as cache tiers.
9370 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
9371 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
9372 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
// Already attached to this base: treat as success (idempotent).
9377 if (base_pool
->tiers
.count(tier_pool_id
)) {
9378 assert(tier_pool
->tier_of
== base_pool_id
);
9380 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
9381 << base_pool_name
<< "'";
// The base may not itself be a tier of something else.
9385 if (base_pool
->is_tier()) {
9386 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
9387 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
9388 << "multiple tiers are not yet supported.";
// The prospective tier may not already have tiers of its own.
9393 if (tier_pool
->has_tiers()) {
9394 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
9395 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
9396 it
!= tier_pool
->tiers
.end(); ++it
)
9397 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
9398 *ss
<< " multiple tiers are not yet supported.";
// Nor may the prospective tier already be a tier of another base.
9403 if (tier_pool
->is_tier()) {
9404 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
9405 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
// NOTE(review): the /** ... */ doc-comment delimiters around the three
// fragment lines below fall on missing source lines.
9416 * Check if it is safe to remove a tier from this base pool
9419 * True if the operation should proceed, false if we should abort here
9420 * (abort doesn't necessarily mean error, could be idempotency)
// _check_remove_tier: validates "osd tier remove". Visible logic: when the
// base pool is used by CephFS, refuse if the base is not replicated (CephFS
// would lose its replicated access tier over an EC base) or if the departing
// tier is still a writeback cache (must change cache mode and flush first).
// Return statements and *err assignments fall on missing lines.
9422 bool OSDMonitor::_check_remove_tier(
9423 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
9424 const pg_pool_t
*tier_pool
,
9425 int *err
, ostream
*ss
) const
9427 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
9429 // Apply CephFS-specific checks
9430 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
9431 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
9432 if (base_pool
->type
!= pg_pool_t::TYPE_REPLICATED
) {
9433 // If the underlying pool is erasure coded, we can't permit the
9434 // removal of the replicated tier that CephFS relies on to access it
9435 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS via its tier";
// tier_pool may be null here (checked before dereference).
9440 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
9441 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
9442 "tier is still in use as a writeback cache. Change the cache "
9443 "mode and flush the cache before removing it";
// _prepare_remove_pool: stage deletion of a pool into pending_inc.
// Re-validates with _check_remove_pool against both committed and pending
// pool state; honors mon_fake_pool_delete (unless no_fake) by renaming the
// pool to "<name>.<id>.DELETED" instead of deleting; otherwise records the
// pool in old_pools and clears any pg_temp / primary_temp mappings that
// reference it. NOTE(review): return statements, loop increments/bodies and
// several closing braces fall on missing lines (e.g. 9458 -> 9462,
// 9492 -> 9494) — comments cover visible fragments only.
9453 int OSDMonitor::_prepare_remove_pool(
9454 int64_t pool
, ostream
*ss
, bool no_fake
)
9456 dout(10) << "_prepare_remove_pool " << pool
<< dendl
;
// Check against the committed osdmap first.
9457 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
9458 int r
= _check_remove_pool(pool
, *p
, ss
);
// If this pool also has a pending (not yet committed) version, check that
// too; presumably returns -EAGAIN to wait and retry on failure (comment at
// 9464, return on a missing line — TODO confirm).
9462 auto new_pool
= pending_inc
.new_pools
.find(pool
);
9463 if (new_pool
!= pending_inc
.new_pools
.end()) {
9464 // if there is a problem with the pending info, wait and retry
9466 const auto& p
= new_pool
->second
;
9467 int r
= _check_remove_pool(pool
, p
, ss
);
// Idempotency: removal already staged in this pending incremental.
9472 if (pending_inc
.old_pools
.count(pool
)) {
9473 dout(10) << "_prepare_remove_pool " << pool
<< " already pending removal"
// Fake deletion: rename out of the way instead of destroying data.
9478 if (g_conf
->mon_fake_pool_delete
&& !no_fake
) {
9479 string old_name
= osdmap
.get_pool_name(pool
);
9480 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
9481 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
9482 << old_name
<< " -> " << new_name
<< dendl
;
9483 pending_inc
.new_pool_names
[pool
] = new_name
;
// Real deletion: stage the pool id for removal.
9488 pending_inc
.old_pools
.insert(pool
);
9490 // remove any pg_temp mappings for this pool too
9491 for (auto p
= osdmap
.pg_temp
->begin();
9492 p
!= osdmap
.pg_temp
->end();
9494 if (p
->first
.pool() == (uint64_t)pool
) {
9495 dout(10) << "_prepare_remove_pool " << pool
<< " removing obsolete pg_temp "
9496 << p
->first
<< dendl
;
// An empty new_pg_temp entry erases the mapping when the inc is applied.
9497 pending_inc
.new_pg_temp
[p
->first
].clear();
// Same cleanup for primary_temp mappings (-1 clears the primary override).
9500 for (auto p
= osdmap
.primary_temp
->begin();
9501 p
!= osdmap
.primary_temp
->end();
9503 if (p
->first
.pool() == (uint64_t)pool
) {
9504 dout(10) << "_prepare_remove_pool " << pool
9505 << " removing obsolete primary_temp" << p
->first
<< dendl
;
9506 pending_inc
.new_primary_temp
[p
->first
] = -1;
// _prepare_rename_pool: stage a pool rename into pending_inc.new_pool_names.
// Refuses if the pool is pending removal, and scans pending renames for a
// name collision with a different pool id. NOTE(review): return statements,
// the loop increment and closing braces fall on missing lines (9516 -> 9519,
// 9522 -> 9527) — error codes are not visible here.
9512 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
9514 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
// Can't rename a pool that is already staged for deletion.
9515 if (pending_inc
.old_pools
.count(pool
)) {
9516 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
// Reject if another pool is already being renamed to the same name.
9519 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
9520 p
!= pending_inc
.new_pool_names
.end();
9522 if (p
->second
== newname
&& p
->first
!= pool
) {
// Stage the rename.
9527 pending_inc
.new_pool_names
[pool
] = newname
;
// prepare_pool_op_delete: stage deletion of the pool named in the MPoolOp
// via _prepare_remove_pool (no_fake=false, so mon_fake_pool_delete is
// honored). -EAGAIN means the pending map must commit first: requeue the op
// for retry after the current proposal. Otherwise schedule the C_PoolOp
// reply carrying ret for after the proposal commits. NOTE(review): the 'ss'
// stream declaration and the return statements fall on missing lines.
9531 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
9533 op
->mark_osdmon_event(__func__
);
9534 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
9536 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
9537 if (ret
== -EAGAIN
) {
// Retry the whole message once the in-flight proposal has committed.
9538 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9542 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
9543 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
9544 pending_inc
.epoch
));
9548 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
9549 int ret
, epoch_t epoch
, bufferlist
*blp
)
9551 op
->mark_osdmon_event(__func__
);
9552 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
9553 dout(20) << "_pool_op_reply " << ret
<< dendl
;
9554 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
9555 ret
, epoch
, get_last_committed(), blp
);
9556 mon
->send_reply(op
, reply
);