]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
bump version to 12.0.3-pve3
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
20#include <sstream>
21
22#include "OSDMonitor.h"
23#include "Monitor.h"
24#include "MDSMonitor.h"
25#include "PGMonitor.h"
26
27#include "MonitorDBStore.h"
28#include "Session.h"
29
30#include "crush/CrushWrapper.h"
31#include "crush/CrushTester.h"
32#include "crush/CrushTreeDumper.h"
33
34#include "messages/MOSDBeacon.h"
35#include "messages/MOSDFailure.h"
36#include "messages/MOSDMarkMeDown.h"
37#include "messages/MOSDFull.h"
38#include "messages/MOSDMap.h"
39#include "messages/MMonGetOSDMap.h"
40#include "messages/MOSDBoot.h"
41#include "messages/MOSDAlive.h"
42#include "messages/MPoolOp.h"
43#include "messages/MPoolOpReply.h"
44#include "messages/MOSDPGCreate.h"
45#include "messages/MOSDPGCreated.h"
46#include "messages/MOSDPGTemp.h"
47#include "messages/MMonCommand.h"
48#include "messages/MRemoveSnaps.h"
49#include "messages/MOSDScrub.h"
50#include "messages/MRoute.h"
51
52#include "common/TextTable.h"
53#include "common/Timer.h"
54#include "common/ceph_argparse.h"
55#include "common/perf_counters.h"
56#include "common/strtol.h"
57
58#include "common/config.h"
59#include "common/errno.h"
60
61#include "erasure-code/ErasureCodePlugin.h"
62#include "compressor/Compressor.h"
63#include "common/Checksummer.h"
64
65#include "include/compat.h"
66#include "include/assert.h"
67#include "include/stringify.h"
68#include "include/util.h"
69#include "common/cmdparse.h"
70#include "include/str_list.h"
71#include "include/str_map.h"
72
73#include "json_spirit/json_spirit_reader.h"
74
75#define dout_subsys ceph_subsys_mon
76#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
77
78void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
79{
80 if (epoch_by_pg.size() <= ps) {
81 epoch_by_pg.resize(ps + 1, 0);
82 }
83 const auto old_lec = epoch_by_pg[ps];
84 if (old_lec >= last_epoch_clean) {
85 // stale lec
86 return;
87 }
88 epoch_by_pg[ps] = last_epoch_clean;
89 if (last_epoch_clean < floor) {
90 floor = last_epoch_clean;
91 } else if (last_epoch_clean > floor) {
92 if (old_lec == floor) {
93 // probably should increase floor?
94 auto new_floor = std::min_element(std::begin(epoch_by_pg),
95 std::end(epoch_by_pg));
96 floor = *new_floor;
97 }
98 }
99 if (ps != next_missing) {
100 return;
101 }
102 for (; next_missing < epoch_by_pg.size(); next_missing++) {
103 if (epoch_by_pg[next_missing] == 0) {
104 break;
105 }
106 }
107}
108
// Drop all per-PG clean-epoch bookkeeping for a deleted pool so it no
// longer constrains get_lower_bound().
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
113
114void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
115{
116 auto& lec = report_by_pool[pg.pool()];
117 return lec.report(pg.ps(), last_epoch_clean);
118}
119
120epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
121{
122 auto floor = latest.get_epoch();
123 for (auto& pool : latest.get_pools()) {
124 auto reported = report_by_pool.find(pool.first);
125 if (reported == report_by_pool.end()) {
126 return 0;
127 }
128 if (reported->second.next_missing < pool.second.get_pg_num()) {
129 return 0;
130 }
131 if (reported->second.floor < floor) {
132 floor = reported->second.floor;
133 }
134 }
135 return floor;
136}
137
138
// Completion callback for the background osdmap->pg mapping job started
// in OSDMonitor::start_mapping(): when the job for |epoch| finishes,
// refresh the creating-pgs state and poke pg-create subscribers.
struct C_UpdateCreatingPGs : public Context {
  OSDMonitor *osdmon;
  utime_t start;   // when the job was kicked off, for the timing log line
  epoch_t epoch;   // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // NOTE(review): r < 0 presumably means the job was canceled/aborted;
    // in that case we deliberately do nothing — confirm against Context users
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
155
156#undef dout_prefix
157#define dout_prefix _prefix(_dout, mon, osdmap)
// Build the standard debug-log prefix for this subsystem:
// "mon.NAME@RANK(STATE).osd eEPOCH " — used via the dout_prefix macro above.
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
163
// Construct the OSD monitor paxos service.  Members of note:
// inc_osd_cache/full_osd_cache cache encoded incremental/full osdmaps,
// mapper is the thread-pool-backed parallel pg-mapping calculator.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf->mon_osd_cache_size),   // LRU of incremental maps
   full_osd_cache(g_conf->mon_osd_cache_size),  // LRU of full maps
   last_attempted_minwait_time(utime_t()),
   mapper(mn->cct, &mn->cpu_tp),
   op_tracker(cct, true, 1)
{}
177
178bool OSDMonitor::_have_pending_crush()
179{
180 return pending_inc.crush.length() > 0;
181}
182
// Return the crush map of the currently committed osdmap (ignores any
// crush changes staged in pending_inc).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
187
188void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
189{
190 bufferlist bl;
191 if (pending_inc.crush.length())
192 bl = pending_inc.crush;
193 else
194 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
195
196 bufferlist::iterator p = bl.begin();
197 newcrush.decode(p);
198}
199
200void OSDMonitor::create_initial()
201{
202 dout(10) << "create_initial for " << mon->monmap->fsid << dendl;
203
204 OSDMap newmap;
205
206 bufferlist bl;
207 mon->store->get("mkfs", "osdmap", bl);
208
209 if (bl.length()) {
210 newmap.decode(bl);
211 newmap.set_fsid(mon->monmap->fsid);
212 } else {
213 newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0,
214 g_conf->osd_pg_bits, g_conf->osd_pgp_bits);
215 }
216 newmap.set_epoch(1);
217 newmap.created = newmap.modified = ceph_clock_now();
218
219 // new clusters should sort bitwise by default.
220 newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
221
222 // new cluster should require latest by default
223 newmap.set_flag(CEPH_OSDMAP_REQUIRE_JEWEL);
224 newmap.set_flag(CEPH_OSDMAP_REQUIRE_KRAKEN);
225 if (!g_conf->mon_debug_no_require_luminous) {
226 newmap.set_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS);
227 newmap.full_ratio = g_conf->mon_osd_full_ratio;
228 if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
229 newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
230 if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
231 newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
232 if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
233 newmap.require_min_compat_client = g_conf->mon_osd_initial_require_min_compat_client;
234 }
235
236 // encode into pending incremental
237 newmap.encode(pending_inc.fullmap,
238 mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
239 pending_inc.full_crc = newmap.get_crc();
240 dout(20) << " full crc " << pending_inc.full_crc << dendl;
241}
242
// Report the MonitorDBStore key prefixes owned by this service (the
// service's own prefix plus the creating-pgs prefix) so sync/trim code
// knows what to include.
void OSDMonitor::get_store_prefixes(std::set<string>& s)
{
  s.insert(service_name);
  s.insert(OSD_PG_CREATING_PREFIX);
}
248
// Catch our in-memory osdmap up to the latest paxos-committed version:
//  1. locate (and if necessary repair) the latest stored full map and
//     load it if it is newer than what we have;
//  2. load the creating-pgs state (luminous+ monitors);
//  3. replay each committed incremental, writing back full maps and
//     checking CRCs against the leader's canonical encoding;
//  4. refresh derived state (down_pending_out, subs, logger, features).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;


  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // jump straight to the newest stored full map if it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  // luminous+ monitors persist the creating-pgs set separately
  if (mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    bufferlist bl;
    if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
      auto p = bl.begin();
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      creating_pgs.decode(p);
      dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
              << creating_pgs.last_scan_epoch
              << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
    } else {
      dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
              << dendl;
    }
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;
        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);
      }
    } else {
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs-provided map is only needed for epoch 1; drop it after
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long replay doesn't build one huge txn
    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    if (mon->monmap->get_required_features().contains_all(
          ceph::features::mon::FEATURE_LUMINOUS)) {
      creating_pgs = update_pending_pgs(inc);
      for (const auto &osd_state : inc.new_state) {
        if (osd_state.second & CEPH_OSD_UP) {
          // could be marked up *or* down, but we're too lazy to check which
          last_osd_report.erase(osd_state.first);
        }
        if (osd_state.second & CEPH_OSD_EXISTS) {
          // could be created *or* destroyed, but we can safely drop it
          osd_epochs.erase(osd_state.first);
        }
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out tracking map with the new osdmap state
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
454
455void OSDMonitor::start_mapping()
456{
457 // initiate mapping job
458 if (mapping_job) {
459 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
460 << dendl;
461 mapping_job->abort();
462 }
463 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
464 mapping_job = mapping.start_update(osdmap, mapper,
465 g_conf->mon_osd_mapping_pgs_per_chunk);
466 dout(10) << __func__ << " started mapping job " << mapping_job.get()
467 << " at " << fin->start << dendl;
468 mapping_job->set_finish_event(fin);
469}
470
471void OSDMonitor::update_msgr_features()
472{
473 set<int> types;
474 types.insert((int)entity_name_t::TYPE_OSD);
475 types.insert((int)entity_name_t::TYPE_CLIENT);
476 types.insert((int)entity_name_t::TYPE_MDS);
477 types.insert((int)entity_name_t::TYPE_MON);
478 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
479 uint64_t mask;
480 uint64_t features = osdmap.get_features(*q, &mask);
481 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
482 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
483 Messenger::Policy p = mon->messenger->get_policy(*q);
484 p.features_required = (p.features_required & ~mask) | features;
485 mon->messenger->set_policy(*q, p);
486 }
487 }
488}
489
490void OSDMonitor::on_active()
491{
492 update_logger();
493
494 if (mon->is_leader()) {
495 mon->clog->info() << "osdmap " << osdmap;
496 } else {
497 list<MonOpRequestRef> ls;
498 take_all_failures(ls);
499 while (!ls.empty()) {
500 MonOpRequestRef op = ls.front();
501 op->mark_osdmon_event(__func__);
502 dispatch(op);
503 ls.pop_front();
504 }
505 }
506 start_mapping();
507}
508
// On a paxos restart (e.g. election), forget when each osd last reported;
// the timestamps are only meaningful within one leadership term.
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
513
// Tear down at shutdown: cancel any in-flight mapping job and drop all
// queued failure reports and their waiters.
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
528
// Publish current osd counts (total/up/in) and the osdmap epoch to the
// cluster perf counters.
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
538
// Shared implementation behind "ceph osd df [tree]".  Walks the crush
// hierarchy (plus stray osds not linked into it) computing per-item
// utilization, and accumulates aggregate statistics: average utilization,
// min/max variance ratio, and a weight-weighted stddev.  F is the output
// sink type (TextTable for plain text, Formatter for json/xml); concrete
// row rendering is delegated to subclasses via the pure-virtual dump_item.
template <typename F>
class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
public:
  typedef CrushTreeDumper::Dumper<F> Parent;

  OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
		       const PGMap *pgm_, bool tree_) :
    Parent(crush),
    osdmap(osdmap_),
    pgm(pgm_),
    tree(tree_),
    average_util(average_utilization()),  // computed once, up front
    min_var(-1),   // -1 == "not yet seen"
    max_var(-1),
    stddev(0),
    sum(0) {
  }

protected:
  // Emit osds that exist in the osdmap but were never visited by the
  // crush-tree walk (i.e. not attached to the hierarchy).
  void dump_stray(F *f) {
    for (int i = 0; i < osdmap->get_max_osd(); i++) {
      if (osdmap->exists(i) && !this->is_touched(i))
	dump_item(CrushTreeDumper::Item(i, 0, 0), f);
    }
  }

  // Per-item hook invoked by the tree walk: compute utilization figures,
  // delegate rendering to the subclass, and fold osd rows into the
  // aggregate statistics.
  void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
    // in flat (non-tree) mode, buckets are skipped entirely
    if (!tree && qi.is_bucket())
      return;

    // buckets have no reweight; flag with -1
    float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
    int64_t kb = 0, kb_used = 0, kb_avail = 0;
    double util = 0;
    if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_avail))
      if (kb_used && kb)
	util = 100.0 * (double)kb_used / (double)kb;

    // variance ratio vs the cluster average (1.0 == exactly average)
    double var = 1.0;
    if (average_util)
      var = util / average_util;

    size_t num_pgs = qi.is_bucket() ? 0 : pgm->get_num_pg_by_osd(qi.id);

    dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, num_pgs, f);

    // only osds with positive reweight contribute to the aggregates
    if (!qi.is_bucket() && reweight > 0) {
      if (min_var < 0 || var < min_var)
	min_var = var;
      if (max_var < 0 || var > max_var)
	max_var = var;

      // accumulate reweight-weighted squared deviation for stddev
      double dev = util - average_util;
      dev *= dev;
      stddev += reweight * dev;
      sum += reweight;
    }
  }

  // Rendering hook: subclasses emit one row/object for the item.
  virtual void dump_item(const CrushTreeDumper::Item &qi,
			 float &reweight,
			 int64_t kb,
			 int64_t kb_used,
			 int64_t kb_avail,
			 double& util,
			 double& var,
			 const size_t num_pgs,
			 F *f) = 0;

  // Weighted standard deviation of utilization across counted osds.
  double dev() {
    return sum > 0 ? sqrt(stddev / sum) : 0;
  }

  // Cluster-wide utilization percentage over all existing, weighted osds.
  double average_utilization() {
    int64_t kb = 0, kb_used = 0;
    for (int i = 0; i < osdmap->get_max_osd(); i++) {
      if (!osdmap->exists(i) || osdmap->get_weight(i) == 0)
	continue;
      int64_t kb_i, kb_used_i, kb_avail_i;
      if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_avail_i)) {
	kb += kb_i;
	kb_used += kb_used_i;
      }
    }
    return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
  }

  // Fetch one osd's stats from the pg map; false if unknown or empty.
  bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
			   int64_t* kb_avail) const {
    typedef ceph::unordered_map<int32_t,osd_stat_t> OsdStat;
    OsdStat::const_iterator p = pgm->osd_stat.find(id);
    if (p == pgm->osd_stat.end())
      return false;
    *kb = p->second.kb;
    *kb_used = p->second.kb_used;
    *kb_avail = p->second.kb_avail;
    return *kb > 0;
  }

  // Recursively sum utilization for a crush item: an osd (id >= 0)
  // contributes its own stats (out osds count as zero), a bucket
  // (id < 0) sums its children.
  bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
			      int64_t* kb_avail) const {
    if (id >= 0) {
      if (osdmap->is_out(id)) {
	*kb = 0;
	*kb_used = 0;
	*kb_avail = 0;
	return true;
      }
      return get_osd_utilization(id, kb, kb_used, kb_avail);
    }

    *kb = 0;
    *kb_used = 0;
    *kb_avail = 0;

    for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
      int item = osdmap->crush->get_bucket_item(id, k);
      int64_t kb_i = 0, kb_used_i = 0, kb_avail_i = 0;
      if (!get_bucket_utilization(item, &kb_i, &kb_used_i, &kb_avail_i))
	return false;
      *kb += kb_i;
      *kb_used += kb_used_i;
      *kb_avail += kb_avail_i;
    }
    return *kb > 0;
  }

protected:
  const OSDMap *osdmap;
  const PGMap *pgm;
  bool tree;            // include buckets (tree view) or osds only
  double average_util;  // cluster-average utilization (percent)
  double min_var;       // smallest variance ratio seen (-1 until set)
  double max_var;       // largest variance ratio seen (-1 until set)
  double stddev;        // running weighted sum of squared deviations
  double sum;           // running sum of reweights (normalizer for stddev)
};
675
// Plain-text renderer for "ceph osd df": emits a TextTable with one row
// per osd (and per bucket in tree mode), a TOTAL row, and a one-line
// MIN/MAX VAR + STDDEV summary.
class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
public:
  typedef OSDUtilizationDumper<TextTable> Parent;

  OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
		     const PGMap *pgm, bool tree) :
    Parent(crush, osdmap, pgm, tree) {}

  // Define the table columns, walk the tree (and strays), then append
  // the cluster-wide TOTAL row.
  void dump(TextTable *tbl) {
    tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("USE", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
    if (tree)
      tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);

    Parent::dump(tbl);

    dump_stray(tbl);

    // totals come from the pg map's osd_sum; kb values are shifted to bytes
    *tbl << "" << "" << "TOTAL"
	 << si_t(pgm->osd_sum.kb << 10)
	 << si_t(pgm->osd_sum.kb_used << 10)
	 << si_t(pgm->osd_sum.kb_avail << 10)
	 << lowprecision_t(average_util)
	 << ""
	 << TextTable::endrow;
  }

protected:
  // Wrapper that prints a float with reduced precision (see operator<<).
  struct lowprecision_t {
    float v;
    explicit lowprecision_t(float _v) : v(_v) {}
  };
  friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);

  using OSDUtilizationDumper<TextTable>::dump_item;
  // Render one table row for an osd or bucket.
  void dump_item(const CrushTreeDumper::Item &qi,
		 float &reweight,
		 int64_t kb,
		 int64_t kb_used,
		 int64_t kb_avail,
		 double& util,
		 double& var,
		 const size_t num_pgs,
		 TextTable *tbl) override {
    *tbl << qi.id
	 << weightf_t(qi.weight)
	 << weightf_t(reweight)
	 << si_t(kb << 10)
	 << si_t(kb_used << 10)
	 << si_t(kb_avail << 10)
	 << lowprecision_t(util)
	 << lowprecision_t(var);

    if (qi.is_bucket()) {
      *tbl << "-";
    } else {
      *tbl << num_pgs;
    }

    if (tree) {
      // indent by depth and show "<type> <name>" for buckets, "osd.N" for osds
      ostringstream name;
      for (int k = 0; k < qi.depth; k++)
	name << " ";
      if (qi.is_bucket()) {
	int type = crush->get_bucket_type(qi.id);
	name << crush->get_type_name(type) << " "
	     << crush->get_item_name(qi.id);
      } else {
	name << "osd." << qi.id;
      }
      *tbl << name.str();
    }

    *tbl << TextTable::endrow;
  }

public:
  // One-line aggregate summary printed under the table.
  string summary() {
    ostringstream out;
    out << "MIN/MAX VAR: " << lowprecision_t(min_var)
	<< "/" << lowprecision_t(max_var) << " "
	<< "STDDEV: " << lowprecision_t(dev());
    return out.str();
  }
};
768
769ostream& operator<<(ostream& out,
770 const OSDUtilizationPlainDumper::lowprecision_t& v)
771{
772 if (v.v < -0.01) {
773 return out << "-";
774 } else if (v.v < 0.001) {
775 return out << "0";
776 } else {
777 std::streamsize p = out.precision();
778 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
779 }
780}
781
// Structured (json/xml) renderer for "ceph osd df": emits a "nodes"
// array for the crush-tree walk, a "stray" array for unlinked osds, and
// a "summary" object with the aggregate statistics.
class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
public:
  typedef OSDUtilizationDumper<Formatter> Parent;

  OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
			     const PGMap *pgm, bool tree) :
    Parent(crush, osdmap, pgm, tree) {}

  void dump(Formatter *f) {
    f->open_array_section("nodes");
    Parent::dump(f);
    f->close_section();

    f->open_array_section("stray");
    dump_stray(f);
    f->close_section();
  }

protected:
  using OSDUtilizationDumper<Formatter>::dump_item;
  // Render one "item" object for an osd or bucket.
  void dump_item(const CrushTreeDumper::Item &qi,
		 float &reweight,
		 int64_t kb,
		 int64_t kb_used,
		 int64_t kb_avail,
		 double& util,
		 double& var,
		 const size_t num_pgs,
		 Formatter *f) override {
    f->open_object_section("item");
    CrushTreeDumper::dump_item_fields(crush, qi, f);
    f->dump_float("reweight", reweight);
    f->dump_int("kb", kb);
    f->dump_int("kb_used", kb_used);
    f->dump_int("kb_avail", kb_avail);
    f->dump_float("utilization", util);
    f->dump_float("var", var);
    f->dump_unsigned("pgs", num_pgs);
    CrushTreeDumper::dump_bucket_children(crush, qi, f);
    f->close_section();
  }

public:
  // Cluster-wide totals and aggregate statistics.
  void summary(Formatter *f) {
    f->open_object_section("summary");
    f->dump_int("total_kb", pgm->osd_sum.kb);
    f->dump_int("total_kb_used", pgm->osd_sum.kb_used);
    f->dump_int("total_kb_avail", pgm->osd_sum.kb_avail);
    f->dump_float("average_utilization", average_util);
    f->dump_float("min_var", min_var);
    f->dump_float("max_var", max_var);
    f->dump_float("dev", dev());
    f->close_section();
  }
};
837
838void OSDMonitor::print_utilization(ostream &out, Formatter *f, bool tree) const
839{
840 const PGMap *pgm = &mon->pgmon()->pg_map;
841 const CrushWrapper *crush = osdmap.crush.get();
842
843 if (f) {
844 f->open_object_section("df");
845 OSDUtilizationFormatDumper d(crush, &osdmap, pgm, tree);
846 d.dump(f);
847 d.summary(f);
848 f->close_section();
849 f->flush(out);
850 } else {
851 OSDUtilizationPlainDumper d(crush, &osdmap, pgm, tree);
852 TextTable tbl;
853 d.dump(&tbl);
854 out << tbl
855 << d.summary() << "\n";
856 }
857}
858
// Start a fresh pending incremental for the next epoch.  Besides the
// basics (epoch, fsid, temp cleanup), this handles upgrade transitions
// for the full/nearfull/backfillfull ratios, which moved from PGMap to
// OSDMap in luminous.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // clean up pg_temp, primary_temp
  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
  dout(10) << "create_pending did clean_temps" << dendl;

  // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
  // instead of osd_backfill_full_ratio config
  if (osdmap.backfillfull_ratio <= 0) {
    pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    // values > 1.0 are interpreted as percentages
    if (pending_inc.new_backfillfull_ratio > 1.0)
      pending_inc.new_backfillfull_ratio /= 100;
    dout(1) << __func__ << " setting backfillfull_ratio = "
	    << pending_inc.new_backfillfull_ratio << dendl;
  }
  if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
    // transition full ratios from PGMap to OSDMap (on upgrade)
    PGMap *pg_map = &mon->pgmon()->pg_map;
    if (osdmap.full_ratio != pg_map->full_ratio) {
      dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
	       << " -> " << pg_map->full_ratio << " (from pgmap)" << dendl;
      pending_inc.new_full_ratio = pg_map->full_ratio;
    }
    if (osdmap.nearfull_ratio != pg_map->nearfull_ratio) {
      dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
	       << " -> " << pg_map->nearfull_ratio << " (from pgmap)" << dendl;
      pending_inc.new_nearfull_ratio = pg_map->nearfull_ratio;
    }
  } else {
    // safety check (this shouldn't really happen)
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
	pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
	      << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
	pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
	      << pending_inc.new_nearfull_ratio << dendl;
    }
  }
}
910
// Produce the next creating_pgs state after applying incremental |inc|:
// drop PGs that finished creating, import creating PGs still tracked by
// the (pre-luminous) pgmap, purge PGs of deleted pools, and scan for PGs
// of newly created pools.  Returns the updated snapshot; the caller
// stores it back into creating_pgs.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
{
  creating_pgs_t pending_creatings;
  {
    // take a consistent snapshot; creating_pgs is shared with other threads
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  if (pending_creatings.last_scan_epoch > inc.epoch) {
    // snapshot is already newer than this incremental; nothing to do
    return pending_creatings;
  }
  // retire PGs whose creation was acked since the last update
  for (auto& pg : pending_created_pgs) {
    pending_creatings.created_pools.insert(pg.pool());
    pending_creatings.pgs.erase(pg);
  }
  pending_created_pgs.clear();
  // PAXOS_PGMAP is less than PAXOS_OSDMAP, so PGMonitor::update_from_paxos()
  // should have prepared the latest pgmap if any
  const auto& pgm = mon->pgmon()->pg_map;
  if (pgm.last_pg_scan >= creating_pgs.last_scan_epoch) {
    // TODO: please stop updating pgmap with pgstats once the upgrade is completed
    for (auto& pgid : pgm.creating_pgs) {
      auto st = pgm.pg_stat.find(pgid);
      assert(st != pgm.pg_stat.end());
      auto created = make_pair(st->second.created, st->second.last_scrub_stamp);
      // no need to add the pg, if it already exists in creating_pgs
      pending_creatings.pgs.emplace(pgid, created);
    }
  }
  // purge every tracked PG belonging to a pool deleted by this incremental
  for (auto old_pool : inc.old_pools) {
    pending_creatings.created_pools.erase(old_pool);
    const auto removed_pool = (uint64_t)old_pool;
    // [pg 0 of this pool, pg 0 of the next pool) covers the whole pool
    auto first =
      pending_creatings.pgs.lower_bound(pg_t{0, removed_pool});
    auto last =
      pending_creatings.pgs.lower_bound(pg_t{0, removed_pool + 1});
    pending_creatings.pgs.erase(first, last);
    last_epoch_clean.remove_pool(removed_pool);
  }
  // pick up PGs from pools created in the committed map and in this inc
  scan_for_creating_pgs(osdmap.get_pools(),
			inc.old_pools,
			inc.modified,
			&pending_creatings);
  scan_for_creating_pgs(inc.new_pools,
			inc.old_pools,
			inc.modified,
			&pending_creatings);
  pending_creatings.last_scan_epoch = osdmap.get_epoch();
  return pending_creatings;
}
961
// Before committing pending_inc, pre-populate pg_temp mappings for PGs
// whose acting set will change, so clients keep talking to osds that
// have the data while peering catches up.  Either prime everything
// ("all": crush change, new up osds, weight increases, or when the
// per-osd estimate is too large) or just the PGs touching the affected
// osds, bounded by mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (map<int32_t,uint8_t>::iterator p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // CEPH_OSD_UP bit toggled on a currently-up osd => it is going down
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs-per-osd (sampled from one osd) times the
    // number of affected osds; fall back to "all" if that exceeds the
    // configured fraction of total pgs
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // materialize the osdmap as it will look after pending_inc commits
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (all) {
    // parallel job over all pgs, bounded by a wall-clock timeout
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf->mon_osd_prime_pg_temp_max_time;
    // check the clock only every |chunk| pgs to keep overhead low
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  // already primed via another affected osd
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1062
// Prime pg_temp for one pg: if the pending map would change the pg's
// acting set, pin the *current* acting set as pg_temp so clients are not
// redirected to not-yet-ready osds.  Only queues the entry; the actual
// map change goes out with pending_inc.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // never prime a pg that is still being created -- its mapping is not
  // meaningful yet.  Where the creating-pg set lives depends on whether
  // the quorum requires the luminous feature.
  if (mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    if (creating_pgs.pgs.count(pgid)) {
      return;
    }
  } else {
    const auto& pg_map = mon->pgmon()->pg_map;
    if (pg_map.creating_pgs.count(pgid)) {
      return;
    }
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping, from the precomputed background mapping job
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending (next) osdmap
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting)
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // NOTE(review): presumably invoked concurrently from parallel mapper
    // jobs (see PrimeTempJob in maybe_prime_pg_temp) -- hence the lock
    // around the shared pending_inc.new_pg_temp map.
    Mutex::Locker l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1110
1111/**
1112 * @note receiving a transaction in this function gives a fair amount of
1113 * freedom to the service implementation if it does need it. It shouldn't.
1114 */
1115void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1116{
1117 dout(10) << "encode_pending e " << pending_inc.epoch
1118 << dendl;
1119
1120 // finalize up pending_inc
1121 pending_inc.modified = ceph_clock_now();
1122
1123 int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
1124 assert(r == 0);
1125
1126 if (mapping_job) {
1127 if (!mapping_job->is_done()) {
1128 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1129 << mapping_job.get() << " did not complete, "
1130 << mapping_job->shards << " left" << dendl;
1131 mapping_job->abort();
1132 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1133 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1134 << mapping_job.get() << " is prior epoch "
1135 << mapping.get_epoch() << dendl;
1136 } else {
1137 if (g_conf->mon_osd_prime_pg_temp) {
1138 maybe_prime_pg_temp();
1139 }
1140 }
1141 } else if (g_conf->mon_osd_prime_pg_temp) {
1142 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1143 << dendl;
1144 }
1145 mapping_job.reset();
1146
1147 bufferlist bl;
1148
1149 {
1150 OSDMap tmp;
1151 tmp.deepish_copy_from(osdmap);
1152 tmp.apply_incremental(pending_inc);
1153
1154 if (tmp.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
1155 // set or clear full/nearfull?
1156 int full, backfill, nearfull;
1157 tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
1158 if (full > 0) {
1159 if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
1160 dout(10) << __func__ << " setting full flag" << dendl;
1161 add_flag(CEPH_OSDMAP_FULL);
1162 remove_flag(CEPH_OSDMAP_NEARFULL);
1163 }
1164 } else {
1165 if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
1166 dout(10) << __func__ << " clearing full flag" << dendl;
1167 remove_flag(CEPH_OSDMAP_FULL);
1168 }
1169 if (nearfull > 0) {
1170 if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
1171 dout(10) << __func__ << " setting nearfull flag" << dendl;
1172 add_flag(CEPH_OSDMAP_NEARFULL);
1173 }
1174 } else {
1175 if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
1176 dout(10) << __func__ << " clearing nearfull flag" << dendl;
1177 remove_flag(CEPH_OSDMAP_NEARFULL);
1178 }
1179 }
1180 }
1181
1182 // min_compat_client?
1183 if (tmp.require_min_compat_client.empty()) {
1184 auto mv = tmp.get_min_compat_client();
1185 dout(1) << __func__ << " setting require_min_compat_client to current " << mv
1186 << dendl;
1187 mon->clog->info() << "setting require_min_compat_client to currently required "
1188 << mv;
1189 pending_inc.new_require_min_compat_client = mv.first;
1190 }
1191 }
1192 }
1193
1194 // tell me about it
1195 for (map<int32_t,uint8_t>::iterator i = pending_inc.new_state.begin();
1196 i != pending_inc.new_state.end();
1197 ++i) {
1198 int s = i->second ? i->second : CEPH_OSD_UP;
1199 if (s & CEPH_OSD_UP)
1200 dout(2) << " osd." << i->first << " DOWN" << dendl;
1201 if (s & CEPH_OSD_EXISTS)
1202 dout(2) << " osd." << i->first << " DNE" << dendl;
1203 }
1204 for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
1205 i != pending_inc.new_up_client.end();
1206 ++i) {
1207 //FIXME: insert cluster addresses too
1208 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1209 }
1210 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1211 i != pending_inc.new_weight.end();
1212 ++i) {
1213 if (i->second == CEPH_OSD_OUT) {
1214 dout(2) << " osd." << i->first << " OUT" << dendl;
1215 } else if (i->second == CEPH_OSD_IN) {
1216 dout(2) << " osd." << i->first << " IN" << dendl;
1217 } else {
1218 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1219 }
1220 }
1221
1222 // features for osdmap and its incremental
1223 uint64_t features = mon->get_quorum_con_features();
1224
1225 // encode full map and determine its crc
1226 OSDMap tmp;
1227 {
1228 tmp.deepish_copy_from(osdmap);
1229 tmp.apply_incremental(pending_inc);
1230
1231 // determine appropriate features
1232 if (!tmp.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
1233 dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
1234 << dendl;
1235 features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
1236 }
1237 if (!tmp.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL)) {
1238 dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
1239 features &= ~CEPH_FEATURE_SERVER_JEWEL;
1240 }
1241 if (!tmp.test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN)) {
1242 dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
1243 << "MSG_ADDR2" << dendl;
1244 features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1245 CEPH_FEATURE_MSG_ADDR2);
1246 }
1247 dout(10) << __func__ << " encoding full map with " << features << dendl;
1248
1249 bufferlist fullbl;
1250 ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1251 pending_inc.full_crc = tmp.get_crc();
1252
1253 // include full map in the txn. note that old monitors will
1254 // overwrite this. new ones will now skip the local full map
1255 // encode and reload from this.
1256 put_version_full(t, pending_inc.epoch, fullbl);
1257 }
1258
1259 // encode
1260 assert(get_last_committed() + 1 == pending_inc.epoch);
1261 ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1262
1263 dout(20) << " full_crc " << tmp.get_crc()
1264 << " inc_crc " << pending_inc.inc_crc << dendl;
1265
1266 /* put everything in the transaction */
1267 put_version(t, pending_inc.epoch, bl);
1268 put_last_committed(t, pending_inc.epoch);
1269
1270 // metadata, too!
1271 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1272 p != pending_metadata.end();
1273 ++p)
1274 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1275 for (set<int>::iterator p = pending_metadata_rm.begin();
1276 p != pending_metadata_rm.end();
1277 ++p)
1278 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1279 pending_metadata.clear();
1280 pending_metadata_rm.clear();
1281
1282 // and pg creating, also!
1283 if (mon->monmap->get_required_features().contains_all(
1284 ceph::features::mon::FEATURE_LUMINOUS)) {
1285 auto pending_creatings = update_pending_pgs(pending_inc);
1286 if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
1287 dout(7) << __func__ << " in the middle of upgrading, "
1288 << " trimming pending creating_pgs using pgmap" << dendl;
1289 trim_creating_pgs(&pending_creatings, mon->pgmon()->pg_map);
1290 }
1291 bufferlist creatings_bl;
1292 ::encode(pending_creatings, creatings_bl);
1293 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1294 }
1295}
1296
1297void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
1298 const PGMap& pgm)
1299{
1300 auto p = creating_pgs->pgs.begin();
1301 while (p != creating_pgs->pgs.end()) {
1302 auto q = pgm.pg_stat.find(p->first);
1303 if (q != pgm.pg_stat.end() &&
1304 !(q->second.state & PG_STATE_CREATING)) {
1305 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1306 << dendl;
1307 p = creating_pgs->pgs.erase(p);
1308 creating_pgs->created_pools.insert(q->first.pool());
1309 } else {
1310 ++p;
1311 }
1312 }
1313}
1314
1315int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1316{
1317 bufferlist bl;
1318 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1319 if (r < 0)
1320 return r;
1321 try {
1322 bufferlist::iterator p = bl.begin();
1323 ::decode(m, p);
1324 }
1325 catch (buffer::error& e) {
1326 if (err)
1327 *err << "osd." << osd << " metadata is corrupt";
1328 return -EIO;
1329 }
1330 return 0;
1331}
1332
1333int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1334{
1335 map<string, string> metadata;
1336 int r = load_metadata(osd, metadata, nullptr);
1337 if (r < 0)
1338 return r;
1339
1340 auto it = metadata.find("osd_objectstore");
1341 if (it == metadata.end())
1342 return -ENOENT;
1343 *type = it->second;
1344 return 0;
1345}
1346
1347bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1348 const pg_pool_t &pool,
1349 ostream *err)
1350{
1351 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1352 // since filestore osds could always join the pool later
1353 set<int> checked_osds;
1354 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1355 vector<int> up, acting;
1356 pg_t pgid(ps, pool_id, -1);
1357 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1358 for (int osd : up) {
1359 if (checked_osds.find(osd) != checked_osds.end())
1360 continue;
1361 string objectstore_type;
1362 int r = get_osd_objectstore_type(osd, &objectstore_type);
1363 // allow with missing metadata, e.g. due to an osd never booting yet
1364 if (r < 0 || objectstore_type == "bluestore") {
1365 checked_osds.insert(osd);
1366 continue;
1367 }
1368 *err << "osd." << osd << " uses " << objectstore_type;
1369 return false;
1370 }
1371 }
1372 return true;
1373}
1374
1375int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1376{
1377 map<string,string> m;
1378 if (int r = load_metadata(osd, m, err))
1379 return r;
1380 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1381 f->dump_string(p->first.c_str(), p->second);
1382 return 0;
1383}
1384
1385void OSDMonitor::print_nodes(Formatter *f)
1386{
1387 // group OSDs by their hosts
1388 map<string, list<int> > osds; // hostname => osd
1389 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1390 map<string, string> m;
1391 if (load_metadata(osd, m, NULL)) {
1392 continue;
1393 }
1394 map<string, string>::iterator hostname = m.find("hostname");
1395 if (hostname == m.end()) {
1396 // not likely though
1397 continue;
1398 }
1399 osds[hostname->second].push_back(osd);
1400 }
1401
1402 dump_services(f, osds, "osd");
1403}
1404
1405void OSDMonitor::share_map_with_random_osd()
1406{
1407 if (osdmap.get_num_up_osds() == 0) {
1408 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1409 return;
1410 }
1411
1412 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1413 if (!s) {
1414 dout(10) << __func__ << " no up osd on our session map" << dendl;
1415 return;
1416 }
1417
1418 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
1419 // whatev, they'll request more if they need it
1420 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
1421 s->con->send_message(m);
1422 // NOTE: do *not* record osd has up to this epoch (as we do
1423 // elsewhere) as they may still need to request older values.
1424}
1425
1426version_t OSDMonitor::get_trim_to()
1427{
1428 epoch_t floor;
1429
1430 if (mon->monmap->get_required_features().contains_all(
1431 ceph::features::mon::FEATURE_LUMINOUS)) {
1432 {
1433 std::lock_guard<std::mutex> l(creating_pgs_lock);
1434 if (!creating_pgs.pgs.empty()) {
1435 return 0;
1436 }
1437 }
1438 floor = get_min_last_epoch_clean();
1439 } else {
1440 if (!mon->pgmon()->is_readable())
1441 return 0;
1442 if (mon->pgmon()->pg_map.creating_pgs.empty()) {
1443 return 0;
1444 }
1445 floor = mon->pgmon()->pg_map.get_min_last_epoch_clean();
1446 }
1447 {
1448 dout(10) << " min_last_epoch_clean " << floor << dendl;
1449 if (g_conf->mon_osd_force_trim_to > 0 &&
1450 g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
1451 floor = g_conf->mon_osd_force_trim_to;
1452 dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
1453 }
1454 unsigned min = g_conf->mon_min_osdmap_epochs;
1455 if (floor + min > get_last_committed()) {
1456 if (min < get_last_committed())
1457 floor = get_last_committed() - min;
1458 else
1459 floor = 0;
1460 }
1461 if (floor > get_first_committed())
1462 return floor;
1463 }
1464 return 0;
1465}
1466
1467epoch_t OSDMonitor::get_min_last_epoch_clean() const
1468{
1469 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1470 // also scan osd epochs
1471 // don't trim past the oldest reported osd epoch
1472 for (auto& osd_epoch : osd_epochs) {
1473 if (osd_epoch.second < floor) {
1474 floor = osd_epoch.second;
1475 }
1476 }
1477 return floor;
1478}
1479
1480void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1481 version_t first)
1482{
1483 dout(10) << __func__ << " including full map for e " << first << dendl;
1484 bufferlist bl;
1485 get_version_full(first, bl);
1486 put_version_full(tx, first, bl);
1487}
1488
1489// -------------
1490
// Entry point for every message routed to the OSDMonitor's read path.
// Returns true when the request was fully answered without a map change;
// returns false to hand the message to prepare_update() for a write.
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    return preprocess_command(op);
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // unknown message types indicate a dispatch bug
    ceph_abort();
    return true;
  }
}
1533
// Write-path dispatcher: called for messages that preprocess_query()
// declined to answer.  Returns true when pending_inc was modified and a
// paxos proposal is needed.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    return prepare_command(op);

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // a type that reached the write path without a handler is a bug
    ceph_abort();
  }

  return false;
}
1575
// Decide whether to propose the pending map now and with what delay.
// NOTE: despite the name this is not a pure predicate -- it may fold a
// complete set of accumulated osd weights into pending_inc as a side
// effect before answering.
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  // propose as fast as possible if updating up_thru or pg_temp
  // want to merge OSDMap changes as much as possible
  if ((pending_inc.new_primary_temp.size() == 1
      || pending_inc.new_up_thru.size() == 1)
      && pending_inc.new_state.size() < 2) {
    dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;

    utime_t now = ceph_clock_now();
    // rate-limit the fast path: at most one per paxos_propose_interval,
    // and never sooner than paxos_min_wait after the last commit
    if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
	&& now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
      delay = g_conf->paxos_min_wait;
      last_attempted_minwait_time = now;
      return true;
    }
  }

  // otherwise fall back to the generic service policy
  return PaxosService::should_propose(delay);
}
1612
1613
1614
1615// ---------------------------
1616// READs
1617
1618bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
1619{
1620 op->mark_osdmon_event(__func__);
1621 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
1622 dout(10) << __func__ << " " << *m << dendl;
1623 MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
1624 epoch_t first = get_first_committed();
1625 epoch_t last = osdmap.get_epoch();
1626 int max = g_conf->osd_map_message_max;
1627 for (epoch_t e = MAX(first, m->get_full_first());
1628 e <= MIN(last, m->get_full_last()) && max > 0;
1629 ++e, --max) {
1630 int r = get_version_full(e, reply->maps[e]);
1631 assert(r >= 0);
1632 }
1633 for (epoch_t e = MAX(first, m->get_inc_first());
1634 e <= MIN(last, m->get_inc_last()) && max > 0;
1635 ++e, --max) {
1636 int r = get_version(e, reply->incremental_maps[e]);
1637 assert(r >= 0);
1638 }
1639 reply->oldest_map = first;
1640 reply->newest_map = last;
1641 mon->send_reply(op, reply);
1642 return true;
1643}
1644
1645
1646// ---------------------------
1647// UPDATEs
1648
1649// failure --
1650
1651bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1652 // check permissions
1653 MonSession *session = m->get_session();
1654 if (!session)
1655 return true;
1656 if (!session->is_capable("osd", MON_CAP_X)) {
1657 dout(0) << "got MOSDFailure from entity with insufficient caps "
1658 << session->caps << dendl;
1659 return true;
1660 }
1661 if (fsid != mon->monmap->fsid) {
1662 dout(0) << "check_source: on fsid " << fsid
1663 << " != " << mon->monmap->fsid << dendl;
1664 return true;
1665 }
1666 return false;
1667}
1668
1669
// Filter an incoming MOSDFailure.  Returns true when the report was
// fully handled here (unauthorized, stale, duplicate, or vetoed), false
// to pass it on to prepare_failure() for recording.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter unknown, at a stale address, or itself down: ignore,
      // but send it newer maps so it can catch up
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    // target already down -- nothing to do but update the reporter
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_inst(badboy) != m->get_target()) {
    // report is about an older instance of this osd id
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
	    << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    // the target went down (and perhaps came back up) since the
    // reporter's epoch, so this report is stale
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // NODOWN flag or up-ratio floor vetoes marking this osd down
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  // a genuinely new report: continue to prepare_failure()
  return false;

 didit:
  return true;
}
1729
// Completion context that acknowledges a MOSDMarkMeDown request by
// sending a MOSDMarkMeDown reply back to the requesting osd.  The reply
// carries request_ack=false so the ack is not itself re-acked.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int) override {
    // echo the original request's identity back as the ack
    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
    osdmon->mon->send_reply(
      op,
      new MOSDMarkMeDown(
	m->fsid,
	m->get_target(),
	m->get_epoch(),
	false));   // ACK itself does not request an ack
  }
  ~C_AckMarkedDown() override {
  }
};
1751
// Filter an osd's request to mark itself down.  Returns true when the
// request is rejected or ignored (still acking if one was requested, so
// the osd does not block on shutdown); false continues to
// prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int requesting_down = m->get_target().name.num();
  int from = m->get_orig_source().num();

  // check permissions
  if (check_source(m, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addr(from) != m->get_target().addr) {
    // requester unknown, already down, or at a stale address: help it
    // catch up with newer maps instead
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(requesting_down))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // ack even on rejection so the requester can proceed
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
1790
// Queue the down state change an osd requested for itself; if an ack was
// requested it is deferred until the map change commits.  Always returns
// true (pending_inc changed; a proposal is needed).
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int target_osd = m->get_target().name.num();

  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  // new_state entries act as an XOR mask against the current state, so
  // setting CEPH_OSD_UP on this (currently up) osd takes it down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
1806
1807bool OSDMonitor::can_mark_down(int i)
1808{
1809 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
1810 dout(5) << "can_mark_down NODOWN flag set, will not mark osd." << i << " down" << dendl;
1811 return false;
1812 }
1813 int num_osds = osdmap.get_num_osds();
1814 if (num_osds == 0) {
1815 dout(5) << "can_mark_down no osds" << dendl;
1816 return false;
1817 }
1818 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
1819 float up_ratio = (float)up / (float)num_osds;
1820 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
1821 dout(2) << "can_mark_down current up_ratio " << up_ratio << " < min "
1822 << g_conf->mon_osd_min_up_ratio
1823 << ", will not mark osd." << i << " down" << dendl;
1824 return false;
1825 }
1826 return true;
1827}
1828
1829bool OSDMonitor::can_mark_up(int i)
1830{
1831 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
1832 dout(5) << "can_mark_up NOUP flag set, will not mark osd." << i << " up" << dendl;
1833 return false;
1834 }
1835 return true;
1836}
1837
1838/**
1839 * @note the parameter @p i apparently only exists here so we can output the
1840 * osd's id on messages.
1841 */
1842bool OSDMonitor::can_mark_out(int i)
1843{
1844 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
1845 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
1846 return false;
1847 }
1848 int num_osds = osdmap.get_num_osds();
1849 if (num_osds == 0) {
1850 dout(5) << __func__ << " no osds" << dendl;
1851 return false;
1852 }
1853 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
1854 float in_ratio = (float)in / (float)num_osds;
1855 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
1856 if (i >= 0)
1857 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1858 << g_conf->mon_osd_min_in_ratio
1859 << ", will not mark osd." << i << " out" << dendl;
1860 else
1861 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1862 << g_conf->mon_osd_min_in_ratio
1863 << ", will not mark osds out" << dendl;
1864 return false;
1865 }
1866
1867 return true;
1868}
1869
1870bool OSDMonitor::can_mark_in(int i)
1871{
1872 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
1873 dout(5) << "can_mark_in NOIN flag set, will not mark osd." << i << " in" << dendl;
1874 return false;
1875 }
1876 return true;
1877}
1878
1879bool OSDMonitor::check_failures(utime_t now)
1880{
1881 bool found_failure = false;
1882 for (map<int,failure_info_t>::iterator p = failure_info.begin();
1883 p != failure_info.end();
1884 ++p) {
1885 if (can_mark_down(p->first)) {
1886 found_failure |= check_failure(now, p->first, p->second);
1887 }
1888 }
1889 return found_failure;
1890}
1891
// Decide whether the accumulated reports for target_osd justify marking
// it down.  The heartbeat grace is optionally stretched by the
// historical "laggy" statistics of the target and of its reporters, and
// we require reports from at least mon_osd_min_down_reporters distinct
// crush subtrees.  Returns true when a down change is queued in
// pending_inc (or already was).
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    // exponential decay constant: exp(t * decay_k) halves every halflife
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
	     << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf->mon_osd_adjust_heartbeat_grace) {
      // accumulate each reporter's laggy contribution (averaged below)
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
	   << dendl;

  if (failed_for >= grace &&
      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
	    << " down" << dendl;
    // new_state is an XOR mask: setting the UP bit marks the osd down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << osdmap.get_inst(target_osd) << " failed ("
		      << (int)reporters_by_subtree.size() << " reporters from different "
		      << reporter_subtree_level << " after "
		      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
1975
1976void OSDMonitor::force_failure(utime_t now, int target_osd)
1977{
1978 // already pending failure?
1979 if (pending_inc.new_state.count(target_osd) &&
1980 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
1981 dout(10) << " already pending failure" << dendl;
1982 return;
1983 }
1984
1985 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
1986 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1987
1988 mon->clog->info() << osdmap.get_inst(target_osd) << " failed (forced)";
1989 return;
1990}
1991
// Record (or cancel) a failure report and, if enough evidence is now
// accumulated, queue a down mark for the target.  Returns true when
// pending_inc was changed and a proposal is needed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure " << m->get_target()
	  << " from " << m->get_orig_source_inst()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // immediate reports bypass the grace-period machinery entirely
      mon->clog->debug() << m->get_target() << " reported immediately failed by "
            << m->get_orig_source_inst();
      force_failure(now, target_osd);
      return true;
    }
    mon->clog->debug() << m->get_target() << " reported failed by "
		      << m->get_orig_source_inst();

    failure_info_t& fi = failure_info[target_osd];
    // a newer report from the same reporter supersedes its older one;
    // the superseded op is dropped without a reply
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << m->get_target() << " failure report canceled by "
		       << m->get_orig_source_inst();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2054
2055void OSDMonitor::process_failures()
2056{
2057 map<int,failure_info_t>::iterator p = failure_info.begin();
2058 while (p != failure_info.end()) {
2059 if (osdmap.is_up(p->first)) {
2060 ++p;
2061 } else {
2062 dout(10) << "process_failures osd." << p->first << dendl;
2063 list<MonOpRequestRef> ls;
2064 p->second.take_report_messages(ls);
2065 failure_info.erase(p++);
2066
2067 while (!ls.empty()) {
2068 MonOpRequestRef o = ls.front();
2069 if (o) {
2070 o->mark_event(__func__);
2071 MOSDFailure *m = o->get_req<MOSDFailure>();
2072 send_latest(o, m->get_epoch());
2073 }
2074 ls.pop_front();
2075 }
2076 }
2077 }
2078}
2079
2080void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2081{
2082 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2083
2084 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2085 p != failure_info.end();
2086 ++p) {
2087 p->second.take_report_messages(ls);
2088 }
2089 failure_info.clear();
2090}
2091
2092
2093// boot --
2094
// Gatekeeping for an MOSDBoot before it reaches the leader.  Returns
// true when the message is fully handled here (dup boot acked, stale
// boot answered with maps, or silently ignored); false when
// prepare_boot() must update the map.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // must belong to our cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // check if osd has required features to boot
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // require_* osdmap flags: refuse osds older than the fleet minimum
  if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS) &&
      !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_LUMINOUS"
		      << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL) &&
      !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_JEWEL"
		      << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN) &&
      !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_KRAKEN"
		      << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
    goto ignore;
  }

  // if any pool uses GMT hitsets, the booting osd must understand them
  if (any_of(osdmap.get_pools().begin(),
	     osdmap.get_pools().end(),
	     [](const std::pair<int64_t,pg_pool_t>& pool)
	     { return pool.second.use_gmt_hitset; })) {
    assert(osdmap.get_num_up_osds() == 0 ||
	   osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
	      << m->get_orig_source_inst()
	      << " doesn't announce support -- ignore" << dendl;
      goto ignore;
    }
  }

  // make sure upgrades stop at luminous
  if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
      !osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of post-luminous OSD "
		      << m->get_orig_source_inst()
		      << " because require_luminous_osds is not set";
    goto ignore;
  }

  // make sure upgrades stop at jewel
  if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
      !osdmap.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL)) {
    mon->clog->info() << "disallowing boot of post-jewel OSD "
		      << m->get_orig_source_inst()
		      << " because require_jewel_osds is not set";
    goto ignore;
  }

  // make sure upgrades stop at hammer
  //  * HAMMER_0_94_4 is the required hammer feature
  //  * MON_METADATA is the first post-hammer feature
  if (osdmap.get_num_up_osds() > 0) {
    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
	!(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
      mon->clog->info() << "disallowing boot of post-hammer OSD "
			<< m->get_orig_source_inst()
			<< " because one or more up OSDs is pre-hammer v0.94.4";
      goto ignore;
    }
    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
			<< m->get_orig_source_inst()
			<< " because all up OSDs are post-hammer";
      goto ignore;
    }
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst() &&
      osdmap.get_cluster_addr(from) == m->cluster_addr) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
	    << " == " << osdmap.get_inst(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a boot claiming an existing id must carry the matching osd fsid
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot (sent before the osd's last up_from)?  just send maps.
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2284
// Leader-side handling of an MOSDBoot that passed preprocess_boot().
// Queues the up-mark (plus addresses, uuid, weight, metadata, laggy
// stats, and optional in-mark) in pending_inc and replies once the
// proposal commits.  Returns true when the op was consumed.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
	  << " cluster_addr " << m->cluster_addr
	  << " hb_back_addr " << m->hb_back_addr
	  << " hb_front_addr " << m->hb_front_addr
	  << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective current state = committed state xor any pending flips
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down "
	    << osdmap.get_inst(from) << dendl;
    // preprocess should have caught these; if not, assert.
    assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
	   osdmap.get_cluster_addr(from) != m->cluster_addr);
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot after the down-mark commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?  (never wrote a map; may have been recreated)
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: a boot_epoch of 0 means a clean restart
    // (decay the laggy estimate); otherwise fold in the observed
    // down-time as a new laggy interval sample.
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf->mon_osd_laggy_max_interval &&
	    (interval > g_conf->mon_osd_laggy_max_interval)) {
          interval = g_conf->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?  (automatically, per the mon_osd_auto_mark_* options)
    if ((g_conf->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-outed
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
2437
2438void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2439{
2440 op->mark_osdmon_event(__func__);
2441 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2442 dout(7) << "_booted " << m->get_orig_source_inst()
2443 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2444
2445 if (logit) {
2446 mon->clog->info() << m->get_orig_source_inst() << " boot";
2447 }
2448
2449 send_latest(op, m->sb.current_epoch+1);
2450}
2451
2452
2453// -------------
2454// full
2455
// Gatekeeping for an MOSDFull (osd reporting its nearfull/backfillfull/
// full state).  Returns true when handled here (no caps, stale sender,
// or state already matches); false when prepare_full() must flip bits.
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // only these three bits of osd state are managed via MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // sender must be the current up instance (or the most recent one if
  // the osd is down); anything else is a stale message
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
      (osdmap.is_up(from) &&
       osdmap.get_inst(from) != m->get_orig_source_inst())) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // human-readable form of the osd's current state, for logging below
  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // NOTE(review): assumes m->state carries only bits within `mask`
  // (prepare_full() masks defensively) -- confirm against MOSDFull sender
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2505
// Leader-side handling of an MOSDFull: queue the xor-delta needed to
// move the osd's {nearfull,backfillfull,full} bits to the wanted state.
// pending_inc.new_state stores bit *flips* relative to the committed
// map, which is why xor is used throughout.  Always consumes the op;
// replies once the proposal commits.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state xor any pending flips
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    // drop any pending flips of these bits, then queue the flip that
    // takes the *committed* state to want_state
    if (p != pending_inc.new_state.end()) {
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2543
2544// -------------
2545// alive
2546
// Gatekeeping for an MOSDAlive (osd asking for its up_thru to be
// bumped).  Returns true when handled here (no caps, down sender, or
// up_thru already high enough); false when prepare_alive() must act.
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // only the currently-up instance of the osd may ask
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.  the committed map already satisfies the request; just ack.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2583
2584bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2585{
2586 op->mark_osdmon_event(__func__);
2587 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2588 int from = m->get_orig_source().num();
2589
2590 if (0) { // we probably don't care much about these
2591 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2592 }
2593
2594 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2595 << " from " << m->get_orig_source_inst() << dendl;
2596
2597 update_up_thru(from, m->version); // set to the latest map the OSD has
2598 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2599 return true;
2600}
2601
2602void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2603{
2604 op->mark_osdmon_event(__func__);
2605 dout(7) << "_reply_map " << e
2606 << " from " << op->get_req()->get_orig_source_inst()
2607 << dendl;
2608 send_latest(op, e);
2609}
2610
2611// pg_created
// pg_created
// Gatekeeping for an MOSDPGCreated.  Drops the message when the sender
// has no session or lacks the osd X cap; otherwise returns false so the
// op is forwarded to the leader's prepare_pg_created().
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGCreated*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  auto session = m->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;  // consumed: nobody to reply to
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
2630
2631bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
2632{
2633 op->mark_osdmon_event(__func__);
2634 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2635 dout(10) << __func__ << " " << *m << dendl;
2636 auto src = m->get_orig_source();
2637 auto from = src.num();
2638 if (!src.is_osd() ||
2639 !mon->osdmon()->osdmap.is_up(from) ||
2640 m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
2641 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
2642 return false;
2643 }
2644 pending_created_pgs.push_back(m->pgid);
2645 return true;
2646}
2647
2648// -------------
2649// pg_temp changes
2650
2651bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
2652{
2653 MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
2654 dout(10) << "preprocess_pgtemp " << *m << dendl;
2655 mempool::osdmap::vector<int> empty;
2656 int from = m->get_orig_source().num();
2657 size_t ignore_cnt = 0;
2658
2659 // check caps
2660 MonSession *session = m->get_session();
2661 if (!session)
2662 goto ignore;
2663 if (!session->is_capable("osd", MON_CAP_X)) {
2664 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
2665 << session->caps << dendl;
2666 goto ignore;
2667 }
2668
2669 if (!osdmap.is_up(from) ||
2670 osdmap.get_inst(from) != m->get_orig_source_inst()) {
2671 dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
2672 goto ignore;
2673 }
2674
2675 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
2676 dout(20) << " " << p->first
2677 << (osdmap.pg_temp->count(p->first) ? (*osdmap.pg_temp)[p->first] : empty)
2678 << " -> " << p->second << dendl;
2679
2680 // does the pool exist?
2681 if (!osdmap.have_pg_pool(p->first.pool())) {
2682 /*
2683 * 1. If the osdmap does not have the pool, it means the pool has been
2684 * removed in-between the osd sending this message and us handling it.
2685 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
2686 * not exist in the pending either, as the osds would not send a
2687 * message about a pool they know nothing about (yet).
2688 * 3. However, if the pool does exist in the pending, then it must be a
2689 * new pool, and not relevant to this message (see 1).
2690 */
2691 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2692 << ": pool has been removed" << dendl;
2693 ignore_cnt++;
2694 continue;
2695 }
2696
2697 int acting_primary = -1;
2698 osdmap.pg_to_up_acting_osds(
2699 p->first, nullptr, nullptr, nullptr, &acting_primary);
2700 if (acting_primary != from) {
2701 /* If the source isn't the primary based on the current osdmap, we know
2702 * that the interval changed and that we can discard this message.
2703 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
2704 * which of two pg temp mappings on the same pg is more recent.
2705 */
2706 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2707 << ": primary has changed" << dendl;
2708 ignore_cnt++;
2709 continue;
2710 }
2711
2712 // removal?
2713 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
2714 osdmap.primary_temp->count(p->first)))
2715 return false;
2716 // change?
2717 // NOTE: we assume that this will clear pg_primary, so consider
2718 // an existing pg_primary field to imply a change
2719 if (p->second.size() &&
2720 (osdmap.pg_temp->count(p->first) == 0 ||
2721 !vectors_equal((*osdmap.pg_temp)[p->first], p->second) ||
2722 osdmap.primary_temp->count(p->first)))
2723 return false;
2724 }
2725
2726 // should we ignore all the pgs?
2727 if (ignore_cnt == m->pg_temp.size())
2728 goto ignore;
2729
2730 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
2731 _reply_map(op, m->map_epoch);
2732 return true;
2733
2734 ignore:
2735 return true;
2736}
2737
2738void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
2739{
2740 epoch_t old_up_thru = osdmap.get_up_thru(from);
2741 auto ut = pending_inc.new_up_thru.find(from);
2742 if (ut != pending_inc.new_up_thru.end()) {
2743 old_up_thru = ut->second;
2744 }
2745 if (up_thru > old_up_thru) {
2746 // set up_thru too, so the osd doesn't have to ask again
2747 pending_inc.new_up_thru[from] = up_thru;
2748 }
2749}
2750
// Leader-side handling of an MOSDPGTemp: queue each requested pg_temp
// mapping (skipping pools that are gone or pending removal), clear any
// primary_temp for those pgs, and bump the sender's up_thru.  Always
// consumes the op; replies once the proposal commits.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // pool may have vanished (or be queued for deletion) since
    // preprocess_pgtemp() ran; skip such entries
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
2786
2787
2788// ---
2789
// Gatekeeping for an MRemoveSnaps.  Returns false (i.e. hand to
// prepare_remove_snaps()) as soon as any requested snap is not already
// recorded as removed; returns true when nothing needs doing or the
// sender lacks the rmsnap capability.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	g_ceph_context,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap newer than the pool's snap_seq, or not yet in
      // removed_snaps, means there is real work for prepare
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
2830
// Leader-side handling of an MRemoveSnaps: for each pool, add every
// not-yet-removed snap to the pending pool's removed_snaps interval
// set, advancing snap_seq and snap_epoch as needed.  Always consumes
// the op and triggers a proposal.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // skip snaps already removed in either the committed pool or a
      // pending copy of it
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	// get_new_pool() copies the committed pool into pending on
	// first use, so later snaps in this loop see earlier inserts
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }
  return true;
}
2867
2868// osd beacon
// osd beacon
// Gatekeeping for an MOSDBeacon: drop it when the sender has no
// session or lacks the osd X cap; otherwise forward to the leader.
bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  // check caps
  auto session = beacon->get_session();
  if (!session) {
    dout(10) << __func__ << " no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // Always forward the beacon to the leader, even if they are the same as
  // the old one. The leader will mark as down osds that haven't sent
  // beacon for a few minutes.
  return false;
}
2889
2890bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
2891{
2892 op->mark_osdmon_event(__func__);
2893 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2894 const auto src = beacon->get_orig_source();
2895 dout(10) << __func__ << " " << *beacon
2896 << " from " << src << dendl;
2897 int from = src.num();
2898
2899 if (!src.is_osd() ||
2900 !osdmap.is_up(from) ||
2901 beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
2902 dout(1) << " ignoring beacon from non-active osd." << dendl;
2903 return false;
2904 }
2905
2906 last_osd_report[from] = ceph_clock_now();
2907 osd_epochs[from] = beacon->version;
2908
2909 for (const auto& pg : beacon->pgs) {
2910 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
2911 }
2912 return false;
2913}
2914
2915// ---------------
2916// map helpers
2917
2918void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
2919{
2920 op->mark_osdmon_event(__func__);
2921 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
2922 << " start " << start << dendl;
2923 if (start == 0)
2924 send_full(op);
2925 else
2926 send_incremental(op, start);
2927}
2928
2929
2930MOSDMap *OSDMonitor::build_latest_full()
2931{
2932 MOSDMap *r = new MOSDMap(mon->monmap->fsid);
2933 get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
2934 r->oldest_map = get_first_committed();
2935 r->newest_map = osdmap.get_epoch();
2936 return r;
2937}
2938
// Build an MOSDMap carrying incremental maps for epochs [from, to],
// falling back to a full map for any epoch whose incremental has been
// trimmed.  Aborts if neither form exists (we must have every epoch
// down to first_committed).  Caller owns the returned message.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
{
  dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards so `e > 0` also guards against epoch_t underflow
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, bl);
    if (err == 0) {
      assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // incremental is gone (trimmed); ship the full map instead
      assert(err == -ENOENT);
      assert(!bl.length());
      get_version_full(e, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
      dout(20) << "build_incremental   full " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
2971
2972void OSDMonitor::send_full(MonOpRequestRef op)
2973{
2974 op->mark_osdmon_event(__func__);
2975 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
2976 mon->send_reply(op, build_latest_full());
2977}
2978
2979void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
2980{
2981 op->mark_osdmon_event(__func__);
2982
2983 MonSession *s = op->get_session();
2984 assert(s);
2985
2986 if (s->proxy_con &&
2987 s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
2988 // oh, we can tell the other mon to do it
2989 dout(10) << __func__ << " asking proxying mon to send_incremental from "
2990 << first << dendl;
2991 MRoute *r = new MRoute(s->proxy_tid, NULL);
2992 r->send_osdmap_first = first;
2993 s->proxy_con->send_message(r);
2994 op->mark_event("reply: send routed send_osdmap_first reply");
2995 } else {
2996 // do it ourselves
2997 send_incremental(first, s, false, op);
2998 }
2999}
3000
3001void OSDMonitor::send_incremental(epoch_t first,
3002 MonSession *session,
3003 bool onetime,
3004 MonOpRequestRef req)
3005{
3006 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
3007 << " to " << session->inst << dendl;
3008
3009 if (first <= session->osd_epoch) {
3010 dout(10) << __func__ << session->inst << " should already have epoch "
3011 << session->osd_epoch << dendl;
3012 first = session->osd_epoch + 1;
3013 }
3014
3015 if (first < get_first_committed()) {
3016 first = get_first_committed();
3017 bufferlist bl;
3018 int err = get_version_full(first, bl);
3019 assert(err == 0);
3020 assert(bl.length());
3021
3022 dout(20) << "send_incremental starting with base full "
3023 << first << " " << bl.length() << " bytes" << dendl;
3024
3025 MOSDMap *m = new MOSDMap(osdmap.get_fsid());
3026 m->oldest_map = get_first_committed();
3027 m->newest_map = osdmap.get_epoch();
3028 m->maps[first] = bl;
3029
3030 if (req) {
3031 mon->send_reply(req, m);
3032 session->osd_epoch = first;
3033 return;
3034 } else {
3035 session->con->send_message(m);
3036 session->osd_epoch = first;
3037 }
3038 first++;
3039 }
3040
3041 while (first <= osdmap.get_epoch()) {
3042 epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
3043 osdmap.get_epoch());
3044 MOSDMap *m = build_incremental(first, last);
3045
3046 if (req) {
3047 // send some maps. it may not be all of them, but it will get them
3048 // started.
3049 mon->send_reply(req, m);
3050 } else {
3051 session->con->send_message(m);
3052 first = last + 1;
3053 }
3054 session->osd_epoch = last;
3055 if (onetime || req)
3056 break;
3057 }
3058}
3059
3060int OSDMonitor::get_version(version_t ver, bufferlist& bl)
3061{
3062 if (inc_osd_cache.lookup(ver, &bl)) {
3063 return 0;
3064 }
3065 int ret = PaxosService::get_version(ver, bl);
3066 if (!ret) {
3067 inc_osd_cache.add(ver, bl);
3068 }
3069 return ret;
3070}
3071
3072int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3073{
3074 if (full_osd_cache.lookup(ver, &bl)) {
3075 return 0;
3076 }
3077 int ret = PaxosService::get_version_full(ver, bl);
3078 if (!ret) {
3079 full_osd_cache.add(ver, bl);
3080 }
3081 return ret;
3082}
3083
// Queue a blacklist entry for address 'a' (expiring at 'until') in the
// pending incremental.  Returns the epoch at which the entry will take
// effect; the caller is responsible for proposing/committing that map.
epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
{
  dout(10) << "blacklist " << a << " until " << until << dendl;
  pending_inc.new_blacklist[a] = until;
  return pending_inc.epoch;
}
3090
3091
3092void OSDMonitor::check_osdmap_subs()
3093{
3094 dout(10) << __func__ << dendl;
3095 if (!osdmap.get_epoch()) {
3096 return;
3097 }
3098 auto osdmap_subs = mon->session_map.subs.find("osdmap");
3099 if (osdmap_subs == mon->session_map.subs.end()) {
3100 return;
3101 }
3102 auto p = osdmap_subs->second->begin();
3103 while (!p.end()) {
3104 auto sub = *p;
3105 ++p;
3106 check_osdmap_sub(sub);
3107 }
3108}
3109
3110void OSDMonitor::check_osdmap_sub(Subscription *sub)
3111{
3112 dout(10) << __func__ << " " << sub << " next " << sub->next
3113 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
3114 if (sub->next <= osdmap.get_epoch()) {
3115 if (sub->next >= 1)
3116 send_incremental(sub->next, sub->session, sub->incremental_onetime);
3117 else
3118 sub->session->con->send_message(build_latest_full());
3119 if (sub->onetime)
3120 mon->session_map.remove_sub(sub);
3121 else
3122 sub->next = osdmap.get_epoch() + 1;
3123 }
3124}
3125
3126void OSDMonitor::check_pg_creates_subs()
3127{
3128 if (!mon->monmap->get_required_features().contains_all(
3129 ceph::features::mon::FEATURE_LUMINOUS)) {
3130 // PGMonitor takes care of this in pre-luminous era.
3131 return;
3132 }
3133 if (!osdmap.get_num_up_osds()) {
3134 return;
3135 }
3136 assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
3137 mon->with_session_map([this](const MonSessionMap& session_map) {
3138 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
3139 if (pg_creates_subs == session_map.subs.end()) {
3140 return;
3141 }
3142 for (auto sub : *pg_creates_subs->second) {
3143 check_pg_creates_sub(sub);
3144 }
3145 });
3146}
3147
3148void OSDMonitor::check_pg_creates_sub(Subscription *sub)
3149{
3150 dout(20) << __func__ << " .. " << sub->session->inst << dendl;
3151 assert(sub->type == "osd_pg_creates");
3152 // only send these if the OSD is up. we will check_subs() when they do
3153 // come up so they will get the creates then.
3154 if (sub->session->inst.name.is_osd() &&
3155 mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
3156 sub->next = send_pg_creates(sub->session->inst.name.num(),
3157 sub->session->con.get(),
3158 sub->next);
3159 }
3160}
3161
// Scan 'pools' for pools changed since the last scan and register any
// of their pgs not yet tracked in 'creating_pgs'.  Pools that are
// unchanged, being removed, or already fully created are skipped.
// 'modified' is recorded as each new pg's creation timestamp.
void OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  for (auto& p : pools) {
    int64_t poolid = p.first;
    const pg_pool_t& pool = p.second;
    // skip pools whose crush rule cannot be resolved
    int ruleno = osdmap.crush->find_rule(pool.get_crush_ruleset(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " scanning pool " << poolid
	     << " " << pool << dendl;
    if (creating_pgs->created_pools.count(poolid)) {
      // split pgs are skipped by OSD, so drop it early.
      continue;
    }
    // first pgs in this pool
    for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      if (creating_pgs->pgs.count(pgid)) {
	dout(20) << __func__ << " already have " << pgid << dendl;
	continue;
      }
      // record (creating epoch, creation time) for this pg
      creating_pgs->pgs.emplace(pgid, make_pair(created, modified));
      dout(10) << __func__ << " adding " << pgid
	       << " at " << osdmap.get_epoch() << dendl;
    }
  }
}
3207
// Rebuild creating_pgs_by_osd_epoch: for every pg still being created,
// find its current acting primary and the epoch from which the create
// message should be sent.  If a pg's target primary changed since the
// previous pass, the epoch is bumped so the new primary gets a fresh
// create.  All state touched here is guarded by creating_pgs_lock.
void OSDMonitor::update_creating_pgs()
{
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    auto mapped = pg.second.first;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(pgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target: keep the epoch it was previously queued under
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3243
// Send pg-create messages to 'osd' over 'con' for all pending creates
// queued at epoch >= 'next'.  Returns the epoch the subscriber is
// current through (last sent + 1), or 'next' unchanged when there was
// nothing to send.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *m = nullptr;
  epoch_t last = 0;
  // batch every epoch >= next into a single MOSDPGCreate message
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      if (!m)
	m = new MOSDPGCreate(creating_pgs_epoch);
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      const auto& creation = creating_pgs.pgs[pg];
      m->mkpg.emplace(pg, pg_create_t{creation.first, pg, 0});
      m->ctimes.emplace(pg, creation.second);
      dout(20) << __func__ << " will create " << pg
	       << " at " << creation.first << dendl;
    }
  }
  if (!m) {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }
  con->send_message(m);
  // sub is current through last + 1
  return last + 1;
}
3284
3285// TICK
3286
3287
// Periodic leader-side maintenance: time out silent OSDs, mark
// long-down OSDs out after a grace period, expire blacklist entries,
// and (pre-luminous) maintain the FULL/NEARFULL map flags.  Proposes a
// new map epoch when anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // only the leader drives map changes
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // luminous+: beacons (tracked in last_osd_report) carry osd liveness
  if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS) &&
      mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (g_conf->mon_osd_down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
		       << " subtree for osd." << o << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	if (g_conf->mon_osd_down_out_interval > 0 &&
	    down.sec() >= grace) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "osd." << o << " out (down for " << down << ")";
	} else
	  continue;
      }

      // reached when the osd was marked out, or is no longer down+in
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS) &&
      mon->pgmon()->is_readable()) {
    // for pre-luminous compat only!
    if (!mon->pgmon()->pg_map.full_osds.empty()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (!mon->pgmon()->pg_map.nearfull_osds.empty()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    if (pending_inc.new_flags != -1 &&
	(pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
	(pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
	(pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
	      << " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3437
3438bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
3439 std::map<int,utime_t> &last_osd_report)
3440{
3441 utime_t timeo(g_conf->mon_osd_report_timeout, 0);
3442 if (now - mon->get_leader_since() < timeo) {
3443 // We haven't been the leader for long enough to consider OSD timeouts
3444 return false;
3445 }
3446
3447 int max_osd = osdmap.get_max_osd();
3448 bool new_down = false;
3449
3450 for (int i=0; i < max_osd; ++i) {
3451 dout(30) << __func__ << ": checking up on osd " << i << dendl;
3452 if (!osdmap.is_up(i))
3453 continue;
3454 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
3455 if (t == last_osd_report.end()) {
3456 // it wasn't in the map; start the timer.
3457 last_osd_report[i] = now;
3458 } else if (can_mark_down(i)) {
3459 utime_t diff = now - t->second;
3460 if (diff > timeo) {
3461 mon->clog->info() << "osd." << i << " marked down after no pg stats for " << diff << "seconds";
3462 derr << "no osd or pg stats from osd." << i << " since " << t->second << ", " << diff
3463 << " seconds ago. marking down" << dendl;
3464 pending_inc.new_state[i] = CEPH_OSD_UP;
3465 new_down = true;
3466 }
3467 }
3468 }
3469 return new_down;
3470}
3471
// Build health summary (and optional per-item detail) entries for the
// OSD subsystem: down/in counts, stray crush entries, full-ratio
// ordering and full/nearfull OSDs (luminous+), warning flags, legacy
// crush tunables, cache pools without hit_sets, missing upgrade flags,
// and pool-level health.
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
			    list<pair<health_status_t,string> > *detail,
			    CephContext *cct) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    set<int> osds;  // ids present in crush but absent from the osdmap
    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i)) {
	if (osdmap.crush->item_exists(i)) {
	  osds.insert(i);
	}
	continue;
      }
      if (osdmap.is_out(i))
	continue;
      ++num_in_osds;
      if (!osdmap.is_up(i)) {
	++num_down_in_osds;
	if (detail) {
	  const osd_info_t& info = osdmap.get_info(i);
	  ostringstream ss;
	  ss << "osd." << i << " is down since epoch " << info.down_at
	     << ", last address " << osdmap.get_addr(i);
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      ostringstream ss;
      ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    if (!osds.empty()) {
      ostringstream ss;
      ss << "osds were removed from osdmap, but still kept in crushmap";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << " osds: [" << osds << "]";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
      // An osd could configure failsafe ratio, to something different
      // but for now assume it is the same here.
      float fsr = g_conf->osd_failsafe_full_ratio;
      if (fsr > 1.0) fsr /= 100;  // accept percentage-style config values
      float fr = osdmap.get_full_ratio();
      float br = osdmap.get_backfillfull_ratio();
      float nr = osdmap.get_nearfull_ratio();

      bool out_of_order = false;
      // These checks correspond to how OSDService::check_full_status() in an OSD
      // handles the improper setting of these values.
      if (br < nr) {
	out_of_order = true;
	if (detail) {
	  ostringstream ss;
	  ss << "backfillfull_ratio (" << br << ") < nearfull_ratio (" << nr << "), increased";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
	br = nr;
      }
      if (fr < br) {
	out_of_order = true;
	if (detail) {
	  ostringstream ss;
	  ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br << "), increased";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
	fr = br;
      }
      if (fsr < fr) {
	out_of_order = true;
	if (detail) {
	  ostringstream ss;
	  ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr << "), increased";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
      }
      if (out_of_order) {
	ostringstream ss;
	ss << "Full ratio(s) out of order";
	summary.push_back(make_pair(HEALTH_ERR, ss.str()));
      }

      map<int, float> full, backfillfull, nearfull;
      osdmap.get_full_osd_util(mon->pgmon()->pg_map.osd_stat, &full, &backfillfull, &nearfull);
      if (full.size()) {
	ostringstream ss;
	ss << full.size() << " full osd(s)";
	summary.push_back(make_pair(HEALTH_ERR, ss.str()));
      }
      if (backfillfull.size()) {
	ostringstream ss;
	ss << backfillfull.size() << " backfillfull osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
      if (nearfull.size()) {
	ostringstream ss;
	ss << nearfull.size() << " nearfull osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
      if (detail) {
	for (auto& i: full) {
	  ostringstream ss;
	  ss << "osd." << i.first << " is full at " << roundf(i.second * 100) << "%";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
	for (auto& i: backfillfull) {
	  ostringstream ss;
	  ss << "osd." << i.first << " is backfill full at " << roundf(i.second * 100) << "%";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
	for (auto& i: nearfull) {
	  ostringstream ss;
	  ss << "osd." << i.first << " is near full at " << roundf(i.second * 100) << "%";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    // note: we leave it to ceph-mgr to generate details health warnings
    // with actual osd utilizations

    // warn about flags
    uint64_t warn_flags =
      CEPH_OSDMAP_FULL |
      CEPH_OSDMAP_PAUSERD |
      CEPH_OSDMAP_PAUSEWR |
      CEPH_OSDMAP_PAUSEREC |
      CEPH_OSDMAP_NOUP |
      CEPH_OSDMAP_NODOWN |
      CEPH_OSDMAP_NOIN |
      CEPH_OSDMAP_NOOUT |
      CEPH_OSDMAP_NOBACKFILL |
      CEPH_OSDMAP_NORECOVER |
      CEPH_OSDMAP_NOSCRUB |
      CEPH_OSDMAP_NODEEP_SCRUB |
      CEPH_OSDMAP_NOTIERAGENT |
      CEPH_OSDMAP_NOREBALANCE;
    if (osdmap.test_flag(warn_flags)) {
      ostringstream ss;
      ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
	 << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      string min = osdmap.crush->get_min_required_version();
      if (min < g_conf->mon_crush_min_required_version) {
	ostringstream ss;
	ss << "crush map has legacy tunables (require " << min
	   << ", min is " << g_conf->mon_crush_min_required_version << ")";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
      if (osdmap.crush->get_straw_calc_version() == 0) {
	ostringstream ss;
	ss << "crush map has straw_calc_version=0";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // hit_set-less cache_mode?
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
	   p != osdmap.pools.end();
	   ++p) {
	const pg_pool_t& info = p->second;
	if (info.cache_mode_requires_hit_set() &&
	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
	  ++problem_cache_pools;
	  if (detail) {
	    ostringstream ss;
	    ss << "pool '" << osdmap.get_pool_name(p->first)
	       << "' with cache_mode " << info.get_cache_mode_name()
	       << " needs hit_set_type to be set but it is not";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
      }
      if (problem_cache_pools) {
	ostringstream ss;
	ss << problem_cache_pools << " cache pools are missing hit_sets";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // Not using 'sortbitwise' and should be?
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
	(osdmap.get_up_osd_features() &
	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
      ostringstream ss;
      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // Warn if 'mon_osd_down_out_interval' is set to zero.
    // Having this option set to zero on the leader acts much like the
    // 'noout' flag. It's hard to figure out what's going wrong with clusters
    // without the 'noout' flag set but acting like that just the same, so
    // we report a HEALTH_WARN in case this option is set to zero.
    // This is an ugly hack to get the warning out, but until we find a way
    // to spread global options throughout the mon cluster and have all mons
    // using a base set of the same options, we need to work around this sort
    // of things.
    // There's also the obvious drawback that if this is set on a single
    // monitor on a 3-monitor cluster, this warning will only be shown every
    // third monitor connection.
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
	g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << "; this has the same effect as the 'noout' flag";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // warn about upgrade flags that can be set but are not.
    if (g_conf->mon_debug_no_require_luminous) {
      // ignore these checks
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
	       !osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
      string msg = "all OSDs are running luminous or later but the"
	" 'require_luminous_osds' osdmap flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
	       !osdmap.test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN)) {
      string msg = "all OSDs are running kraken or later but the"
	" 'require_kraken_osds' osdmap flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
	       !osdmap.test_flag(CEPH_OSDMAP_REQUIRE_JEWEL)) {
      string msg = "all OSDs are running jewel or later but the"
	" 'require_jewel_osds' osdmap flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    }

    get_pools_health(summary, detail);
  }
}
3745
3746void OSDMonitor::dump_info(Formatter *f)
3747{
3748 f->open_object_section("osdmap");
3749 osdmap.dump(f);
3750 f->close_section();
3751
3752 f->open_array_section("osd_metadata");
3753 for (int i=0; i<osdmap.get_max_osd(); ++i) {
3754 if (osdmap.exists(i)) {
3755 f->open_object_section("osd");
3756 f->dump_unsigned("id", i);
3757 dump_osd_metadata(i, f, NULL);
3758 f->close_section();
3759 }
3760 }
3761 f->close_section();
3762
3763 f->dump_unsigned("osdmap_first_committed", get_first_committed());
3764 f->dump_unsigned("osdmap_last_committed", get_last_committed());
3765
3766 f->open_object_section("crushmap");
3767 osdmap.crush->dump(f);
3768 f->close_section();
3769}
3770
namespace {
  // Identifiers for the variables accepted by "osd pool get".  The
  // declaration order defines each enumerator's value; do not reorder.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, CRUSH_RULESET, HASHPSPOOL,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Return the set difference first \ second, i.e. every choice in
  // 'first' that does not also appear in 'second'.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto& choice : first) {
      if (second.count(choice) == 0)
	result.insert(choice);
    }
    return result;
  }
}
3802
3803
3804bool OSDMonitor::preprocess_command(MonOpRequestRef op)
3805{
3806 op->mark_osdmon_event(__func__);
3807 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
3808 int r = 0;
3809 bufferlist rdata;
3810 stringstream ss, ds;
3811
3812 map<string, cmd_vartype> cmdmap;
3813 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
3814 string rs = ss.str();
3815 mon->reply_command(op, -EINVAL, rs, get_last_committed());
3816 return true;
3817 }
3818
3819 MonSession *session = m->get_session();
3820 if (!session) {
3821 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
3822 return true;
3823 }
3824
3825 string prefix;
3826 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
3827
3828 string format;
3829 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
3830 boost::scoped_ptr<Formatter> f(Formatter::create(format));
3831
3832 if (prefix == "osd stat") {
3833 osdmap.print_summary(f.get(), ds);
3834 if (f)
3835 f->flush(rdata);
3836 else
3837 rdata.append(ds);
3838 }
3839 else if (prefix == "osd perf" ||
3840 prefix == "osd blocked-by") {
3841 const PGMap &pgm = mon->pgmon()->pg_map;
3842 r = process_pg_map_command(prefix, cmdmap, pgm, osdmap,
3843 f.get(), &ss, &rdata);
3844 }
3845 else if (prefix == "osd dump" ||
3846 prefix == "osd tree" ||
3847 prefix == "osd ls" ||
3848 prefix == "osd getmap" ||
3849 prefix == "osd getcrushmap") {
3850 string val;
3851
3852 epoch_t epoch = 0;
3853 int64_t epochnum;
3854 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
3855 epoch = epochnum;
3856
3857 bufferlist osdmap_bl;
3858 int err = get_version_full(epoch, osdmap_bl);
3859 if (err == -ENOENT) {
3860 r = -ENOENT;
3861 ss << "there is no map for epoch " << epoch;
3862 goto reply;
3863 }
3864 assert(err == 0);
3865 assert(osdmap_bl.length());
3866
3867 OSDMap *p;
3868 if (epoch == osdmap.get_epoch()) {
3869 p = &osdmap;
3870 } else {
3871 p = new OSDMap;
3872 p->decode(osdmap_bl);
3873 }
3874
3875 if (prefix == "osd dump") {
3876 stringstream ds;
3877 if (f) {
3878 f->open_object_section("osdmap");
3879 p->dump(f.get());
3880 f->close_section();
3881 f->flush(ds);
3882 } else {
3883 p->print(ds);
3884 }
3885 rdata.append(ds);
3886 if (!f)
3887 ds << " ";
3888 } else if (prefix == "osd ls") {
3889 if (f) {
3890 f->open_array_section("osds");
3891 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3892 if (osdmap.exists(i)) {
3893 f->dump_int("osd", i);
3894 }
3895 }
3896 f->close_section();
3897 f->flush(ds);
3898 } else {
3899 bool first = true;
3900 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3901 if (osdmap.exists(i)) {
3902 if (!first)
3903 ds << "\n";
3904 first = false;
3905 ds << i;
3906 }
3907 }
3908 }
3909 rdata.append(ds);
3910 } else if (prefix == "osd tree") {
3911 if (f) {
3912 f->open_object_section("tree");
3913 p->print_tree(f.get(), NULL);
3914 f->close_section();
3915 f->flush(ds);
3916 } else {
3917 p->print_tree(NULL, &ds);
3918 }
3919 rdata.append(ds);
3920 } else if (prefix == "osd getmap") {
3921 rdata.append(osdmap_bl);
3922 ss << "got osdmap epoch " << p->get_epoch();
3923 } else if (prefix == "osd getcrushmap") {
3924 p->crush->encode(rdata, mon->get_quorum_con_features());
3925 ss << "got crush map from osdmap epoch " << p->get_epoch();
3926 }
3927 if (p != &osdmap)
3928 delete p;
3929 } else if (prefix == "osd df") {
3930 string method;
3931 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
3932 print_utilization(ds, f ? f.get() : NULL, method == "tree");
3933 rdata.append(ds);
3934 } else if (prefix == "osd getmaxosd") {
3935 if (f) {
3936 f->open_object_section("getmaxosd");
3937 f->dump_unsigned("epoch", osdmap.get_epoch());
3938 f->dump_int("max_osd", osdmap.get_max_osd());
3939 f->close_section();
3940 f->flush(rdata);
3941 } else {
3942 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
3943 rdata.append(ds);
3944 }
3945 } else if (prefix == "osd utilization") {
3946 string out;
3947 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
3948 if (f)
3949 f->flush(rdata);
3950 else
3951 rdata.append(out);
3952 r = 0;
3953 goto reply;
3954 } else if (prefix == "osd find") {
3955 int64_t osd;
3956 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
3957 ss << "unable to parse osd id value '"
3958 << cmd_vartype_stringify(cmdmap["id"]) << "'";
3959 r = -EINVAL;
3960 goto reply;
3961 }
3962 if (!osdmap.exists(osd)) {
3963 ss << "osd." << osd << " does not exist";
3964 r = -ENOENT;
3965 goto reply;
3966 }
3967 string format;
3968 cmd_getval(g_ceph_context, cmdmap, "format", format);
3969 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
3970 f->open_object_section("osd_location");
3971 f->dump_int("osd", osd);
3972 f->dump_stream("ip") << osdmap.get_addr(osd);
3973 f->open_object_section("crush_location");
3974 map<string,string> loc = osdmap.crush->get_full_location(osd);
3975 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
3976 f->dump_string(p->first.c_str(), p->second);
3977 f->close_section();
3978 f->close_section();
3979 f->flush(rdata);
3980 } else if (prefix == "osd metadata") {
3981 int64_t osd = -1;
3982 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
3983 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
3984 ss << "unable to parse osd id value '"
3985 << cmd_vartype_stringify(cmdmap["id"]) << "'";
3986 r = -EINVAL;
3987 goto reply;
3988 }
3989 if (osd >= 0 && !osdmap.exists(osd)) {
3990 ss << "osd." << osd << " does not exist";
3991 r = -ENOENT;
3992 goto reply;
3993 }
3994 string format;
3995 cmd_getval(g_ceph_context, cmdmap, "format", format);
3996 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
3997 if (osd >= 0) {
3998 f->open_object_section("osd_metadata");
3999 f->dump_unsigned("id", osd);
4000 r = dump_osd_metadata(osd, f.get(), &ss);
4001 if (r < 0)
4002 goto reply;
4003 f->close_section();
4004 } else {
4005 r = 0;
4006 f->open_array_section("osd_metadata");
4007 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4008 if (osdmap.exists(i)) {
4009 f->open_object_section("osd");
4010 f->dump_unsigned("id", i);
4011 r = dump_osd_metadata(i, f.get(), NULL);
4012 if (r == -EINVAL || r == -ENOENT) {
4013 // Drop error, continue to get other daemons' metadata
4014 dout(4) << "No metadata for osd." << i << dendl;
4015 r = 0;
4016 } else if (r < 0) {
4017 // Unexpected error
4018 goto reply;
4019 }
4020 f->close_section();
4021 }
4022 }
4023 f->close_section();
4024 }
4025 f->flush(rdata);
4026 } else if (prefix == "osd map") {
4027 string poolstr, objstr, namespacestr;
4028 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4029 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4030 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4031
4032 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4033 if (pool < 0) {
4034 ss << "pool " << poolstr << " does not exist";
4035 r = -ENOENT;
4036 goto reply;
4037 }
4038 object_locator_t oloc(pool, namespacestr);
4039 object_t oid(objstr);
4040 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4041 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4042 vector<int> up, acting;
4043 int up_p, acting_p;
4044 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4045
4046 string fullobjname;
4047 if (!namespacestr.empty())
4048 fullobjname = namespacestr + string("/") + oid.name;
4049 else
4050 fullobjname = oid.name;
4051 if (f) {
4052 f->open_object_section("osd_map");
4053 f->dump_unsigned("epoch", osdmap.get_epoch());
4054 f->dump_string("pool", poolstr);
4055 f->dump_int("pool_id", pool);
4056 f->dump_stream("objname") << fullobjname;
4057 f->dump_stream("raw_pgid") << pgid;
4058 f->dump_stream("pgid") << mpgid;
4059 f->open_array_section("up");
4060 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4061 f->dump_int("osd", *p);
4062 f->close_section();
4063 f->dump_int("up_primary", up_p);
4064 f->open_array_section("acting");
4065 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4066 f->dump_int("osd", *p);
4067 f->close_section();
4068 f->dump_int("acting_primary", acting_p);
4069 f->close_section(); // osd_map
4070 f->flush(rdata);
4071 } else {
4072 ds << "osdmap e" << osdmap.get_epoch()
4073 << " pool '" << poolstr << "' (" << pool << ")"
4074 << " object '" << fullobjname << "' ->"
4075 << " pg " << pgid << " (" << mpgid << ")"
4076 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4077 << pg_vector_string(acting) << ", p" << acting_p << ")";
4078 rdata.append(ds);
4079 }
4080
4081 } else if (prefix == "pg map") {
4082 pg_t pgid;
4083 string pgidstr;
4084 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4085 if (!pgid.parse(pgidstr.c_str())) {
4086 ss << "invalid pgid '" << pgidstr << "'";
4087 r = -EINVAL;
4088 goto reply;
4089 }
4090 vector<int> up, acting;
4091 if (!osdmap.have_pg_pool(pgid.pool())) {
4092 ss << "pg '" << pgidstr << "' does not exist";
4093 r = -ENOENT;
4094 goto reply;
4095 }
4096 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4097 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4098 if (f) {
4099 f->open_object_section("pg_map");
4100 f->dump_unsigned("epoch", osdmap.get_epoch());
4101 f->dump_stream("raw_pgid") << pgid;
4102 f->dump_stream("pgid") << mpgid;
4103 f->open_array_section("up");
4104 for (auto osd : up) {
4105 f->dump_int("up_osd", osd);
4106 }
4107 f->close_section();
4108 f->open_array_section("acting");
4109 for (auto osd : acting) {
4110 f->dump_int("acting_osd", osd);
4111 }
4112 f->close_section();
4113 f->close_section();
4114 f->flush(rdata);
4115 } else {
4116 ds << "osdmap e" << osdmap.get_epoch()
4117 << " pg " << pgid << " (" << mpgid << ")"
4118 << " -> up " << up << " acting " << acting;
4119 rdata.append(ds);
4120 }
4121 goto reply;
4122
4123 } else if ((prefix == "osd scrub" ||
4124 prefix == "osd deep-scrub" ||
4125 prefix == "osd repair")) {
4126 string whostr;
4127 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4128 vector<string> pvec;
4129 get_str_vec(prefix, pvec);
4130
4131 if (whostr == "*") {
4132 ss << "osds ";
4133 int c = 0;
4134 for (int i = 0; i < osdmap.get_max_osd(); i++)
4135 if (osdmap.is_up(i)) {
4136 ss << (c++ ? "," : "") << i;
4137 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4138 pvec.back() == "repair",
4139 pvec.back() == "deep-scrub"),
4140 osdmap.get_inst(i));
4141 }
4142 r = 0;
4143 ss << " instructed to " << pvec.back();
4144 } else {
4145 long osd = parse_osd_id(whostr.c_str(), &ss);
4146 if (osd < 0) {
4147 r = -EINVAL;
4148 } else if (osdmap.is_up(osd)) {
4149 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4150 pvec.back() == "repair",
4151 pvec.back() == "deep-scrub"),
4152 osdmap.get_inst(osd));
4153 ss << "osd." << osd << " instructed to " << pvec.back();
4154 } else {
4155 ss << "osd." << osd << " is not up";
4156 r = -EAGAIN;
4157 }
4158 }
4159 } else if (prefix == "osd lspools") {
4160 int64_t auid;
4161 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4162 if (f)
4163 f->open_array_section("pools");
4164 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4165 p != osdmap.pools.end();
4166 ++p) {
4167 if (!auid || p->second.auid == (uint64_t)auid) {
4168 if (f) {
4169 f->open_object_section("pool");
4170 f->dump_int("poolnum", p->first);
4171 f->dump_string("poolname", osdmap.pool_name[p->first]);
4172 f->close_section();
4173 } else {
4174 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4175 }
4176 }
4177 }
4178 if (f) {
4179 f->close_section();
4180 f->flush(ds);
4181 }
4182 rdata.append(ds);
4183 } else if (prefix == "osd blacklist ls") {
4184 if (f)
4185 f->open_array_section("blacklist");
4186
4187 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4188 p != osdmap.blacklist.end();
4189 ++p) {
4190 if (f) {
4191 f->open_object_section("entry");
4192 f->dump_stream("addr") << p->first;
4193 f->dump_stream("until") << p->second;
4194 f->close_section();
4195 } else {
4196 stringstream ss;
4197 string s;
4198 ss << p->first << " " << p->second;
4199 getline(ss, s);
4200 s += "\n";
4201 rdata.append(s);
4202 }
4203 }
4204 if (f) {
4205 f->close_section();
4206 f->flush(rdata);
4207 }
4208 ss << "listed " << osdmap.blacklist.size() << " entries";
4209
4210 } else if (prefix == "osd pool ls") {
4211 string detail;
4212 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4213 if (!f && detail == "detail") {
4214 ostringstream ss;
4215 osdmap.print_pools(ss);
4216 rdata.append(ss.str());
4217 } else {
4218 if (f)
4219 f->open_array_section("pools");
4220 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4221 it != osdmap.get_pools().end();
4222 ++it) {
4223 if (f) {
4224 if (detail == "detail") {
4225 f->open_object_section("pool");
4226 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4227 it->second.dump(f.get());
4228 f->close_section();
4229 } else {
4230 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4231 }
4232 } else {
4233 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4234 }
4235 }
4236 if (f) {
4237 f->close_section();
4238 f->flush(rdata);
4239 }
4240 }
4241
4242 } else if (prefix == "osd crush get-tunable") {
4243 string tunable;
4244 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4245 ostringstream rss;
4246 if (f)
4247 f->open_object_section("tunable");
4248 if (tunable == "straw_calc_version") {
4249 if (f)
4250 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4251 else
4252 rss << osdmap.crush->get_straw_calc_version() << "\n";
4253 } else {
4254 r = -EINVAL;
4255 goto reply;
4256 }
4257 if (f) {
4258 f->close_section();
4259 f->flush(rdata);
4260 } else {
4261 rdata.append(rss.str());
4262 }
4263 r = 0;
4264
4265 } else if (prefix == "osd pool get") {
4266 string poolstr;
4267 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4268 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4269 if (pool < 0) {
4270 ss << "unrecognized pool '" << poolstr << "'";
4271 r = -ENOENT;
4272 goto reply;
4273 }
4274
4275 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4276 string var;
4277 cmd_getval(g_ceph_context, cmdmap, "var", var);
4278
4279 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4280 const choices_map_t ALL_CHOICES = {
4281 {"size", SIZE},
4282 {"min_size", MIN_SIZE},
4283 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4284 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
4285 {"crush_rule", CRUSH_RULE},
4286 {"crush_ruleset", CRUSH_RULESET},
4287 {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
4288 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4289 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4290 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4291 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4292 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4293 {"use_gmt_hitset", USE_GMT_HITSET},
4294 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4295 {"target_max_bytes", TARGET_MAX_BYTES},
4296 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4297 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4298 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4299 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4300 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4301 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4302 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4303 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4304 {"fast_read", FAST_READ},
4305 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4306 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4307 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4308 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4309 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4310 {"recovery_priority", RECOVERY_PRIORITY},
4311 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4312 {"scrub_priority", SCRUB_PRIORITY},
4313 {"compression_mode", COMPRESSION_MODE},
4314 {"compression_algorithm", COMPRESSION_ALGORITHM},
4315 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4316 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4317 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4318 {"csum_type", CSUM_TYPE},
4319 {"csum_max_block", CSUM_MAX_BLOCK},
4320 {"csum_min_block", CSUM_MIN_BLOCK},
4321 };
4322
4323 typedef std::set<osd_pool_get_choices> choices_set_t;
4324
4325 const choices_set_t ONLY_TIER_CHOICES = {
4326 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4327 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4328 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4329 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4330 MIN_READ_RECENCY_FOR_PROMOTE,
4331 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4332 };
4333 const choices_set_t ONLY_ERASURE_CHOICES = {
4334 ERASURE_CODE_PROFILE
4335 };
4336
4337 choices_set_t selected_choices;
4338 if (var == "all") {
4339 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4340 it != ALL_CHOICES.end(); ++it) {
4341 selected_choices.insert(it->second);
4342 }
4343
4344 if(!p->is_tier()) {
4345 selected_choices = subtract_second_from_first(selected_choices,
4346 ONLY_TIER_CHOICES);
4347 }
4348
4349 if(!p->is_erasure()) {
4350 selected_choices = subtract_second_from_first(selected_choices,
4351 ONLY_ERASURE_CHOICES);
4352 }
4353 } else /* var != "all" */ {
4354 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4355 osd_pool_get_choices selected = found->second;
4356
4357 if (!p->is_tier() &&
4358 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4359 ss << "pool '" << poolstr
4360 << "' is not a tier pool: variable not applicable";
4361 r = -EACCES;
4362 goto reply;
4363 }
4364
4365 if (!p->is_erasure() &&
4366 ONLY_ERASURE_CHOICES.find(selected)
4367 != ONLY_ERASURE_CHOICES.end()) {
4368 ss << "pool '" << poolstr
4369 << "' is not a erasure pool: variable not applicable";
4370 r = -EACCES;
4371 goto reply;
4372 }
4373
4374 selected_choices.insert(selected);
4375 }
4376
4377 if (f) {
4378 for(choices_set_t::const_iterator it = selected_choices.begin();
4379 it != selected_choices.end(); ++it) {
4380 choices_map_t::const_iterator i;
4381 f->open_object_section("pool");
4382 f->dump_string("pool", poolstr);
4383 f->dump_int("pool_id", pool);
4384 switch(*it) {
4385 case PG_NUM:
4386 f->dump_int("pg_num", p->get_pg_num());
4387 break;
4388 case PGP_NUM:
4389 f->dump_int("pgp_num", p->get_pgp_num());
4390 break;
4391 case AUID:
4392 f->dump_int("auid", p->get_auid());
4393 break;
4394 case SIZE:
4395 f->dump_int("size", p->get_size());
4396 break;
4397 case MIN_SIZE:
4398 f->dump_int("min_size", p->get_min_size());
4399 break;
4400 case CRASH_REPLAY_INTERVAL:
4401 f->dump_int("crash_replay_interval",
4402 p->get_crash_replay_interval());
4403 break;
4404 case CRUSH_RULE:
4405 if (osdmap.crush->rule_exists(p->get_crush_ruleset())) {
4406 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
4407 p->get_crush_ruleset()));
4408 } else {
4409 f->dump_string("crush_rule", stringify(p->get_crush_ruleset()));
4410 }
4411 break;
4412 case CRUSH_RULESET:
4413 f->dump_int("crush_ruleset", p->get_crush_ruleset());
4414 break;
4415 case HASHPSPOOL:
4416 case NODELETE:
4417 case NOPGCHANGE:
4418 case NOSIZECHANGE:
4419 case WRITE_FADVISE_DONTNEED:
4420 case NOSCRUB:
4421 case NODEEP_SCRUB:
4422 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4423 if (i->second == *it)
4424 break;
4425 }
4426 assert(i != ALL_CHOICES.end());
4427 f->dump_string(i->first.c_str(),
4428 p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4429 "true" : "false");
4430 break;
4431 case HIT_SET_PERIOD:
4432 f->dump_int("hit_set_period", p->hit_set_period);
4433 break;
4434 case HIT_SET_COUNT:
4435 f->dump_int("hit_set_count", p->hit_set_count);
4436 break;
4437 case HIT_SET_TYPE:
4438 f->dump_string("hit_set_type",
4439 HitSet::get_type_name(p->hit_set_params.get_type()));
4440 break;
4441 case HIT_SET_FPP:
4442 {
4443 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4444 BloomHitSet::Params *bloomp =
4445 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4446 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4447 } else if(var != "all") {
4448 f->close_section();
4449 ss << "hit set is not of type Bloom; " <<
4450 "invalid to get a false positive rate!";
4451 r = -EINVAL;
4452 goto reply;
4453 }
4454 }
4455 break;
4456 case USE_GMT_HITSET:
4457 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4458 break;
4459 case TARGET_MAX_OBJECTS:
4460 f->dump_unsigned("target_max_objects", p->target_max_objects);
4461 break;
4462 case TARGET_MAX_BYTES:
4463 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4464 break;
4465 case CACHE_TARGET_DIRTY_RATIO:
4466 f->dump_unsigned("cache_target_dirty_ratio_micro",
4467 p->cache_target_dirty_ratio_micro);
4468 f->dump_float("cache_target_dirty_ratio",
4469 ((float)p->cache_target_dirty_ratio_micro/1000000));
4470 break;
4471 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4472 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
4473 p->cache_target_dirty_high_ratio_micro);
4474 f->dump_float("cache_target_dirty_high_ratio",
4475 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
4476 break;
4477 case CACHE_TARGET_FULL_RATIO:
4478 f->dump_unsigned("cache_target_full_ratio_micro",
4479 p->cache_target_full_ratio_micro);
4480 f->dump_float("cache_target_full_ratio",
4481 ((float)p->cache_target_full_ratio_micro/1000000));
4482 break;
4483 case CACHE_MIN_FLUSH_AGE:
4484 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
4485 break;
4486 case CACHE_MIN_EVICT_AGE:
4487 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
4488 break;
4489 case ERASURE_CODE_PROFILE:
4490 f->dump_string("erasure_code_profile", p->erasure_code_profile);
4491 break;
4492 case MIN_READ_RECENCY_FOR_PROMOTE:
4493 f->dump_int("min_read_recency_for_promote",
4494 p->min_read_recency_for_promote);
4495 break;
4496 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4497 f->dump_int("min_write_recency_for_promote",
4498 p->min_write_recency_for_promote);
4499 break;
4500 case FAST_READ:
4501 f->dump_int("fast_read", p->fast_read);
4502 break;
4503 case HIT_SET_GRADE_DECAY_RATE:
4504 f->dump_int("hit_set_grade_decay_rate",
4505 p->hit_set_grade_decay_rate);
4506 break;
4507 case HIT_SET_SEARCH_LAST_N:
4508 f->dump_int("hit_set_search_last_n",
4509 p->hit_set_search_last_n);
4510 break;
4511 case SCRUB_MIN_INTERVAL:
4512 case SCRUB_MAX_INTERVAL:
4513 case DEEP_SCRUB_INTERVAL:
4514 case RECOVERY_PRIORITY:
4515 case RECOVERY_OP_PRIORITY:
4516 case SCRUB_PRIORITY:
4517 case COMPRESSION_MODE:
4518 case COMPRESSION_ALGORITHM:
4519 case COMPRESSION_REQUIRED_RATIO:
4520 case COMPRESSION_MAX_BLOB_SIZE:
4521 case COMPRESSION_MIN_BLOB_SIZE:
4522 case CSUM_TYPE:
4523 case CSUM_MAX_BLOCK:
4524 case CSUM_MIN_BLOCK:
4525 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4526 if (i->second == *it)
4527 break;
4528 }
4529 assert(i != ALL_CHOICES.end());
4530 if(*it == CSUM_TYPE) {
4531 int val;
4532 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
4533 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
4534 }
4535 else {
4536 p->opts.dump(i->first, f.get());
4537 }
4538 break;
4539 }
4540 f->close_section();
4541 f->flush(rdata);
4542 }
4543
4544 } else /* !f */ {
4545 for(choices_set_t::const_iterator it = selected_choices.begin();
4546 it != selected_choices.end(); ++it) {
4547 choices_map_t::const_iterator i;
4548 switch(*it) {
4549 case PG_NUM:
4550 ss << "pg_num: " << p->get_pg_num() << "\n";
4551 break;
4552 case PGP_NUM:
4553 ss << "pgp_num: " << p->get_pgp_num() << "\n";
4554 break;
4555 case AUID:
4556 ss << "auid: " << p->get_auid() << "\n";
4557 break;
4558 case SIZE:
4559 ss << "size: " << p->get_size() << "\n";
4560 break;
4561 case MIN_SIZE:
4562 ss << "min_size: " << p->get_min_size() << "\n";
4563 break;
4564 case CRASH_REPLAY_INTERVAL:
4565 ss << "crash_replay_interval: " <<
4566 p->get_crash_replay_interval() << "\n";
4567 break;
4568 case CRUSH_RULE:
4569 if (osdmap.crush->rule_exists(p->get_crush_ruleset())) {
4570 ss << "crush_rule: " << osdmap.crush->get_rule_name(
4571 p->get_crush_ruleset()) << "\n";
4572 } else {
4573 ss << "crush_rule: " << p->get_crush_ruleset() << "\n";
4574 }
4575 break;
4576 case CRUSH_RULESET:
4577 ss << "crush_ruleset: " << p->get_crush_ruleset() << "\n";
4578 break;
4579 case HIT_SET_PERIOD:
4580 ss << "hit_set_period: " << p->hit_set_period << "\n";
4581 break;
4582 case HIT_SET_COUNT:
4583 ss << "hit_set_count: " << p->hit_set_count << "\n";
4584 break;
4585 case HIT_SET_TYPE:
4586 ss << "hit_set_type: " <<
4587 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
4588 break;
4589 case HIT_SET_FPP:
4590 {
4591 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4592 BloomHitSet::Params *bloomp =
4593 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4594 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
4595 } else if(var != "all") {
4596 ss << "hit set is not of type Bloom; " <<
4597 "invalid to get a false positive rate!";
4598 r = -EINVAL;
4599 goto reply;
4600 }
4601 }
4602 break;
4603 case USE_GMT_HITSET:
4604 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
4605 break;
4606 case TARGET_MAX_OBJECTS:
4607 ss << "target_max_objects: " << p->target_max_objects << "\n";
4608 break;
4609 case TARGET_MAX_BYTES:
4610 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
4611 break;
4612 case CACHE_TARGET_DIRTY_RATIO:
4613 ss << "cache_target_dirty_ratio: "
4614 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
4615 break;
4616 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4617 ss << "cache_target_dirty_high_ratio: "
4618 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
4619 break;
4620 case CACHE_TARGET_FULL_RATIO:
4621 ss << "cache_target_full_ratio: "
4622 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
4623 break;
4624 case CACHE_MIN_FLUSH_AGE:
4625 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
4626 break;
4627 case CACHE_MIN_EVICT_AGE:
4628 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
4629 break;
4630 case ERASURE_CODE_PROFILE:
4631 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
4632 break;
4633 case MIN_READ_RECENCY_FOR_PROMOTE:
4634 ss << "min_read_recency_for_promote: " <<
4635 p->min_read_recency_for_promote << "\n";
4636 break;
4637 case HIT_SET_GRADE_DECAY_RATE:
4638 ss << "hit_set_grade_decay_rate: " <<
4639 p->hit_set_grade_decay_rate << "\n";
4640 break;
4641 case HIT_SET_SEARCH_LAST_N:
4642 ss << "hit_set_search_last_n: " <<
4643 p->hit_set_search_last_n << "\n";
4644 break;
4645 case HASHPSPOOL:
4646 case NODELETE:
4647 case NOPGCHANGE:
4648 case NOSIZECHANGE:
4649 case WRITE_FADVISE_DONTNEED:
4650 case NOSCRUB:
4651 case NODEEP_SCRUB:
4652 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4653 if (i->second == *it)
4654 break;
4655 }
4656 assert(i != ALL_CHOICES.end());
4657 ss << i->first << ": " <<
4658 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4659 "true" : "false") << "\n";
4660 break;
4661 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4662 ss << "min_write_recency_for_promote: " <<
4663 p->min_write_recency_for_promote << "\n";
4664 break;
4665 case FAST_READ:
4666 ss << "fast_read: " << p->fast_read << "\n";
4667 break;
4668 case SCRUB_MIN_INTERVAL:
4669 case SCRUB_MAX_INTERVAL:
4670 case DEEP_SCRUB_INTERVAL:
4671 case RECOVERY_PRIORITY:
4672 case RECOVERY_OP_PRIORITY:
4673 case SCRUB_PRIORITY:
4674 case COMPRESSION_MODE:
4675 case COMPRESSION_ALGORITHM:
4676 case COMPRESSION_REQUIRED_RATIO:
4677 case COMPRESSION_MAX_BLOB_SIZE:
4678 case COMPRESSION_MIN_BLOB_SIZE:
4679 case CSUM_TYPE:
4680 case CSUM_MAX_BLOCK:
4681 case CSUM_MIN_BLOCK:
4682 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4683 if (i->second == *it)
4684 break;
4685 }
4686 assert(i != ALL_CHOICES.end());
4687 {
4688 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4689 if (p->opts.is_set(key)) {
4690 if(key == pool_opts_t::CSUM_TYPE) {
4691 int val;
4692 p->opts.get(key, &val);
4693 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
4694 } else {
4695 ss << i->first << ": " << p->opts.get(key) << "\n";
4696 }
4697 }
4698 }
4699 break;
4700 }
4701 rdata.append(ss.str());
4702 ss.str("");
4703 }
4704 }
4705 r = 0;
4706 } else if (prefix == "osd pool stats") {
4707 const auto &pgm = mon->pgmon()->pg_map;
4708 r = process_pg_map_command(prefix, cmdmap, pgm, osdmap,
4709 f.get(), &ss, &rdata);
4710 } else if (prefix == "osd pool get-quota") {
4711 string pool_name;
4712 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
4713
4714 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
4715 if (poolid < 0) {
4716 assert(poolid == -ENOENT);
4717 ss << "unrecognized pool '" << pool_name << "'";
4718 r = -ENOENT;
4719 goto reply;
4720 }
4721 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
4722
4723 if (f) {
4724 f->open_object_section("pool_quotas");
4725 f->dump_string("pool_name", pool_name);
4726 f->dump_unsigned("pool_id", poolid);
4727 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
4728 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
4729 f->close_section();
4730 f->flush(rdata);
4731 } else {
4732 stringstream rs;
4733 rs << "quotas for pool '" << pool_name << "':\n"
4734 << " max objects: ";
4735 if (p->quota_max_objects == 0)
4736 rs << "N/A";
4737 else
4738 rs << si_t(p->quota_max_objects) << " objects";
4739 rs << "\n"
4740 << " max bytes : ";
4741 if (p->quota_max_bytes == 0)
4742 rs << "N/A";
4743 else
4744 rs << si_t(p->quota_max_bytes) << "B";
4745 rdata.append(rs.str());
4746 }
4747 rdata.append("\n");
4748 r = 0;
4749 } else if (prefix == "osd crush rule list" ||
4750 prefix == "osd crush rule ls") {
4751 string format;
4752 cmd_getval(g_ceph_context, cmdmap, "format", format);
4753 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4754 f->open_array_section("rules");
4755 osdmap.crush->list_rules(f.get());
4756 f->close_section();
4757 ostringstream rs;
4758 f->flush(rs);
4759 rs << "\n";
4760 rdata.append(rs.str());
4761 } else if (prefix == "osd crush rule dump") {
4762 string name;
4763 cmd_getval(g_ceph_context, cmdmap, "name", name);
4764 string format;
4765 cmd_getval(g_ceph_context, cmdmap, "format", format);
4766 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4767 if (name == "") {
4768 f->open_array_section("rules");
4769 osdmap.crush->dump_rules(f.get());
4770 f->close_section();
4771 } else {
4772 int ruleno = osdmap.crush->get_rule_id(name);
4773 if (ruleno < 0) {
4774 ss << "unknown crush ruleset '" << name << "'";
4775 r = ruleno;
4776 goto reply;
4777 }
4778 osdmap.crush->dump_rule(ruleno, f.get());
4779 }
4780 ostringstream rs;
4781 f->flush(rs);
4782 rs << "\n";
4783 rdata.append(rs.str());
4784 } else if (prefix == "osd crush dump") {
4785 string format;
4786 cmd_getval(g_ceph_context, cmdmap, "format", format);
4787 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4788 f->open_object_section("crush_map");
4789 osdmap.crush->dump(f.get());
4790 f->close_section();
4791 ostringstream rs;
4792 f->flush(rs);
4793 rs << "\n";
4794 rdata.append(rs.str());
4795 } else if (prefix == "osd crush show-tunables") {
4796 string format;
4797 cmd_getval(g_ceph_context, cmdmap, "format", format);
4798 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4799 f->open_object_section("crush_map_tunables");
4800 osdmap.crush->dump_tunables(f.get());
4801 f->close_section();
4802 ostringstream rs;
4803 f->flush(rs);
4804 rs << "\n";
4805 rdata.append(rs.str());
4806 } else if (prefix == "osd crush tree") {
4807 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4808 f->open_array_section("crush_map_roots");
4809 osdmap.crush->dump_tree(f.get());
4810 f->close_section();
4811 f->flush(rdata);
4812 } else if (prefix == "osd crush class ls") {
4813 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4814 f->open_array_section("crush_classes");
4815 for (auto i : osdmap.crush->class_name)
4816 f->dump_string("class", i.second);
4817 f->close_section();
4818 f->flush(rdata);
4819 } else if (prefix == "osd erasure-code-profile ls") {
4820 const auto &profiles = osdmap.get_erasure_code_profiles();
4821 if (f)
4822 f->open_array_section("erasure-code-profiles");
4823 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
4824 if (f)
4825 f->dump_string("profile", i->first.c_str());
4826 else
4827 rdata.append(i->first + "\n");
4828 }
4829 if (f) {
4830 f->close_section();
4831 ostringstream rs;
4832 f->flush(rs);
4833 rs << "\n";
4834 rdata.append(rs.str());
4835 }
4836 } else if (prefix == "osd erasure-code-profile get") {
4837 string name;
4838 cmd_getval(g_ceph_context, cmdmap, "name", name);
4839 if (!osdmap.has_erasure_code_profile(name)) {
4840 ss << "unknown erasure code profile '" << name << "'";
4841 r = -ENOENT;
4842 goto reply;
4843 }
4844 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
4845 if (f)
4846 f->open_object_section("profile");
4847 for (map<string,string>::const_iterator i = profile.begin();
4848 i != profile.end();
4849 ++i) {
4850 if (f)
4851 f->dump_string(i->first.c_str(), i->second.c_str());
4852 else
4853 rdata.append(i->first + "=" + i->second + "\n");
4854 }
4855 if (f) {
4856 f->close_section();
4857 ostringstream rs;
4858 f->flush(rs);
4859 rs << "\n";
4860 rdata.append(rs.str());
4861 }
4862 } else {
4863 // try prepare update
4864 return false;
4865 }
4866
4867 reply:
4868 string rs;
4869 getline(ss, rs);
4870 mon->reply_command(op, r, rs, rdata, get_last_committed());
4871 return true;
4872}
4873
4874void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
4875{
4876 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
4877 pending_inc.get_new_pool(pool_id, pool)->flags = flags;
4878}
4879
4880bool OSDMonitor::update_pools_status()
4881{
4882 if (!mon->pgmon()->is_readable())
4883 return false;
4884
4885 bool ret = false;
4886
4887 auto& pools = osdmap.get_pools();
4888 for (auto it = pools.begin(); it != pools.end(); ++it) {
4889 if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
4890 continue;
4891 pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
4892 object_stat_sum_t& sum = stats.stats.sum;
4893 const pg_pool_t &pool = it->second;
4894 const string& pool_name = osdmap.get_pool_name(it->first);
4895
4896 bool pool_is_full =
4897 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
4898 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
4899
4900 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
4901 if (pool_is_full)
4902 continue;
4903
4904 mon->clog->info() << "pool '" << pool_name
4905 << "' no longer full; removing FULL flag";
4906
4907 update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
4908 ret = true;
4909 } else {
4910 if (!pool_is_full)
4911 continue;
4912
4913 if (pool.quota_max_bytes > 0 &&
4914 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
4915 mon->clog->warn() << "pool '" << pool_name << "' is full"
4916 << " (reached quota's max_bytes: "
4917 << si_t(pool.quota_max_bytes) << ")";
4918 }
4919 if (pool.quota_max_objects > 0 &&
4920 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
4921 mon->clog->warn() << "pool '" << pool_name << "' is full"
4922 << " (reached quota's max_objects: "
4923 << pool.quota_max_objects << ")";
4924 }
4925 update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
4926 ret = true;
4927 }
4928 }
4929 return ret;
4930}
4931
// Append a health summary entry (and, optionally, a detail entry) for every
// pool that is at, or approaching, its byte/object quota.
//
// @param summary list receiving one (status, message) pair per issue
// @param detail  optional list receiving the same pairs; may be NULL
void OSDMonitor::get_pools_health(
    list<pair<health_status_t,string> >& summary,
    list<pair<health_status_t,string> > *detail) const
{
  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    // skip pools for which we have no stats yet
    if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
      continue;
    pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
    object_stat_sum_t& sum = stats.stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
      // uncomment these asserts if/when we update the FULL flag on pg_stat update
      //assert((pool.quota_max_objects > 0) || (pool.quota_max_bytes > 0));

      stringstream ss;
      ss << "pool '" << pool_name << "' is full";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // thresholds are configured as percentages; convert to ratios
    float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
    float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;

    if (pool.quota_max_objects > 0) {
      stringstream ss;
      health_status_t status = HEALTH_OK;
      if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
	// at quota: already reported via the FULL flag block above
	// uncomment these asserts if/when we update the FULL flag on pg_stat update
	//assert(pool.has_flag(pg_pool_t::FLAG_FULL));
      } else if (crit_threshold > 0 &&
		 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << sum.num_objects << " objects"
	   << " (max " << pool.quota_max_objects << ")";
	status = HEALTH_ERR;
      } else if (warn_threshold > 0 &&
		 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << sum.num_objects << " objects"
	   << " (max " << pool.quota_max_objects << ")";
	status = HEALTH_WARN;
      }
      if (status != HEALTH_OK) {
	pair<health_status_t,string> s(status, ss.str());
	summary.push_back(s);
	if (detail)
	  detail->push_back(s);
      }
    }

    if (pool.quota_max_bytes > 0) {
      health_status_t status = HEALTH_OK;
      stringstream ss;
      if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	// at quota: already reported via the FULL flag block above
	// uncomment these asserts if/when we update the FULL flag on pg_stat update
	//assert(pool.has_flag(pg_pool_t::FLAG_FULL));
      } else if (crit_threshold > 0 &&
		 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << si_t(sum.num_bytes) << " bytes"
	   << " (max " << si_t(pool.quota_max_bytes) << ")";
	status = HEALTH_ERR;
      } else if (warn_threshold > 0 &&
		 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << si_t(sum.num_bytes) << " bytes"
	   << " (max " << si_t(pool.quota_max_bytes) << ")";
	status = HEALTH_WARN;
      }
      if (status != HEALTH_OK) {
	pair<health_status_t,string> s(status, ss.str());
	summary.push_back(s);
	if (detail)
	  detail->push_back(s);
      }
    }
  }
}
5014
5015
5016int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5017{
5018 op->mark_osdmon_event(__func__);
5019 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5020 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5021 MonSession *session = m->get_session();
5022 if (!session)
5023 return -EPERM;
5024 string erasure_code_profile;
5025 stringstream ss;
5026 string ruleset_name;
5027 if (m->auid)
5028 return prepare_new_pool(m->name, m->auid, m->crush_rule, ruleset_name,
5029 0, 0,
5030 erasure_code_profile,
5031 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5032 else
5033 return prepare_new_pool(m->name, session->auid, m->crush_rule, ruleset_name,
5034 0, 0,
5035 erasure_code_profile,
5036 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5037}
5038
5039int OSDMonitor::crush_rename_bucket(const string& srcname,
5040 const string& dstname,
5041 ostream *ss)
5042{
5043 int ret;
5044 //
5045 // Avoid creating a pending crush if it does not already exists and
5046 // the rename would fail.
5047 //
5048 if (!_have_pending_crush()) {
5049 ret = _get_stable_crush().can_rename_bucket(srcname,
5050 dstname,
5051 ss);
5052 if (ret)
5053 return ret;
5054 }
5055
5056 CrushWrapper newcrush;
5057 _get_pending_crush(newcrush);
5058
5059 ret = newcrush.rename_bucket(srcname,
5060 dstname,
5061 ss);
5062 if (ret)
5063 return ret;
5064
5065 pending_inc.crush.clear();
5066 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5067 *ss << "renamed bucket " << srcname << " into " << dstname;
5068 return 0;
5069}
5070
5071void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5072{
5073 string replacement = "";
5074
5075 if (plugin == "jerasure_generic" ||
5076 plugin == "jerasure_sse3" ||
5077 plugin == "jerasure_sse4" ||
5078 plugin == "jerasure_neon") {
5079 replacement = "jerasure";
5080 } else if (plugin == "shec_generic" ||
5081 plugin == "shec_sse3" ||
5082 plugin == "shec_sse4" ||
5083 plugin == "shec_neon") {
5084 replacement = "shec";
5085 }
5086
5087 if (replacement != "") {
5088 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5089 << plugin << " that has been deprecated. Please use "
5090 << replacement << " instead." << dendl;
5091 }
5092}
5093
5094int OSDMonitor::normalize_profile(const string& profilename,
5095 ErasureCodeProfile &profile,
5096 bool force,
5097 ostream *ss)
5098{
5099 ErasureCodeInterfaceRef erasure_code;
5100 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5101 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5102 check_legacy_ec_plugin(plugin->second, profilename);
5103 int err = instance.factory(plugin->second,
5104 g_conf->get_val<std::string>("erasure_code_dir"),
5105 profile, &erasure_code, ss);
5106 if (err) {
5107 return err;
5108 }
5109
5110 err = erasure_code->init(profile, ss);
5111 if (err) {
5112 return err;
5113 }
5114
5115 auto it = profile.find("stripe_unit");
5116 if (it != profile.end()) {
5117 string err_str;
5118 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5119 if (!err_str.empty()) {
5120 *ss << "could not parse stripe_unit '" << it->second
5121 << "': " << err_str << std::endl;
5122 return -EINVAL;
5123 }
5124 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5125 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5126 if (chunk_size != stripe_unit) {
5127 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5128 << "alignment. Would be padded to " << chunk_size
5129 << std::endl;
5130 return -EINVAL;
5131 }
5132 if ((stripe_unit % 4096) != 0 && !force) {
5133 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5134 << "use --force to override this check" << std::endl;
5135 return -EINVAL;
5136 }
5137 }
5138 return 0;
5139}
5140
5141int OSDMonitor::crush_ruleset_create_erasure(const string &name,
5142 const string &profile,
5143 int *ruleset,
5144 ostream *ss)
5145{
5146 int ruleid = osdmap.crush->get_rule_id(name);
5147 if (ruleid != -ENOENT) {
5148 *ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
5149 return -EEXIST;
5150 }
5151
5152 CrushWrapper newcrush;
5153 _get_pending_crush(newcrush);
5154
5155 ruleid = newcrush.get_rule_id(name);
5156 if (ruleid != -ENOENT) {
5157 *ruleset = newcrush.get_rule_mask_ruleset(ruleid);
5158 return -EALREADY;
5159 } else {
5160 ErasureCodeInterfaceRef erasure_code;
5161 int err = get_erasure_code(profile, &erasure_code, ss);
5162 if (err) {
5163 *ss << "failed to load plugin using profile " << profile << std::endl;
5164 return err;
5165 }
5166
5167 err = erasure_code->create_ruleset(name, newcrush, ss);
5168 erasure_code.reset();
5169 if (err < 0)
5170 return err;
5171 *ruleset = err;
5172 pending_inc.crush.clear();
5173 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5174 return 0;
5175 }
5176}
5177
5178int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5179 ErasureCodeInterfaceRef *erasure_code,
5180 ostream *ss) const
5181{
5182 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5183 return -EAGAIN;
5184 ErasureCodeProfile profile =
5185 osdmap.get_erasure_code_profile(erasure_code_profile);
5186 ErasureCodeProfile::const_iterator plugin =
5187 profile.find("plugin");
5188 if (plugin == profile.end()) {
5189 *ss << "cannot determine the erasure code plugin"
5190 << " because there is no 'plugin' entry in the erasure_code_profile "
5191 << profile << std::endl;
5192 return -EINVAL;
5193 }
5194 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5195 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5196 return instance.factory(plugin->second,
5197 g_conf->get_val<std::string>("erasure_code_dir"),
5198 profile, erasure_code, ss);
5199}
5200
5201int OSDMonitor::check_cluster_features(uint64_t features,
5202 stringstream &ss)
5203{
5204 stringstream unsupported_ss;
5205 int unsupported_count = 0;
5206 if ((mon->get_quorum_con_features() & features) != features) {
5207 unsupported_ss << "the monitor cluster";
5208 ++unsupported_count;
5209 }
5210
5211 set<int32_t> up_osds;
5212 osdmap.get_up_osds(up_osds);
5213 for (set<int32_t>::iterator it = up_osds.begin();
5214 it != up_osds.end(); ++it) {
5215 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5216 if ((xi.features & features) != features) {
5217 if (unsupported_count > 0)
5218 unsupported_ss << ", ";
5219 unsupported_ss << "osd." << *it;
5220 unsupported_count ++;
5221 }
5222 }
5223
5224 if (unsupported_count > 0) {
5225 ss << "features " << features << " unsupported by: "
5226 << unsupported_ss.str();
5227 return -ENOTSUP;
5228 }
5229
5230 // check pending osd state, too!
5231 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5232 pending_inc.new_xinfo.begin();
5233 p != pending_inc.new_xinfo.end(); ++p) {
5234 const osd_xinfo_t &xi = p->second;
5235 if ((xi.features & features) != features) {
5236 dout(10) << __func__ << " pending osd." << p->first
5237 << " features are insufficient; retry" << dendl;
5238 return -EAGAIN;
5239 }
5240 }
5241
5242 return 0;
5243}
5244
// Check whether applying 'newcrush' would require features beyond what the
// configured minimum client version and the current mon/osd daemons support.
// Returns false (with an explanation in ss) if the change must be rejected.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
                                                 stringstream& ss)
{
  // build a throw-away map with the new crush applied so we can query the
  // feature bits it would require
  OSDMap::Incremental new_pending = pending_inc;
  ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client.length()) {
    auto mv = newmap.get_min_compat_client();
    if (mv.first > newmap.require_min_compat_client) {
      // NOTE(review): the comparison uses mv.first but the message streams
      // the whole 'mv' value -- presumably to show full version info; this
      // relies on an operator<< for mv's type being defined elsewhere.
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
5278
5279bool OSDMonitor::erasure_code_profile_in_use(
5280 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5281 const string &profile,
5282 ostream *ss)
5283{
5284 bool found = false;
5285 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5286 p != pools.end();
5287 ++p) {
5288 if (p->second.erasure_code_profile == profile) {
5289 *ss << osdmap.pool_name[p->first] << " ";
5290 found = true;
5291 }
5292 }
5293 if (found) {
5294 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5295 }
5296 return found;
5297}
5298
5299int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
5300 map<string,string> *erasure_code_profile_map,
5301 ostream *ss)
5302{
5303 int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
5304 *ss,
5305 erasure_code_profile_map);
5306 if (r)
5307 return r;
5308 assert((*erasure_code_profile_map).count("plugin"));
5309 string default_plugin = (*erasure_code_profile_map)["plugin"];
5310 map<string,string> user_map;
5311 for (vector<string>::const_iterator i = erasure_code_profile.begin();
5312 i != erasure_code_profile.end();
5313 ++i) {
5314 size_t equal = i->find('=');
5315 if (equal == string::npos) {
5316 user_map[*i] = string();
5317 (*erasure_code_profile_map)[*i] = string();
5318 } else {
5319 const string key = i->substr(0, equal);
5320 equal++;
5321 const string value = i->substr(equal);
5322 user_map[key] = value;
5323 (*erasure_code_profile_map)[key] = value;
5324 }
5325 }
5326
5327 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
5328 (*erasure_code_profile_map) = user_map;
5329
5330 return 0;
5331}
5332
5333int OSDMonitor::prepare_pool_size(const unsigned pool_type,
5334 const string &erasure_code_profile,
5335 unsigned *size, unsigned *min_size,
5336 ostream *ss)
5337{
5338 int err = 0;
5339 switch (pool_type) {
5340 case pg_pool_t::TYPE_REPLICATED:
5341 *size = g_conf->osd_pool_default_size;
5342 *min_size = g_conf->get_osd_pool_default_min_size();
5343 break;
5344 case pg_pool_t::TYPE_ERASURE:
5345 {
5346 ErasureCodeInterfaceRef erasure_code;
5347 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5348 if (err == 0) {
5349 *size = erasure_code->get_chunk_count();
5350 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
5351 }
5352 }
5353 break;
5354 default:
5355 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
5356 err = -EINVAL;
5357 break;
5358 }
5359 return err;
5360}
5361
5362int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
5363 const string &erasure_code_profile,
5364 uint32_t *stripe_width,
5365 ostream *ss)
5366{
5367 int err = 0;
5368 switch (pool_type) {
5369 case pg_pool_t::TYPE_REPLICATED:
5370 // ignored
5371 break;
5372 case pg_pool_t::TYPE_ERASURE:
5373 {
5374 ErasureCodeProfile profile =
5375 osdmap.get_erasure_code_profile(erasure_code_profile);
5376 ErasureCodeInterfaceRef erasure_code;
5377 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5378 if (err)
5379 break;
5380 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5381 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
5382 auto it = profile.find("stripe_unit");
5383 if (it != profile.end()) {
5384 string err_str;
5385 stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5386 assert(err_str.empty());
5387 }
5388 *stripe_width = data_chunks *
5389 erasure_code->get_chunk_size(stripe_unit * data_chunks);
5390 }
5391 break;
5392 default:
5393 *ss << "prepare_pool_stripe_width: "
5394 << pool_type << " is not a known pool type";
5395 err = -EINVAL;
5396 break;
5397 }
5398 return err;
5399}
5400
// Determine the crush ruleset for a new pool.
//
// If *crush_ruleset is already >= 0 it is only validated; otherwise it is
// resolved from ruleset_name (or the configured default), creating the EC
// ruleset from the erasure code profile if necessary.
//
// @return 0 on success; -EAGAIN when the ruleset is still pending proposal
//         and the caller must retry; negative errno on error.
int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
					   const string &erasure_code_profile,
					   const string &ruleset_name,
					   int *crush_ruleset,
					   ostream *ss)
{

  if (*crush_ruleset < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (ruleset_name == "") {
	  //Use default ruleset
	  *crush_ruleset = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
	  if (*crush_ruleset < 0) {
	    // Errors may happen e.g. if no valid ruleset is available
	    *ss << "No suitable CRUSH ruleset exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_ruleset(ruleset_name, crush_ruleset, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_ruleset_create_erasure(ruleset_name,
					       erasure_code_profile,
					       crush_ruleset, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_ruleset: ruleset "
		   << ruleset_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // already committed: usable right away
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_ruleset: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit ruleset id; just verify it exists
    if (!osdmap.crush->ruleset_exists(*crush_ruleset)) {
      *ss << "CRUSH ruleset " << *crush_ruleset << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
5462
5463int OSDMonitor::get_crush_ruleset(const string &ruleset_name,
5464 int *crush_ruleset,
5465 ostream *ss)
5466{
5467 int ret;
5468 ret = osdmap.crush->get_rule_id(ruleset_name);
5469 if (ret != -ENOENT) {
5470 // found it, use it
5471 *crush_ruleset = ret;
5472 } else {
5473 CrushWrapper newcrush;
5474 _get_pending_crush(newcrush);
5475
5476 ret = newcrush.get_rule_id(ruleset_name);
5477 if (ret != -ENOENT) {
5478 // found it, wait for it to be proposed
5479 dout(20) << __func__ << ": ruleset " << ruleset_name
5480 << " try again" << dendl;
5481 return -EAGAIN;
5482 } else {
5483 //Cannot find it , return error
5484 *ss << "specified ruleset " << ruleset_name << " doesn't exist";
5485 return ret;
5486 }
5487 }
5488 return 0;
5489}
5490
5491/**
5492 * @param name The name of the new pool
5493 * @param auid The auid of the pool owner. Can be -1
5494 * @param crush_ruleset The crush rule to use. If <0, will use the system default
 * @param crush_ruleset_name The crush rule to use, if crush_ruleset <0
5496 * @param pg_num The pg_num to use. If set to 0, will use the system default
5497 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
5498 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE or TYPE_REPLICATED
5500 * @param expected_num_objects expected number of objects on the pool
5501 * @param fast_read fast read type.
5502 * @param ss human readable error message, if any.
5503 *
5504 * @return 0 on success, negative errno on failure.
5505 */
int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
				 int crush_ruleset,
				 const string &crush_ruleset_name,
				 unsigned pg_num, unsigned pgp_num,
				 const string &erasure_code_profile,
                                 const unsigned pool_type,
                                 const uint64_t expected_num_objects,
                                 FastReadType fast_read,
				 ostream *ss)
{
  if (name.length() == 0)
    return -EINVAL;
  // fall back to configured defaults when pg counts are unspecified
  if (pg_num == 0)
    pg_num = g_conf->osd_pool_default_pg_num;
  if (pgp_num == 0)
    pgp_num = g_conf->osd_pool_default_pgp_num;
  if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf->mon_max_pool_pg_num
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read only makes sense for erasure-coded pools
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  int r;
  // resolve (or create, for EC pools) the crush ruleset the pool will use
  r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile,
				 crush_ruleset_name, &crush_ruleset, ss);
  if (r) {
    dout(10) << " prepare_pool_crush_ruleset returns " << r << dendl;
    return r;
  }
  // sanity-check the pending crush map before committing to it
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  ostringstream err;
  CrushTester tester(newcrush, err);
  // use the internal crush tester if crushtool config is empty
  if (g_conf->crushtool.empty()) {
    r = tester.test();
  } else {
    r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
				   osdmap.get_max_osd(),
				   g_conf->mon_lease,
				   crush_ruleset);
  }
  if (r) {
    dout(10) << " tester.test_with_crushtool returns " << r
	     << ": " << err.str() << dendl;
    *ss << "crushtool check failed with " << r << ": " << err.str();
    return r;
  }
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
  if (r) {
    dout(10) << " prepare_pool_size returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_ruleset, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // resolve the effective fast_read setting for EC pools
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf->mon_osd_pool_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // if a pool with this name is already pending creation, treat as success
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // allocate a fresh pool id and populate the pending pool entry
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf->osd_pool_default_flags;
  if (g_conf->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // GMT hitsets require every up osd to advertise the feature
  if (g_conf->osd_pool_use_gmt_hitset &&
      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_ruleset = crush_ruleset;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  pi->set_pg_num(pg_num);
  pi->set_pgp_num(pgp_num);
  pi->last_change = pending_inc.epoch;
  pi->auid = auid;
  pi->erasure_code_profile = erasure_code_profile;
  pi->stripe_width = stripe_width;
  // cache-tier defaults; *_micro fields store ratio * 1e6
  pi->cache_target_dirty_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
5650
5651bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
5652{
5653 op->mark_osdmon_event(__func__);
5654 ostringstream ss;
5655 if (pending_inc.new_flags < 0)
5656 pending_inc.new_flags = osdmap.get_flags();
5657 pending_inc.new_flags |= flag;
5658 ss << OSDMap::get_flag_string(flag) << " is set";
5659 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5660 get_last_committed() + 1));
5661 return true;
5662}
5663
5664bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
5665{
5666 op->mark_osdmon_event(__func__);
5667 ostringstream ss;
5668 if (pending_inc.new_flags < 0)
5669 pending_inc.new_flags = osdmap.get_flags();
5670 pending_inc.new_flags &= ~flag;
5671 ss << OSDMap::get_flag_string(flag) << " is unset";
5672 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5673 get_last_committed() + 1));
5674 return true;
5675}
5676
5677int OSDMonitor::parse_osd_id(const char *s, stringstream *pss)
5678{
5679 // osd.NNN?
5680 if (strncmp(s, "osd.", 4) == 0) {
5681 s += 4;
5682 }
5683
5684 // NNN?
5685 ostringstream ss;
5686 long id = parse_pos_long(s, &ss);
5687 if (id < 0) {
5688 *pss << ss.str();
5689 return id;
5690 }
5691 if (id > 0xffff) {
5692 *pss << "osd id " << id << " is too large";
5693 return -ERANGE;
5694 }
5695 return id;
5696}
5697
5698
5699int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
5700 stringstream& ss)
5701{
5702 string poolstr;
5703 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
5704 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5705 if (pool < 0) {
5706 ss << "unrecognized pool '" << poolstr << "'";
5707 return -ENOENT;
5708 }
5709 string var;
5710 cmd_getval(g_ceph_context, cmdmap, "var", var);
5711
5712 pg_pool_t p = *osdmap.get_pg_pool(pool);
5713 if (pending_inc.new_pools.count(pool))
5714 p = pending_inc.new_pools[pool];
5715
5716 // accept val as a json string in the normal case (current
5717 // generation monitor). parse out int or float values from the
5718 // string as needed. however, if it is not a string, try to pull
5719 // out an int, in case an older monitor with an older json schema is
5720 // forwarding a request.
5721 string val;
5722 string interr, floaterr;
5723 int64_t n = 0;
5724 double f = 0;
5725 int64_t uf = 0; // micro-f
5726 if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
5727 // wasn't a string; maybe an older mon forwarded json with an int?
5728 if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
5729 return -EINVAL; // no value!
5730 } else {
5731 // we got a string. see if it contains an int.
5732 n = strict_strtoll(val.c_str(), 10, &interr);
5733 // or a float
5734 f = strict_strtod(val.c_str(), &floaterr);
5735 uf = llrintl(f * (double)1000000.0);
5736 }
5737
5738 if (!p.is_tier() &&
5739 (var == "hit_set_type" || var == "hit_set_period" ||
5740 var == "hit_set_count" || var == "hit_set_fpp" ||
5741 var == "target_max_objects" || var == "target_max_bytes" ||
5742 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
5743 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
5744 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
5745 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
5746 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
5747 return -EACCES;
5748 }
5749
5750 if (var == "size") {
5751 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
5752 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
5753 return -EPERM;
5754 }
5755 if (p.type == pg_pool_t::TYPE_ERASURE) {
5756 ss << "can not change the size of an erasure-coded pool";
5757 return -ENOTSUP;
5758 }
5759 if (interr.length()) {
5760 ss << "error parsing integer value '" << val << "': " << interr;
5761 return -EINVAL;
5762 }
5763 if (n <= 0 || n > 10) {
5764 ss << "pool size must be between 1 and 10";
5765 return -EINVAL;
5766 }
5767 p.size = n;
5768 if (n < p.min_size)
5769 p.min_size = n;
5770 } else if (var == "min_size") {
5771 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
5772 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
5773 return -EPERM;
5774 }
5775 if (interr.length()) {
5776 ss << "error parsing integer value '" << val << "': " << interr;
5777 return -EINVAL;
5778 }
5779
5780 if (p.type != pg_pool_t::TYPE_ERASURE) {
5781 if (n < 1 || n > p.size) {
5782 ss << "pool min_size must be between 1 and " << (int)p.size;
5783 return -EINVAL;
5784 }
5785 } else {
5786 ErasureCodeInterfaceRef erasure_code;
5787 int k;
5788 stringstream tmp;
5789 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
5790 if (err == 0) {
5791 k = erasure_code->get_data_chunk_count();
5792 } else {
5793 ss << __func__ << " get_erasure_code failed: " << tmp.rdbuf();
5794 return err;
5795 }
5796
5797 if (n < k || n > p.size) {
5798 ss << "pool min_size must be between " << k << " and " << (int)p.size;
5799 return -EINVAL;
5800 }
5801 }
5802 p.min_size = n;
5803 } else if (var == "auid") {
5804 if (interr.length()) {
5805 ss << "error parsing integer value '" << val << "': " << interr;
5806 return -EINVAL;
5807 }
5808 p.auid = n;
5809 } else if (var == "crash_replay_interval") {
5810 if (interr.length()) {
5811 ss << "error parsing integer value '" << val << "': " << interr;
5812 return -EINVAL;
5813 }
5814 p.crash_replay_interval = n;
5815 } else if (var == "pg_num") {
5816 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
5817 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
5818 return -EPERM;
5819 }
5820 if (interr.length()) {
5821 ss << "error parsing integer value '" << val << "': " << interr;
5822 return -EINVAL;
5823 }
5824 if (n <= (int)p.get_pg_num()) {
5825 ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
5826 if (n < (int)p.get_pg_num())
5827 return -EEXIST;
5828 return 0;
5829 }
5830 string force;
5831 cmd_getval(g_ceph_context,cmdmap, "force", force);
5832 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
5833 force != "--yes-i-really-mean-it") {
5834 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
5835 return -EPERM;
5836 }
5837 int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
5838 int64_t new_pgs = n - p.get_pg_num();
5839 if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
5840 ss << "specified pg_num " << n << " is too large (creating "
5841 << new_pgs << " new PGs on ~" << expected_osds
5842 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
5843 << ')';
5844 return -E2BIG;
5845 }
5846 p.set_pg_num(n);
5847 // force pre-luminous clients to resend their ops, since they
5848 // don't understand that split PGs now form a new interval.
5849 p.last_force_op_resend_preluminous = pending_inc.epoch;
5850 } else if (var == "pgp_num") {
5851 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
5852 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
5853 return -EPERM;
5854 }
5855 if (interr.length()) {
5856 ss << "error parsing integer value '" << val << "': " << interr;
5857 return -EINVAL;
5858 }
5859 if (n <= 0) {
5860 ss << "specified pgp_num must > 0, but you set to " << n;
5861 return -EINVAL;
5862 }
5863 if (n > (int)p.get_pg_num()) {
5864 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
5865 return -EINVAL;
5866 }
5867 p.set_pgp_num(n);
5868 } else if (var == "crush_rule") {
5869 int id = osdmap.crush->get_rule_id(val);
5870 if (id == -ENOENT) {
5871 ss << "crush rule " << val << " does not exist";
5872 return -ENOENT;
5873 }
5874 if (id < 0) {
5875 ss << cpp_strerror(id);
5876 return -ENOENT;
5877 }
5878 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
5879 return -EINVAL;
5880 }
5881 p.crush_ruleset = id;
5882 } else if (var == "crush_ruleset") {
5883 if (interr.length()) {
5884 ss << "error parsing integer value '" << val << "': " << interr;
5885 return -EINVAL;
5886 }
5887 if (!osdmap.crush->ruleset_exists(n)) {
5888 ss << "crush ruleset " << n << " does not exist";
5889 return -ENOENT;
5890 }
5891
5892 if (!osdmap.crush->check_crush_rule(n, p.get_type(), p.get_size(), ss)) {
5893 return -EINVAL;
5894 }
5895 p.crush_ruleset = n;
5896 } else if (var == "nodelete" || var == "nopgchange" ||
5897 var == "nosizechange" || var == "write_fadvise_dontneed" ||
5898 var == "noscrub" || var == "nodeep-scrub") {
5899 uint64_t flag = pg_pool_t::get_flag_by_name(var);
5900 // make sure we only compare against 'n' if we didn't receive a string
5901 if (val == "true" || (interr.empty() && n == 1)) {
5902 p.set_flag(flag);
5903 } else if (val == "false" || (interr.empty() && n == 0)) {
5904 p.unset_flag(flag);
5905 } else {
5906 ss << "expecting value 'true', 'false', '0', or '1'";
5907 return -EINVAL;
5908 }
5909 } else if (var == "hashpspool") {
5910 uint64_t flag = pg_pool_t::get_flag_by_name(var);
5911 string force;
5912 cmd_getval(g_ceph_context, cmdmap, "force", force);
5913 if (force != "--yes-i-really-mean-it") {
5914 ss << "are you SURE? this will remap all placement groups in this pool,"
5915 " this triggers large data movement,"
5916 " pass --yes-i-really-mean-it if you really do.";
5917 return -EPERM;
5918 }
5919 // make sure we only compare against 'n' if we didn't receive a string
5920 if (val == "true" || (interr.empty() && n == 1)) {
5921 p.set_flag(flag);
5922 } else if (val == "false" || (interr.empty() && n == 0)) {
5923 p.unset_flag(flag);
5924 } else {
5925 ss << "expecting value 'true', 'false', '0', or '1'";
5926 return -EINVAL;
5927 }
5928 } else if (var == "hit_set_type") {
5929 if (val == "none")
5930 p.hit_set_params = HitSet::Params();
5931 else {
5932 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
5933 if (err)
5934 return err;
5935 if (val == "bloom") {
5936 BloomHitSet::Params *bsp = new BloomHitSet::Params;
5937 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
5938 p.hit_set_params = HitSet::Params(bsp);
5939 } else if (val == "explicit_hash")
5940 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
5941 else if (val == "explicit_object")
5942 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
5943 else {
5944 ss << "unrecognized hit_set type '" << val << "'";
5945 return -EINVAL;
5946 }
5947 }
5948 } else if (var == "hit_set_period") {
5949 if (interr.length()) {
5950 ss << "error parsing integer value '" << val << "': " << interr;
5951 return -EINVAL;
5952 }
5953 p.hit_set_period = n;
5954 } else if (var == "hit_set_count") {
5955 if (interr.length()) {
5956 ss << "error parsing integer value '" << val << "': " << interr;
5957 return -EINVAL;
5958 }
5959 p.hit_set_count = n;
5960 } else if (var == "hit_set_fpp") {
5961 if (floaterr.length()) {
5962 ss << "error parsing floating point value '" << val << "': " << floaterr;
5963 return -EINVAL;
5964 }
5965 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
5966 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
5967 return -EINVAL;
5968 }
5969 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
5970 bloomp->set_fpp(f);
5971 } else if (var == "use_gmt_hitset") {
5972 if (val == "true" || (interr.empty() && n == 1)) {
5973 if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
5974 ss << "not all OSDs support GMT hit set.";
5975 return -EINVAL;
5976 }
5977 p.use_gmt_hitset = true;
5978 } else {
5979 ss << "expecting value 'true' or '1'";
5980 return -EINVAL;
5981 }
5982 } else if (var == "allow_ec_overwrites") {
5983 if (!p.is_erasure()) {
5984 ss << "ec overwrites can only be enabled for an erasure coded pool";
5985 return -EINVAL;
5986 }
5987 if (val == "true" || (interr.empty() && n == 1)) {
5988 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
5989 } else if (val == "false" || (interr.empty() && n == 0)) {
5990 ss << "ec overwrites cannot be disabled once enabled";
5991 return -EINVAL;
5992 } else {
5993 ss << "expecting value 'true', 'false', '0', or '1'";
5994 return -EINVAL;
5995 }
5996 stringstream err;
5997 if (!is_pool_currently_all_bluestore(pool, p, &err)) {
5998 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
5999 return -EINVAL;
6000 }
6001 } else if (var == "target_max_objects") {
6002 if (interr.length()) {
6003 ss << "error parsing int '" << val << "': " << interr;
6004 return -EINVAL;
6005 }
6006 p.target_max_objects = n;
6007 } else if (var == "target_max_bytes") {
6008 if (interr.length()) {
6009 ss << "error parsing int '" << val << "': " << interr;
6010 return -EINVAL;
6011 }
6012 p.target_max_bytes = n;
6013 } else if (var == "cache_target_dirty_ratio") {
6014 if (floaterr.length()) {
6015 ss << "error parsing float '" << val << "': " << floaterr;
6016 return -EINVAL;
6017 }
6018 if (f < 0 || f > 1.0) {
6019 ss << "value must be in the range 0..1";
6020 return -ERANGE;
6021 }
6022 p.cache_target_dirty_ratio_micro = uf;
6023 } else if (var == "cache_target_dirty_high_ratio") {
6024 if (floaterr.length()) {
6025 ss << "error parsing float '" << val << "': " << floaterr;
6026 return -EINVAL;
6027 }
6028 if (f < 0 || f > 1.0) {
6029 ss << "value must be in the range 0..1";
6030 return -ERANGE;
6031 }
6032 p.cache_target_dirty_high_ratio_micro = uf;
6033 } else if (var == "cache_target_full_ratio") {
6034 if (floaterr.length()) {
6035 ss << "error parsing float '" << val << "': " << floaterr;
6036 return -EINVAL;
6037 }
6038 if (f < 0 || f > 1.0) {
6039 ss << "value must be in the range 0..1";
6040 return -ERANGE;
6041 }
6042 p.cache_target_full_ratio_micro = uf;
6043 } else if (var == "cache_min_flush_age") {
6044 if (interr.length()) {
6045 ss << "error parsing int '" << val << "': " << interr;
6046 return -EINVAL;
6047 }
6048 p.cache_min_flush_age = n;
6049 } else if (var == "cache_min_evict_age") {
6050 if (interr.length()) {
6051 ss << "error parsing int '" << val << "': " << interr;
6052 return -EINVAL;
6053 }
6054 p.cache_min_evict_age = n;
6055 } else if (var == "min_read_recency_for_promote") {
6056 if (interr.length()) {
6057 ss << "error parsing integer value '" << val << "': " << interr;
6058 return -EINVAL;
6059 }
6060 p.min_read_recency_for_promote = n;
6061 } else if (var == "hit_set_grade_decay_rate") {
6062 if (interr.length()) {
6063 ss << "error parsing integer value '" << val << "': " << interr;
6064 return -EINVAL;
6065 }
6066 if (n > 100 || n < 0) {
6067 ss << "value out of range,valid range is 0 - 100";
6068 return -EINVAL;
6069 }
6070 p.hit_set_grade_decay_rate = n;
6071 } else if (var == "hit_set_search_last_n") {
6072 if (interr.length()) {
6073 ss << "error parsing integer value '" << val << "': " << interr;
6074 return -EINVAL;
6075 }
6076 if (n > p.hit_set_count || n < 0) {
6077 ss << "value out of range,valid range is 0 - hit_set_count";
6078 return -EINVAL;
6079 }
6080 p.hit_set_search_last_n = n;
6081 } else if (var == "min_write_recency_for_promote") {
6082 if (interr.length()) {
6083 ss << "error parsing integer value '" << val << "': " << interr;
6084 return -EINVAL;
6085 }
6086 p.min_write_recency_for_promote = n;
6087 } else if (var == "fast_read") {
6088 if (p.is_replicated()) {
6089 ss << "fast read is not supported in replication pool";
6090 return -EINVAL;
6091 }
6092 if (val == "true" || (interr.empty() && n == 1)) {
6093 p.fast_read = true;
6094 } else if (val == "false" || (interr.empty() && n == 0)) {
6095 p.fast_read = false;
6096 } else {
6097 ss << "expecting value 'true', 'false', '0', or '1'";
6098 return -EINVAL;
6099 }
6100 } else if (pool_opts_t::is_opt_name(var)) {
6101 if (var == "compression_mode") {
6102 auto cmode = Compressor::get_comp_mode_type(val);
6103 if (!cmode) {
6104 ss << "unrecognized compression mode '" << val << "'";
6105 return EINVAL;
6106 }
6107 } else if (var == "compression_algorithm") {
6108 auto alg = Compressor::get_comp_alg_type(val);
6109 if (!alg) {
6110 ss << "unrecognized compression_algorithm '" << val << "'";
6111 return EINVAL;
6112 }
6113 } else if (var == "compression_required_ratio") {
6114 if (floaterr.length()) {
6115 ss << "error parsing float value '" << val << "': " << floaterr;
6116 return -EINVAL;
6117 }
6118 if (f < 0 || f>1) {
6119 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
6120 return EINVAL;
6121 }
6122 } else if (var == "csum_type") {
6123 auto t = val != "unset" ? Checksummer::get_csum_string_type(val) : 0;
6124 if (t < 0 ) {
6125 ss << "unrecognized csum_type '" << val << "'";
6126 return EINVAL;
6127 }
6128 //preserve csum_type numeric value
6129 n = t;
6130 interr.clear();
6131 } else if (var == "compression_max_blob_size" ||
6132 var == "compression_min_blob_size" ||
6133 var == "csum_max_block" ||
6134 var == "csum_min_block") {
6135 if (interr.length()) {
6136 ss << "error parsing int value '" << val << "': " << interr;
6137 return -EINVAL;
6138 }
6139 }
6140
6141 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
6142 switch (desc.type) {
6143 case pool_opts_t::STR:
6144 if (val.empty()) {
6145 p.opts.unset(desc.key);
6146 } else {
6147 p.opts.set(desc.key, static_cast<std::string>(val));
6148 }
6149 break;
6150 case pool_opts_t::INT:
6151 if (interr.length()) {
6152 ss << "error parsing integer value '" << val << "': " << interr;
6153 return -EINVAL;
6154 }
6155 if (n == 0) {
6156 p.opts.unset(desc.key);
6157 } else {
6158 p.opts.set(desc.key, static_cast<int>(n));
6159 }
6160 break;
6161 case pool_opts_t::DOUBLE:
6162 if (floaterr.length()) {
6163 ss << "error parsing floating point value '" << val << "': " << floaterr;
6164 return -EINVAL;
6165 }
6166 if (f == 0) {
6167 p.opts.unset(desc.key);
6168 } else {
6169 p.opts.set(desc.key, static_cast<double>(f));
6170 }
6171 break;
6172 default:
6173 assert(!"unknown type");
6174 }
6175 } else {
6176 ss << "unrecognized variable '" << var << "'";
6177 return -EINVAL;
6178 }
6179 ss << "set pool " << pool << " " << var << " to " << val;
6180 p.last_change = pending_inc.epoch;
6181 pending_inc.new_pools[pool] = p;
6182 return 0;
6183}
6184
6185bool OSDMonitor::prepare_command(MonOpRequestRef op)
6186{
6187 op->mark_osdmon_event(__func__);
6188 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
6189 stringstream ss;
6190 map<string, cmd_vartype> cmdmap;
6191 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
6192 string rs = ss.str();
6193 mon->reply_command(op, -EINVAL, rs, get_last_committed());
6194 return true;
6195 }
6196
6197 MonSession *session = m->get_session();
6198 if (!session) {
6199 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
6200 return true;
6201 }
6202
6203 return prepare_command_impl(op, cmdmap);
6204}
6205
6206static int parse_reweights(CephContext *cct,
6207 const map<string,cmd_vartype> &cmdmap,
6208 const OSDMap& osdmap,
6209 map<int32_t, uint32_t>* weights)
6210{
6211 string weights_str;
6212 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
6213 return -EINVAL;
6214 }
6215 std::replace(begin(weights_str), end(weights_str), '\'', '"');
6216 json_spirit::mValue json_value;
6217 if (!json_spirit::read(weights_str, json_value)) {
6218 return -EINVAL;
6219 }
6220 if (json_value.type() != json_spirit::obj_type) {
6221 return -EINVAL;
6222 }
6223 const auto obj = json_value.get_obj();
6224 try {
6225 for (auto& osd_weight : obj) {
6226 auto osd_id = std::stoi(osd_weight.first);
6227 if (!osdmap.exists(osd_id)) {
6228 return -ENOENT;
6229 }
6230 if (osd_weight.second.type() != json_spirit::str_type) {
6231 return -EINVAL;
6232 }
6233 auto weight = std::stoul(osd_weight.second.get_str());
6234 weights->insert({osd_id, weight});
6235 }
6236 } catch (const std::logic_error& e) {
6237 return -EINVAL;
6238 }
6239 return 0;
6240}
6241
6242bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
6243 map<string,cmd_vartype> &cmdmap)
6244{
6245 op->mark_osdmon_event(__func__);
6246 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
6247 bool ret = false;
6248 stringstream ss;
6249 string rs;
6250 bufferlist rdata;
6251 int err = 0;
6252
6253 string format;
6254 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
6255 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6256
6257 string prefix;
6258 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
6259
6260 int64_t osdid;
6261 string name;
6262 bool osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
6263 if (osdid_present) {
6264 ostringstream oss;
6265 oss << "osd." << osdid;
6266 name = oss.str();
6267 }
6268
6269 // Even if there's a pending state with changes that could affect
6270 // a command, considering that said state isn't yet committed, we
6271 // just don't care about those changes if the command currently being
6272 // handled acts as a no-op against the current committed state.
6273 // In a nutshell, we assume this command happens *before*.
6274 //
6275 // Let me make this clearer:
6276 //
6277 // - If we have only one client, and that client issues some
6278 // operation that would conflict with this operation but is
6279 // still on the pending state, then we would be sure that said
6280 // operation wouldn't have returned yet, so the client wouldn't
6281 // issue this operation (unless the client didn't wait for the
6282 // operation to finish, and that would be the client's own fault).
6283 //
6284 // - If we have more than one client, each client will observe
6285 // whatever is the state at the moment of the commit. So, if we
6286 // have two clients, one issuing an unlink and another issuing a
6287 // link, and if the link happens while the unlink is still on the
6288 // pending state, from the link's point-of-view this is a no-op.
6289 // If different clients are issuing conflicting operations and
6290 // they care about that, then the clients should make sure they
6291 // enforce some kind of concurrency mechanism -- from our
6292 // perspective that's what Douglas Adams would call an SEP.
6293 //
6294 // This should be used as a general guideline for most commands handled
6295 // in this function. Adapt as you see fit, but please bear in mind that
6296 // this is the expected behavior.
6297
6298
6299 if (prefix == "osd setcrushmap" ||
6300 (prefix == "osd crush set" && !osdid_present)) {
6301 dout(10) << "prepare_command setting new crush map" << dendl;
6302 bufferlist data(m->get_data());
6303 CrushWrapper crush;
6304 try {
6305 bufferlist::iterator bl(data.begin());
6306 crush.decode(bl);
6307 }
6308 catch (const std::exception &e) {
6309 err = -EINVAL;
6310 ss << "Failed to parse crushmap: " << e.what();
6311 goto reply;
6312 }
6313
6314 if (!validate_crush_against_features(&crush, ss)) {
6315 err = -EINVAL;
6316 goto reply;
6317 }
6318
6319 const auto& osdmap_pools = osdmap.get_pools();
6320 for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
6321 const int64_t pool_id = pit->first;
6322 const pg_pool_t &pool = pit->second;
6323 int ruleno = pool.get_crush_ruleset();
6324 if (!crush.rule_exists(ruleno)) {
6325 ss << " the crush rule no "<< ruleno << " for pool id " << pool_id << " is in use";
6326 err = -EINVAL;
6327 goto reply;
6328 }
6329 }
6330
6331 // sanity check: test some inputs to make sure this map isn't totally broken
6332 dout(10) << " testing map" << dendl;
6333 stringstream ess;
6334 CrushTester tester(crush, ess);
6335 // XXX: Use mon_lease as a timeout value for crushtool.
6336 // If the crushtool consistently takes longer than 'mon_lease' seconds,
6337 // then we would consistently trigger an election before the command
6338 // finishes, having a flapping monitor unable to hold quorum.
6339 int r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
6340 osdmap.get_max_osd(),
6341 g_conf->mon_lease);
6342 if (r < 0) {
6343 derr << "error on crush map: " << ess.str() << dendl;
6344 ss << "Failed crushmap test: " << ess.str();
6345 err = r;
6346 goto reply;
6347 }
6348
6349 dout(10) << " result " << ess.str() << dendl;
6350
6351 pending_inc.crush = data;
6352 ss << "set crush map";
6353 goto update;
6354
6355 } else if (prefix == "osd crush set-device-class") {
6356 if (!osdmap.exists(osdid)) {
6357 err = -ENOENT;
6358 ss << name << " does not exist. create it before updating the crush map";
6359 goto reply;
6360 }
6361
6362 string device_class;
6363 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
6364 err = -EINVAL; // no value!
6365 goto reply;
6366 }
6367
6368 CrushWrapper newcrush;
6369 _get_pending_crush(newcrush);
6370
6371 string action;
6372 if (newcrush.item_exists(osdid)) {
6373 action = "updating";
6374 } else {
6375 action = "creating";
6376 newcrush.set_item_name(osdid, name);
6377 }
6378
6379 dout(5) << action << " crush item id " << osdid << " name '"
6380 << name << "' device_class " << device_class << dendl;
6381 err = newcrush.update_device_class(g_ceph_context, osdid, device_class, name);
6382
6383 if (err < 0)
6384 goto reply;
6385
6386 if (err == 0 && !_have_pending_crush()) {
6387 ss << "set-device-class item id " << osdid << " name '" << name << "' device_class "
6388 << device_class << " : no change";
6389 goto reply;
6390 }
6391
6392 pending_inc.crush.clear();
6393 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6394 ss << "set-device-class item id " << osdid << " name '" << name << "' device_class "
6395 << device_class;
6396 getline(ss, rs);
6397 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
6398 get_last_committed() + 1));
6399 return true;
6400
6401 } else if (prefix == "osd crush add-bucket") {
6402 // os crush add-bucket <name> <type>
6403 string name, typestr;
6404 cmd_getval(g_ceph_context, cmdmap, "name", name);
6405 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
6406
6407 if (!_have_pending_crush() &&
6408 _get_stable_crush().name_exists(name)) {
6409 ss << "bucket '" << name << "' already exists";
6410 goto reply;
6411 }
6412
6413 CrushWrapper newcrush;
6414 _get_pending_crush(newcrush);
6415
6416 if (newcrush.name_exists(name)) {
6417 ss << "bucket '" << name << "' already exists";
6418 goto update;
6419 }
6420 int type = newcrush.get_type_id(typestr);
6421 if (type < 0) {
6422 ss << "type '" << typestr << "' does not exist";
6423 err = -EINVAL;
6424 goto reply;
6425 }
6426 if (type == 0) {
6427 ss << "type '" << typestr << "' is for devices, not buckets";
6428 err = -EINVAL;
6429 goto reply;
6430 }
6431 int bucketno;
6432 err = newcrush.add_bucket(0, 0,
6433 CRUSH_HASH_DEFAULT, type, 0, NULL,
6434 NULL, &bucketno);
6435 if (err < 0) {
6436 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
6437 goto reply;
6438 }
6439 err = newcrush.set_item_name(bucketno, name);
6440 if (err < 0) {
6441 ss << "error setting bucket name to '" << name << "'";
6442 goto reply;
6443 }
6444
6445 pending_inc.crush.clear();
6446 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6447 ss << "added bucket " << name << " type " << typestr
6448 << " to crush map";
6449 goto update;
6450 } else if (prefix == "osd crush rename-bucket") {
6451 string srcname, dstname;
6452 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
6453 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
6454
6455 err = crush_rename_bucket(srcname, dstname, &ss);
6456 if (err == -EALREADY) // equivalent to success for idempotency
6457 err = 0;
6458 if (err)
6459 goto reply;
6460 else
6461 goto update;
6462 } else if (prefix == "osd crush class create") {
6463 string device_class;
6464 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
6465 err = -EINVAL; // no value!
6466 goto reply;
6467 }
6468
6469 if (!_have_pending_crush() &&
6470 _get_stable_crush().class_exists(device_class)) {
6471 ss << "class '" << device_class << "' already exists";
6472 goto reply;
6473 }
6474
6475 CrushWrapper newcrush;
6476 _get_pending_crush(newcrush);
6477
6478 if (newcrush.class_exists(name)) {
6479 ss << "class '" << device_class << "' already exists";
6480 goto update;
6481 }
6482
6483 int class_id = newcrush.get_or_create_class_id(device_class);
6484
6485 pending_inc.crush.clear();
6486 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6487 ss << "created class " << device_class << " with id " << class_id
6488 << " to crush map";
6489 goto update;
6490
6491 } else if (prefix == "osd crush class rm") {
6492 string device_class;
6493 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
6494 err = -EINVAL; // no value!
6495 goto reply;
6496 }
6497
6498 CrushWrapper newcrush;
6499 _get_pending_crush(newcrush);
6500
6501 if (!newcrush.class_exists(device_class)) {
6502 err = -ENOENT;
6503 ss << "class '" << device_class << "' does not exist";
6504 goto reply;
6505 }
6506
6507 int class_id = newcrush.get_class_id(device_class);
6508
6509 if (newcrush.class_is_in_use(class_id)) {
6510 err = -EBUSY;
6511 ss << "class '" << device_class << "' is in use";
6512 goto reply;
6513 }
6514
6515 err = newcrush.remove_class_name(device_class);
6516 if (err < 0) {
6517 ss << "class '" << device_class << "' cannot be removed '"
6518 << cpp_strerror(err) << "'";
6519 goto reply;
6520 }
6521
6522 pending_inc.crush.clear();
6523 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6524 ss << "removed class " << device_class << " with id " << class_id
6525 << " from crush map";
6526 goto update;
6527
6528 } else if (osdid_present &&
6529 (prefix == "osd crush set" || prefix == "osd crush add")) {
6530 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
6531 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
6532 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
6533
6534 if (!osdmap.exists(osdid)) {
6535 err = -ENOENT;
6536 ss << name << " does not exist. create it before updating the crush map";
6537 goto reply;
6538 }
6539
6540 double weight;
6541 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
6542 ss << "unable to parse weight value '"
6543 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
6544 err = -EINVAL;
6545 goto reply;
6546 }
6547
6548 string args;
6549 vector<string> argvec;
6550 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
6551 map<string,string> loc;
6552 CrushWrapper::parse_loc_map(argvec, &loc);
6553
6554 if (prefix == "osd crush set"
6555 && !_get_stable_crush().item_exists(osdid)) {
6556 err = -ENOENT;
6557 ss << "unable to set item id " << osdid << " name '" << name
6558 << "' weight " << weight << " at location " << loc
6559 << ": does not exist";
6560 goto reply;
6561 }
6562
6563 dout(5) << "adding/updating crush item id " << osdid << " name '"
6564 << name << "' weight " << weight << " at location "
6565 << loc << dendl;
6566 CrushWrapper newcrush;
6567 _get_pending_crush(newcrush);
6568
6569 string action;
6570 if (prefix == "osd crush set" ||
6571 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
6572 action = "set";
6573 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
6574 } else {
6575 action = "add";
6576 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
6577 if (err == 0)
6578 err = 1;
6579 }
6580
6581 if (err < 0)
6582 goto reply;
6583
6584 if (err == 0 && !_have_pending_crush()) {
6585 ss << action << " item id " << osdid << " name '" << name << "' weight "
6586 << weight << " at location " << loc << ": no change";
6587 goto reply;
6588 }
6589
6590 pending_inc.crush.clear();
6591 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6592 ss << action << " item id " << osdid << " name '" << name << "' weight "
6593 << weight << " at location " << loc << " to crush map";
6594 getline(ss, rs);
6595 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
6596 get_last_committed() + 1));
6597 return true;
6598
6599 } else if (prefix == "osd crush create-or-move") {
6600 do {
6601 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
6602 if (!osdmap.exists(osdid)) {
6603 err = -ENOENT;
6604 ss << name << " does not exist. create it before updating the crush map";
6605 goto reply;
6606 }
6607
6608 double weight;
6609 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
6610 ss << "unable to parse weight value '"
6611 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
6612 err = -EINVAL;
6613 goto reply;
6614 }
6615
6616 string args;
6617 vector<string> argvec;
6618 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
6619 map<string,string> loc;
6620 CrushWrapper::parse_loc_map(argvec, &loc);
6621
6622 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
6623 << " at location " << loc << dendl;
6624
6625 CrushWrapper newcrush;
6626 _get_pending_crush(newcrush);
6627
6628 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
6629 if (err == 0) {
6630 ss << "create-or-move updated item name '" << name << "' weight " << weight
6631 << " at location " << loc << " to crush map";
6632 break;
6633 }
6634 if (err > 0) {
6635 pending_inc.crush.clear();
6636 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6637 ss << "create-or-move updating item name '" << name << "' weight " << weight
6638 << " at location " << loc << " to crush map";
6639 getline(ss, rs);
6640 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
6641 get_last_committed() + 1));
6642 return true;
6643 }
6644 } while (false);
6645
6646 } else if (prefix == "osd crush move") {
6647 do {
6648 // osd crush move <name> <loc1> [<loc2> ...]
6649
6650 string args;
6651 vector<string> argvec;
6652 cmd_getval(g_ceph_context, cmdmap, "name", name);
6653 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
6654 map<string,string> loc;
6655 CrushWrapper::parse_loc_map(argvec, &loc);
6656
6657 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
6658 CrushWrapper newcrush;
6659 _get_pending_crush(newcrush);
6660
6661 if (!newcrush.name_exists(name)) {
6662 err = -ENOENT;
6663 ss << "item " << name << " does not exist";
6664 break;
6665 }
6666 int id = newcrush.get_item_id(name);
6667
6668 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
6669 if (id >= 0) {
6670 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
6671 } else {
6672 err = newcrush.move_bucket(g_ceph_context, id, loc);
6673 }
6674 if (err >= 0) {
6675 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
6676 pending_inc.crush.clear();
6677 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6678 getline(ss, rs);
6679 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
6680 get_last_committed() + 1));
6681 return true;
6682 }
6683 } else {
6684 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
6685 err = 0;
6686 }
6687 } while (false);
6688
6689 } else if (prefix == "osd crush link") {
6690 // osd crush link <name> <loc1> [<loc2> ...]
6691 string name;
6692 cmd_getval(g_ceph_context, cmdmap, "name", name);
6693 vector<string> argvec;
6694 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
6695 map<string,string> loc;
6696 CrushWrapper::parse_loc_map(argvec, &loc);
6697
6698 // Need an explicit check for name_exists because get_item_id returns
6699 // 0 on unfound.
6700 int id = osdmap.crush->get_item_id(name);
6701 if (!osdmap.crush->name_exists(name)) {
6702 err = -ENOENT;
6703 ss << "item " << name << " does not exist";
6704 goto reply;
6705 } else {
6706 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
6707 }
6708 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
6709 ss << "no need to move item id " << id << " name '" << name
6710 << "' to location " << loc << " in crush map";
6711 err = 0;
6712 goto reply;
6713 }
6714
6715 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
6716 CrushWrapper newcrush;
6717 _get_pending_crush(newcrush);
6718
6719 if (!newcrush.name_exists(name)) {
6720 err = -ENOENT;
6721 ss << "item " << name << " does not exist";
6722 goto reply;
6723 } else {
6724 int id = newcrush.get_item_id(name);
6725 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
6726 err = newcrush.link_bucket(g_ceph_context, id, loc);
6727 if (err >= 0) {
6728 ss << "linked item id " << id << " name '" << name
6729 << "' to location " << loc << " in crush map";
6730 pending_inc.crush.clear();
6731 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6732 } else {
6733 ss << "cannot link item id " << id << " name '" << name
6734 << "' to location " << loc;
6735 goto reply;
6736 }
6737 } else {
6738 ss << "no need to move item id " << id << " name '" << name
6739 << "' to location " << loc << " in crush map";
6740 err = 0;
6741 }
6742 }
6743 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
6744 get_last_committed() + 1));
6745 return true;
6746 } else if (prefix == "osd crush rm" ||
6747 prefix == "osd crush remove" ||
6748 prefix == "osd crush unlink") {
6749 do {
6750 // osd crush rm <id> [ancestor]
6751 CrushWrapper newcrush;
6752 _get_pending_crush(newcrush);
6753
6754 string name;
6755 cmd_getval(g_ceph_context, cmdmap, "name", name);
6756
6757 if (!osdmap.crush->name_exists(name)) {
6758 err = 0;
6759 ss << "device '" << name << "' does not appear in the crush map";
6760 break;
6761 }
6762 if (!newcrush.name_exists(name)) {
6763 err = 0;
6764 ss << "device '" << name << "' does not appear in the crush map";
6765 getline(ss, rs);
6766 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
6767 get_last_committed() + 1));
6768 return true;
6769 }
6770 int id = newcrush.get_item_id(name);
6771 bool unlink_only = prefix == "osd crush unlink";
6772 string ancestor_str;
6773 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
6774 if (!newcrush.name_exists(ancestor_str)) {
6775 err = -ENOENT;
6776 ss << "ancestor item '" << ancestor_str
6777 << "' does not appear in the crush map";
6778 break;
6779 }
6780 int ancestor = newcrush.get_item_id(ancestor_str);
6781 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
6782 unlink_only);
6783 } else {
6784 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
6785 }
6786 if (err == -ENOENT) {
6787 ss << "item " << id << " does not appear in that position";
6788 err = 0;
6789 break;
6790 }
6791 if (err == 0) {
6792 pending_inc.crush.clear();
6793 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6794 ss << "removed item id " << id << " name '" << name << "' from crush map";
6795 getline(ss, rs);
6796 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
6797 get_last_committed() + 1));
6798 return true;
6799 }
6800 } while (false);
6801
  } else if (prefix == "osd crush reweight-all") {
    // osd crush reweight-all
    // Recalculate every bucket weight in the crush hierarchy from its
    // children and stage the resulting map in the pending increment.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    newcrush.reweight(g_ceph_context);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted crush hierarchy";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight") {
    // osd crush reweight <name> <weight>
    // Adjust the crush weight of a single leaf (device) item.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(g_ceph_context, cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    if (id < 0) {
      // negative ids are buckets, not devices; use reweight-subtree for those
      ss << "device '" << name << "' is not a leaf in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
	 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted item id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight-subtree") {
    // osd crush reweight-subtree <name> <weight>
    // Adjust the crush weight of every leaf under the named bucket.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(g_ceph_context, cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    if (id >= 0) {
      // non-negative ids are devices; a subtree must be a bucket (id < 0)
      ss << "device '" << name << "' is not a subtree in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
	 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush tunables") {
    // osd crush tunables <profile>
    // Switch the crush tunables to a named release profile.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string profile;
    cmd_getval(g_ceph_context, cmdmap, "profile", profile);
    if (profile == "legacy" || profile == "argonaut") {
      newcrush.set_tunables_legacy();
    } else if (profile == "bobtail") {
      newcrush.set_tunables_bobtail();
    } else if (profile == "firefly") {
      newcrush.set_tunables_firefly();
    } else if (profile == "hammer") {
      newcrush.set_tunables_hammer();
    } else if (profile == "jewel") {
      newcrush.set_tunables_jewel();
    } else if (profile == "optimal") {
      newcrush.set_tunables_optimal();
    } else if (profile == "default") {
      newcrush.set_tunables_default();
    } else {
      ss << "unrecognized profile '" << profile << "'";
      err = -EINVAL;
      goto reply;
    }

    // refuse tunables that connected clients/daemons could not decode
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunables profile to " << profile;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush set-tunable") {
    // osd crush set-tunable <tunable> <value>
    // Set an individual crush tunable (currently only straw_calc_version).
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string tunable;
    cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);

    int64_t value = -1;
    if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
      err = -EINVAL;
      ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
      goto reply;
    }

    if (tunable == "straw_calc_version") {
      if (value < 0 || value > 1) {
	ss << "value must be 0 or 1; got " << value;
	err = -EINVAL;
	goto reply;
      }
      newcrush.set_straw_calc_version(value);
    } else {
      ss << "unrecognized tunable '" << tunable << "'";
      err = -EINVAL;
      goto reply;
    }

    // refuse tunables that connected clients/daemons could not decode
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunable " << tunable << " to " << value;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
6970
  } else if (prefix == "osd crush rule create-simple") {
    // osd crush rule create-simple <name> <root> <type> [<mode>]
    // Create a replicated crush rule; mode defaults to "firstn".
    string name, root, type, mode;
    cmd_getval(g_ceph_context, cmdmap, "name", name);
    cmd_getval(g_ceph_context, cmdmap, "root", root);
    cmd_getval(g_ceph_context, cmdmap, "type", type);
    cmd_getval(g_ceph_context, cmdmap, "mode", mode);
    if (mode == "")
      mode = "firstn";

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the ruleset it
      // contains.  From the user point of view, the ruleset is more
      // meaningful.  Existing rule: idempotent success.
      ss << "ruleset " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // Already staged in the pending crush map; fall through to the
      // proposal below so the caller is answered once it commits.
      ss << "ruleset " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_ruleset(name, root, type, mode,
					       pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
	err = ruleno;
	goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
7011
  } else if (prefix == "osd erasure-code-profile rm") {
    // osd erasure-code-profile rm <name>
    string name;
    cmd_getval(g_ceph_context, cmdmap, "name", name);

    // a pending pool creation may still reference the profile; retry later
    if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
      goto wait;

    if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
      err = -EBUSY;
      goto reply;
    }

    if (osdmap.has_erasure_code_profile(name) ||
	pending_inc.new_erasure_code_profiles.count(name)) {
      if (osdmap.has_erasure_code_profile(name)) {
	pending_inc.old_erasure_code_profiles.push_back(name);
      } else {
	// profile exists only in the pending increment: cancel its creation
	dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
	pending_inc.new_erasure_code_profiles.erase(name);
      }

      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
							get_last_committed() + 1));
      return true;
    } else {
      // removing a non-existent profile is an idempotent success
      ss << "erasure-code-profile " << name << " does not exist";
      err = 0;
      goto reply;
    }
7042
  } else if (prefix == "osd erasure-code-profile set") {
    // osd erasure-code-profile set <name> [<key=value> ...] [--force]
    // Create or (with --force) overwrite an erasure-code profile.
    string name;
    cmd_getval(g_ceph_context, cmdmap, "name", name);
    vector<string> profile;
    cmd_getval(g_ceph_context, cmdmap, "profile", profile);
    bool force;
    if (profile.size() > 0 && profile.back() == "--force") {
      profile.pop_back();
      force = true;
    } else {
      force = false;
    }
    map<string,string> profile_map;
    err = parse_erasure_code_profile(profile, &profile_map, &ss);
    if (err)
      goto reply;
    if (profile_map.find("plugin") == profile_map.end()) {
      ss << "erasure-code-profile " << profile_map
	 << " must contain a plugin entry" << std::endl;
      err = -EINVAL;
      goto reply;
    }
    string plugin = profile_map["plugin"];

    if (pending_inc.has_erasure_code_profile(name)) {
      // a change to this profile is already staged; retry after it commits
      dout(20) << "erasure code profile " << name << " try again" << dendl;
      goto wait;
    } else {
      // isa/lrc and shec plugins need newer cluster-wide feature bits
      if (plugin == "isa" || plugin == "lrc") {
	err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
	if (err == -EAGAIN)
	  goto wait;
	if (err)
	  goto reply;
      } else if (plugin == "shec") {
	err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
	if (err == -EAGAIN)
	  goto wait;
	if (err)
	  goto reply;
      }
      err = normalize_profile(name, profile_map, force, &ss);
      if (err)
	goto reply;

      if (osdmap.has_erasure_code_profile(name)) {
	// normalize the existing profile too so the comparison below is fair
	ErasureCodeProfile existing_profile_map =
	  osdmap.get_erasure_code_profile(name);
	err = normalize_profile(name, existing_profile_map, force, &ss);
	if (err)
	  goto reply;

	// setting an identical profile is an idempotent success
	if (existing_profile_map == profile_map) {
	  err = 0;
	  goto reply;
	}
	if (!force) {
	  err = -EPERM;
	  ss << "will not override erasure code profile " << name
	     << " because the existing profile "
	     << existing_profile_map
	     << " is different from the proposed profile "
	     << profile_map;
	  goto reply;
	}
      }

      dout(20) << "erasure code profile set " << name << "="
	       << profile_map << dendl;
      pending_inc.set_erasure_code_profile(name, profile_map);
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
7119
  } else if (prefix == "osd crush rule create-erasure") {
    // osd crush rule create-erasure <name> [<profile>]
    // Create a crush rule suitable for the given erasure-code profile
    // (defaulting to, and lazily creating, the "default" profile).
    err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err)
      goto reply;
    string name, poolstr;
    cmd_getval(g_ceph_context, cmdmap, "name", name);
    string profile;
    cmd_getval(g_ceph_context, cmdmap, "profile", profile);
    if (profile == "")
      profile = "default";
    if (profile == "default") {
      if (!osdmap.has_erasure_code_profile(profile)) {
	if (pending_inc.has_erasure_code_profile(profile)) {
	  dout(20) << "erasure code profile " << profile << " already pending" << dendl;
	  goto wait;
	}

	// stage the default profile and retry this command once it commits
	map<string,string> profile_map;
	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
						      profile_map,
						      &ss);
	if (err)
	  goto reply;
	err = normalize_profile(name, profile_map, true, &ss);
	if (err)
	  goto reply;
	dout(20) << "erasure code profile set " << profile << "="
		 << profile_map << dendl;
	pending_inc.set_erasure_code_profile(profile, profile_map);
	goto wait;
      }
    }

    int ruleset;
    err = crush_ruleset_create_erasure(name, profile, &ruleset, &ss);
    if (err < 0) {
      switch(err) {
      case -EEXIST: // return immediately
	ss << "rule " << name << " already exists";
	err = 0;
	goto reply;
	break;
      case -EALREADY: // wait for pending to be proposed
	ss << "rule " << name << " already exists";
	err = 0;
	break;
      default: // non recoverable error
 	goto reply;
	break;
      }
    } else {
      ss << "created ruleset " << name << " at " << ruleset;
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
7180
  } else if (prefix == "osd crush rule rm") {
    // osd crush rule rm <name>
    string name;
    cmd_getval(g_ceph_context, cmdmap, "name", name);

    if (!osdmap.crush->rule_exists(name)) {
      // already gone: idempotent success
      ss << "rule " << name << " does not exist";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.rule_exists(name)) {
      // removal already staged; answer once the proposal commits
      ss << "rule " << name << " does not exist";
      err = 0;
    } else {
      int ruleno = newcrush.get_rule_id(name);
      assert(ruleno >= 0);

      // make sure it is not in use.
      // FIXME: this is ok in some situations, but let's not bother with that
      // complexity now.
      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
      if (osdmap.crush_ruleset_in_use(ruleset)) {
	ss << "crush ruleset " << name << " " << ruleset << " is in use";
	err = -EBUSY;
	goto reply;
      }

      err = newcrush.remove_rule(ruleno);
      if (err < 0) {
	goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
7223
  } else if (prefix == "osd setmaxosd") {
    // osd setmaxosd <newmax>
    int64_t newmax;
    if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
      ss << "unable to parse 'newmax' value '"
	 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
      err = -EINVAL;
      goto reply;
    }

    if (newmax > g_conf->mon_max_osd) {
      err = -ERANGE;
      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
	 << g_conf->mon_max_osd << ")";
      goto reply;
    }

    // Don't allow shrinking OSD number as this will cause data loss
    // and may cause kernel crashes.
    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
    if (newmax < osdmap.get_max_osd()) {
      // Check if the OSDs exist between current max and new value.
      // If there are any OSDs exist, then don't allow shrinking number
      // of OSDs.
      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
	if (osdmap.exists(i)) {
	  err = -EBUSY;
	  ss << "cannot shrink max_osd to " << newmax
	     << " because osd." << i << " (and possibly others) still in use";
	  goto reply;
	}
      }
    }

    pending_inc.new_max_osd = newmax;
    ss << "set new max_osd = " << pending_inc.new_max_osd;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
7263
7264 } else if (prefix == "osd set-full-ratio" ||
7265 prefix == "osd set-backfillfull-ratio" ||
7266 prefix == "osd set-nearfull-ratio") {
7267 if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
7268 ss << "you must complete the upgrade and set require_luminous_osds before"
7269 << " using the new interface";
7270 err = -EPERM;
7271 goto reply;
7272 }
7273 double n;
7274 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
7275 ss << "unable to parse 'ratio' value '"
7276 << cmd_vartype_stringify(cmdmap["who"]) << "'";
7277 err = -EINVAL;
7278 goto reply;
7279 }
7280 if (prefix == "osd set-full-ratio")
7281 pending_inc.new_full_ratio = n;
7282 else if (prefix == "osd set-backfillfull-ratio")
7283 pending_inc.new_backfillfull_ratio = n;
7284 else if (prefix == "osd set-nearfull-ratio")
7285 pending_inc.new_nearfull_ratio = n;
7286 ss << prefix << " " << n;
7287 getline(ss, rs);
7288 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7289 get_last_committed() + 1));
7290 return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // osd set-require-min-compat-client <version>
    // Pin the oldest client release the cluster will accept.
    if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
      ss << "you must complete the upgrade and set require_luminous_osds before"
	 << " using the new interface";
      err = -EPERM;
      goto reply;
    }
    string v;
    cmd_getval(g_ceph_context, cmdmap, "version", v);
    if (v != "luminous" && v != "kraken" && v != "jewel" && v != "infernalis" &&
	v != "hammer" && v != "giant" && v != "firefly" && v != "emperor" &&
	v != "dumpling" && v != "cuttlefish" && v != "bobtail" && v != "argonaut") {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // build the would-be map (osdmap + pending changes) and check that the
    // features it already uses do not require a newer client than 'v'
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = v;
    auto mv = newmap.get_min_compat_client();
    // ceph release names are alphabetical, so a plain string compare
    // orders them chronologically
    if (v < mv.first) {
      ss << "osdmap current utilizes features that require " << mv
	 << "; cannot set require_min_compat_client below that to " << v;
      err = -EPERM;
      goto reply;
    }
    ss << "set require_min_compat_client to " << v;
    pending_inc.new_require_min_compat_client = v;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    // pause both reads and writes cluster-wide
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
7329
  } else if (prefix == "osd set") {
    // osd set <flag>
    // Set a cluster-wide osdmap flag; the require_* flags additionally
    // verify that every up OSD advertises the matching feature bit.
    string key;
    cmd_getval(g_ceph_context, cmdmap, "key", key);
    if (key == "full")
      return prepare_set_flag(op, CEPH_OSDMAP_FULL);
    else if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "sortbitwise") {
      if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
	return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
      } else {
	ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
	err = -EPERM;
      }
    } else if (key == "require_jewel_osds") {
      if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
	ss << "the sortbitwise flag must be set before require_jewel_osds";
	err = -EPERM;
      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)) {
	return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
      } else {
	ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
	err = -EPERM;
      }
    } else if (key == "require_kraken_osds") {
      if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
	ss << "the sortbitwise flag must be set before require_kraken_osds";
	err = -EPERM;
      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)) {
	bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
	// ensure JEWEL is also set
	pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
	return r;
      } else {
	ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
	err = -EPERM;
      }
    } else if (key == "require_luminous_osds") {
      if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
	ss << "the sortbitwise flag must be set before require_luminous_osds";
	err = -EPERM;
      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
	bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_LUMINOUS);
	// ensure JEWEL and KRAKEN are also set
	pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
	pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_KRAKEN;
	return r;
      } else {
	ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
	err = -EPERM;
      }
    } else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }
7405
  } else if (prefix == "osd unset") {
    // osd unset <flag>
    // Clear a cluster-wide osdmap flag.  sortbitwise (and the require_*
    // flags, which are not listed here at all) cannot be cleared.
    string key;
    cmd_getval(g_ceph_context, cmdmap, "key", key);
    if (key == "full")
      return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
    else if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "sortbitwise") {
      ss << "the sortbitwise flag is required and cannot be unset";
      err = -EPERM;
    } else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }
7440
  } else if (prefix == "osd cluster_snap") {
    // ** DISABLE THIS FOR NOW **
    ss << "cluster snapshot currently disabled (broken implementation)";
    // ** DISABLE THIS FOR NOW **
7445
  } else if (prefix == "osd down" ||
	     prefix == "osd out" ||
	     prefix == "osd in" ||
	     prefix == "osd rm") {
    // osd {down|out|in|rm} <id> [<id> ...]
    // Apply the state change to each listed osd; a proposal is made only
    // if at least one osd actually changed ('any').

    bool any = false;

    vector<string> idvec;
    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
    for (unsigned j = 0; j < idvec.size(); j++) {
      long osd = parse_osd_id(idvec[j].c_str(), &ss);
      if (osd < 0) {
	// NOTE(review): this prints the negative parse result with no
	// separating space; echoing idvec[j] would read better
	ss << "invalid osd id" << osd;
	err = -EINVAL;
	continue;
      } else if (!osdmap.exists(osd)) {
	ss << "osd." << osd << " does not exist. ";
	continue;
      }
      if (prefix == "osd down") {
	if (osdmap.is_down(osd)) {
	  ss << "osd." << osd << " is already down. ";
	} else {
	  // new_state is applied as an xor of state bits, so writing
	  // CEPH_OSD_UP here toggles the up bit off (cf. 'osd rm' below,
	  // which writes the full current state to clear every bit)
	  pending_inc.new_state[osd] = CEPH_OSD_UP;
	  ss << "marked down osd." << osd << ". ";
	  any = true;
	}
      } else if (prefix == "osd out") {
	if (osdmap.is_out(osd)) {
	  ss << "osd." << osd << " is already out. ";
	} else {
	  pending_inc.new_weight[osd] = CEPH_OSD_OUT;
	  if (osdmap.osd_weight[osd]) {
	    // remember the old weight so a later 'osd in' can restore it
	    if (pending_inc.new_xinfo.count(osd) == 0) {
	      pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
	    }
	    pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
	  }
	  ss << "marked out osd." << osd << ". ";
	  any = true;
	}
      } else if (prefix == "osd in") {
	if (osdmap.is_in(osd)) {
	  ss << "osd." << osd << " is already in. ";
	} else {
	  if (osdmap.osd_xinfo[osd].old_weight > 0) {
	    // restore the weight saved when the osd was marked out
	    pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
	    if (pending_inc.new_xinfo.count(osd) == 0) {
	      pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
	    }
	    pending_inc.new_xinfo[osd].old_weight = 0;
	  } else {
	    pending_inc.new_weight[osd] = CEPH_OSD_IN;
	  }
	  ss << "marked in osd." << osd << ". ";
	  any = true;
	}
      } else if (prefix == "osd rm") {
	if (osdmap.is_up(osd)) {
	  if (any)
	    ss << ", ";
	  ss << "osd." << osd << " is still up; must be down before removal. ";
	  err = -EBUSY;
	} else {
	  // xor-ing the full current state clears every state bit
	  pending_inc.new_state[osd] = osdmap.get_state(osd);
	  pending_inc.new_uuid[osd] = uuid_d();
	  pending_metadata_rm.insert(osd);
	  if (any) {
	    ss << ", osd." << osd;
	  } else {
	    ss << "removed osd." << osd;
	  }
	  any = true;
	}
      }
    }
    if (any) {
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
							get_last_committed() + 1));
      return true;
    }
  } else if (prefix == "osd pg-temp") {
    // osd pg-temp <pgid> [<id> ...]
    // Install an explicit temporary acting set for one pg.
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
	 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // an update for this pg is already staged; retry after it commits
    if (pending_inc.new_pg_temp.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
      ss << "unable to parse 'id' value(s) '"
	 << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
	ss << "osd." << osd << " does not exist";
	err = -ENOENT;
	goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // osd primary-temp <pgid> <id>
    // Install a temporary primary for one pg (-1 clears the mapping).
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
	 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
	 << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 is allowed: it removes any existing primary-temp mapping
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // primary-temp needs firefly-or-newer clients and an explicit opt-in
    if (osdmap.require_min_compat_client.length() &&
	osdmap.require_min_compat_client < "firefly") {
      ss << "require_min_compat_client " << osdmap.require_min_compat_client
	 << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    } else if (!g_conf->mon_osd_allow_primary_temp) {
      ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings.  note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "osd pg-upmap") {
    // osd pg-upmap <pgid> <id> [<id> ...]
    // Install an explicit raw-mapping override for one pg; requires a
    // fully luminous cluster and luminous-or-newer clients.
    if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
      ss << "you must set the require_luminous_osds flag to use this feature";
      err = -EPERM;
      goto reply;
    }
    if (osdmap.require_min_compat_client < "luminous") {
      ss << "min_compat_client " << osdmap.require_min_compat_client
	 << " < luminous, which is required for pg-upmap";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
	 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // an upmap change for this pg is already staged; retry after commit
    if (pending_inc.new_pg_upmap.count(pgid) ||
	pending_inc.old_pg_upmap.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    vector<int64_t> id_vec;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
      ss << "unable to parse 'id' value(s) '"
	 << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    vector<int32_t> new_pg_upmap;
    for (auto osd : id_vec) {
      // CRUSH_ITEM_NONE is allowed as a "hole" in the mapping
      if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
	ss << "osd." << osd << " does not exist";
	err = -ENOENT;
	goto reply;
      }
      new_pg_upmap.push_back(osd);
    }

    pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
      new_pg_upmap.begin(), new_pg_upmap.end());
    ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
    goto update;
  } else if (prefix == "osd rm-pg-upmap") {
    // osd rm-pg-upmap <pgid>
    // Remove the explicit pg_upmap override for one pg.
    if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
      ss << "you must set the require_luminous_osds flag to use this feature";
      err = -EPERM;
      goto reply;
    }
    if (osdmap.require_min_compat_client < "luminous") {
      ss << "require_min_compat_client " << osdmap.require_min_compat_client
	 << " < luminous, which is required for pg-upmap";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
	 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // an upmap change for this pg is already staged; retry after commit
    if (pending_inc.new_pg_upmap.count(pgid) ||
	pending_inc.old_pg_upmap.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    pending_inc.old_pg_upmap.insert(pgid);
    ss << "clear " << pgid << " pg_upmap mapping";
    goto update;
  } else if (prefix == "osd pg-upmap-items") {
    // osd pg-upmap-items <pgid> <from> <to> [<from> <to> ...]
    // Install pairwise osd remappings (from -> to) for one pg.
    if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
      ss << "you must set the require_luminous_osds flag to use this feature";
      err = -EPERM;
      goto reply;
    }
    if (osdmap.require_min_compat_client < "luminous") {
      ss << "require_min_compat_client " << osdmap.require_min_compat_client
	 << " < luminous, which is required for pg-upmap";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
	 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // an upmap-items change for this pg is already staged; retry after commit
    if (pending_inc.new_pg_upmap_items.count(pgid) ||
	pending_inc.old_pg_upmap_items.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    vector<int64_t> id_vec;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
      ss << "unable to parse 'id' value(s) '"
	 << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    if (id_vec.size() % 2) {
      ss << "you must specify pairs of osd ids to be remapped";
      err = -EINVAL;
      goto reply;
    }
    vector<pair<int32_t,int32_t>> new_pg_upmap_items;
    // consumes two ids per iteration: *p++ inside the body plus ++p in the
    // loop header; safe because the even-size check above guarantees pairs
    for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
      int from = *p++;
      int to = *p;
      if (!osdmap.exists(from)) {
	ss << "osd." << from << " does not exist";
	err = -ENOENT;
	goto reply;
      }
      // CRUSH_ITEM_NONE is an allowed remap target
      if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
	ss << "osd." << to << " does not exist";
	err = -ENOENT;
	goto reply;
      }
      new_pg_upmap_items.push_back(make_pair(from, to));
    }

    pending_inc.new_pg_upmap_items[pgid] =
      mempool::osdmap::vector<pair<int32_t,int32_t>>(
      new_pg_upmap_items.begin(), new_pg_upmap_items.end());
    ss << "set " << pgid << " pg_upmap_items mapping to " << new_pg_upmap_items;
    goto update;
  } else if (prefix == "osd rm-pg-upmap-items") {
    // Handler for "osd rm-pg-upmap-items <pgid>": remove any explicit
    // pg_upmap_items mapping for the PG.  Same luminous gating as the
    // pg-upmap-items setter above.
    if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
      ss << "you must set the require_luminous_osds flag to use this feature";
      err = -EPERM;
      goto reply;
    }
    if (osdmap.require_min_compat_client < "luminous") {
      ss << "require_min_compat_client " << osdmap.require_min_compat_client
	 << " < luminous, which is required for pg-upmap";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // Don't race with an in-flight upmap change for this PG; retry after
    // the pending proposal commits.
    if (pending_inc.new_pg_upmap_items.count(pgid) ||
	pending_inc.old_pg_upmap_items.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    pending_inc.old_pg_upmap_items.insert(pgid);
    ss << "clear " << pgid << " pg_upmap_items mapping";
    goto update;
7849 } else if (prefix == "osd primary-affinity") {
7850 int64_t id;
7851 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
7852 ss << "invalid osd id value '"
7853 << cmd_vartype_stringify(cmdmap["id"]) << "'";
7854 err = -EINVAL;
7855 goto reply;
7856 }
7857 double w;
7858 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
7859 ss << "unable to parse 'weight' value '"
7860 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7861 err = -EINVAL;
7862 goto reply;
7863 }
7864 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
7865 if (ww < 0L) {
7866 ss << "weight must be >= 0";
7867 err = -EINVAL;
7868 goto reply;
7869 }
7870 if (osdmap.require_min_compat_client.length() &&
7871 osdmap.require_min_compat_client < "firefly") {
7872 ss << "require_min_compat_client " << osdmap.require_min_compat_client
7873 << " < firefly, which is required for primary-affinity";
7874 err = -EPERM;
7875 goto reply;
7876 } else if (!g_conf->mon_osd_allow_primary_affinity) {
7877 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
7878 err = -EPERM;
7879 goto reply;
7880 }
7881 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
7882 if (err == -EAGAIN)
7883 goto wait;
7884 if (err < 0)
7885 goto reply;
7886 if (osdmap.exists(id)) {
7887 pending_inc.new_primary_affinity[id] = ww;
7888 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
7889 getline(ss, rs);
7890 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7891 get_last_committed() + 1));
7892 return true;
7893 } else {
7894 ss << "osd." << id << " does not exist";
7895 err = -ENOENT;
7896 goto reply;
7897 }
  } else if (prefix == "osd reweight") {
    // Handler for "osd reweight <id> <weight>": set the OSD's override
    // ("in") weight.  Weight is a [0,1] double stored scaled to CEPH_OSD_IN.
    int64_t id;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap["weight"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      // report both the requested value and the raw stored value (in hex)
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    }
  } else if (prefix == "osd reweightn") {
    // Handler for "osd reweightn <weights>": bulk version of "osd reweight";
    // parses a map of osd-id -> weight and applies them all in one proposal.
    map<int32_t, uint32_t> weights;
    err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
    if (err) {
      ss << "unable to parse 'weights' value '"
         << cmd_vartype_stringify(cmdmap["weights"]) << "'";
      goto reply;
    }
    pending_inc.new_weight.insert(weights.begin(), weights.end());
    wait_for_finished_proposal(
      op,
      new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
    return true;
  } else if (prefix == "osd lost") {
    // Handler for "osd lost <id> --yes-i-really-mean-it": declare a down OSD
    // permanently lost (as of its down_at epoch) so recovery can proceed
    // without it.  Requires the explicit confirmation flag since it can
    // mean real data loss.
    int64_t id;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    string sure;
    if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
      ss << "are you SURE?  this might mean real, permanent data loss.  pass "
	    "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    } else if (!osdmap.is_down(id)) {
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply;
    } else {
      // mark it lost as of the epoch at which it went down
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						get_last_committed() + 1));
      return true;
    }

  } else if (prefix == "osd create") {
    // Handler for "osd create [<uuid>] [<id>]": allocate (or look up) an OSD
    // id.  Idempotent on uuid: if the uuid already maps to an OSD, that id is
    // returned; otherwise a free id (or the requested id) is reserved.
    int i = -1;

    // optional id provided?
    int64_t id = -1;
    if (cmd_getval(g_ceph_context, cmdmap, "id", id)) {
      if (id < 0) {
	ss << "invalid osd id value '" << id << "'";
	err = -EINVAL;
	goto reply;
      }
      dout(10) << " osd create got id " << id << dendl;
    }

    // optional uuid provided?
    uuid_d uuid;
    string uuidstr;
    if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
      if (!uuid.parse(uuidstr.c_str())) {
	ss << "invalid uuid value '" << uuidstr << "'";
	err = -EINVAL;
	goto reply;
      }
      dout(10) << " osd create got uuid " << uuid << dendl;
      i = osdmap.identify_osd(uuid);
      if (i >= 0) {
	// osd already exists
	if (id >= 0 && i != id) {
	  ss << "uuid " << uuidstr << " already in use for different id " << i;
	  err = -EINVAL;
	  goto reply;
	}
	// idempotent success: report the existing id
	err = 0;
	if (f) {
	  f->open_object_section("created_osd");
	  f->dump_int("osdid", i);
	  f->close_section();
	  f->flush(rdata);
	} else {
	  ss << i;
	  rdata.append(ss);
	}
	goto reply;
      }
      // i < 0
      if (id >= 0) {
	if (osdmap.exists(id)) {
	  ss << "id " << id << " already in use and does not match uuid "
	     << uuid;
	  err = -EINVAL;
	  goto reply;
	}
	if (pending_inc.new_state.count(id)) {
	  // osd is about to exist
	  wait_for_finished_proposal(op, new C_RetryMessage(this, op));
	  return true;
	}
	i = id;
      }
      if (pending_inc.identify_osd(uuid) >= 0) {
	// osd is about to exist
	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
	return true;
      }
      if (i >= 0) {
	// raise max_osd
	if (osdmap.get_max_osd() <= i && pending_inc.new_max_osd <= i)
	  pending_inc.new_max_osd = i + 1;
	goto done;
      }
    }

    // allocate a new id: first free slot that is neither in the map nor
    // being created by an already-pending increment
    for (i=0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i) &&
	  pending_inc.new_up_client.count(i) == 0 &&
	  (pending_inc.new_state.count(i) == 0 ||
	   (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
	pending_inc.new_weight[i] = CEPH_OSD_OUT;
	goto done;
      }
    }

    // no free slot: raise max_osd
    if (pending_inc.new_max_osd < 0)
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    else
      pending_inc.new_max_osd++;
    i = pending_inc.new_max_osd - 1;

done:
    // record the new osd in the pending increment and reply with its id
    dout(10) << " creating osd." << i << dendl;
    pending_inc.new_state[i] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
    if (!uuid.is_zero())
      pending_inc.new_uuid[i] = uuid;
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", i);
      f->close_section();
      f->flush(rdata);
    } else {
      ss << i;
      rdata.append(ss);
    }
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, rdata,
					      get_last_committed() + 1));
    return true;

  } else if (prefix == "osd blacklist clear") {
    // Handler for "osd blacklist clear": drop every current blacklist entry.
    // Any additions queued in this round are discarded, and every committed
    // entry is scheduled for removal.
    pending_inc.new_blacklist.clear();
    std::list<std::pair<entity_addr_t,utime_t > > blacklist;
    osdmap.get_blacklist(&blacklist);
    for (const auto &entry : blacklist) {
      pending_inc.old_blacklist.push_back(entry.first);
    }
    ss << " removed all blacklist entries";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd blacklist") {
    // Handler for "osd blacklist add|rm <addr> [<expire>]": add an address
    // to the client blacklist (default expiry one hour) or remove it.
    string addrstr;
    cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
    entity_addr_t addr;
    if (!addr.parse(addrstr.c_str(), 0)) {
      ss << "unable to parse address " << addrstr;
      err = -EINVAL;
      goto reply;
    }
    else {
      string blacklistop;
      cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
      if (blacklistop == "add") {
	utime_t expires = ceph_clock_now();
	double d;
	// default one hour
	cmd_getval(g_ceph_context, cmdmap, "expire", d, double(60*60));
	expires += d;

	pending_inc.new_blacklist[addr] = expires;
	ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
	getline(ss, rs);
	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						  get_last_committed() + 1));
	return true;
      } else if (blacklistop == "rm") {
	// entry may be committed in the map, pending in this round, or both
	if (osdmap.is_blacklisted(addr) ||
	    pending_inc.new_blacklist.count(addr)) {
	  if (osdmap.is_blacklisted(addr))
	    pending_inc.old_blacklist.push_back(addr);
	  else
	    pending_inc.new_blacklist.erase(addr);
	  ss << "un-blacklisting " << addr;
	  getline(ss, rs);
	  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						    get_last_committed() + 1));
	  return true;
	}
	ss << addr << " isn't blacklisted";
	err = 0;
	goto reply;
      }
      // NOTE(review): if blacklistop is neither "add" nor "rm", control
      // falls out of the handler chain without setting err here --
      // presumably unreachable given command-schema validation; confirm.
    }
  } else if (prefix == "osd pool mksnap") {
    // Handler for "osd pool mksnap <pool> <snap>": create a managed
    // (pool-level) snapshot.  Rejected for pools using unmanaged (selfmanaged)
    // snaps and for cache tiers.
    string poolstr;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    string snapname;
    cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
    const pg_pool_t *p = osdmap.get_pg_pool(pool);
    if (p->is_unmanaged_snaps_mode()) {
      ss << "pool " << poolstr << " is in unmanaged snaps mode";
      err = -EINVAL;
      goto reply;
    } else if (p->snap_exists(snapname.c_str())) {
      // idempotent: already created
      ss << "pool " << poolstr << " snap " << snapname << " already exists";
      err = 0;
      goto reply;
    } else if (p->is_tier()) {
      ss << "pool " << poolstr << " is a cache tier";
      err = -EINVAL;
      goto reply;
    }
    // work on the pending copy of the pool, creating it from the committed
    // pool if this round has not touched it yet
    pg_pool_t *pp = 0;
    if (pending_inc.new_pools.count(pool))
      pp = &pending_inc.new_pools[pool];
    if (!pp) {
      pp = &pending_inc.new_pools[pool];
      *pp = *p;
    }
    // re-check against the pending copy in case the snap was added earlier
    // in this same round
    if (pp->snap_exists(snapname.c_str())) {
      ss << "pool " << poolstr << " snap " << snapname << " already exists";
    } else {
      pp->add_snap(snapname.c_str(), ceph_clock_now());
      pp->set_snap_epoch(pending_inc.epoch);
      ss << "created pool " << poolstr << " snap " << snapname;
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
					      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pool rmsnap") {
    // Handler for "osd pool rmsnap <pool> <snap>": delete a managed pool
    // snapshot.  Idempotent: removing a nonexistent snap succeeds.
    string poolstr;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    string snapname;
    cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
    const pg_pool_t *p = osdmap.get_pg_pool(pool);
    if (p->is_unmanaged_snaps_mode()) {
      ss << "pool " << poolstr << " is in unmanaged snaps mode";
      err = -EINVAL;
      goto reply;
    } else if (!p->snap_exists(snapname.c_str())) {
      ss << "pool " << poolstr << " snap " << snapname << " does not exist";
      err = 0;
      goto reply;
    }
    // work on the pending copy of the pool (create it from the committed
    // pool if needed), then remove the snap there
    pg_pool_t *pp = 0;
    if (pending_inc.new_pools.count(pool))
      pp = &pending_inc.new_pools[pool];
    if (!pp) {
      pp = &pending_inc.new_pools[pool];
      *pp = *p;
    }
    snapid_t sn = pp->snap_exists(snapname.c_str());
    if (sn) {
      pp->remove_snap(sn);
      pp->set_snap_epoch(pending_inc.epoch);
      ss << "removed pool " << poolstr << " snap " << snapname;
    } else {
      // already removed earlier in this same pending round
      ss << "already removed pool " << poolstr << " snap " << snapname;
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
					      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pool create") {
    // Handler for "osd pool create <pool> <pg_num> [<pgp_num>] [replicated|
    // erasure] [<ec-profile>] [<ruleset>] [<expected_num_objects>]".
    // Resolves the pool type, erasure-code profile and CRUSH ruleset, then
    // delegates the actual creation to prepare_new_pool().
    int64_t pg_num;
    int64_t pgp_num;
    cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
    cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);

    string pool_type_str;
    cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
    if (pool_type_str.empty())
      pool_type_str = pg_pool_t::get_default_type();

    string poolstr;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
    if (pool_id >= 0) {
      // idempotent if the existing pool has the same type; error otherwise
      const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
      if (pool_type_str != p->get_type_name()) {
	ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
 	err = -EINVAL;
      } else {
	ss << "pool '" << poolstr << "' already exists";
	err = 0;
      }
      goto reply;
    }

    int pool_type;
    if (pool_type_str == "replicated") {
      pool_type = pg_pool_t::TYPE_REPLICATED;
    } else if (pool_type_str == "erasure") {
      // erasure pools need cluster-wide feature support
      err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
				   CEPH_FEATURE_OSD_ERASURE_CODES,
				   ss);
      if (err == -EAGAIN)
	goto wait;
      if (err)
	goto reply;
      pool_type = pg_pool_t::TYPE_ERASURE;
    } else {
      ss << "unknown pool type '" << pool_type_str << "'";
      err = -EINVAL;
      goto reply;
    }

    bool implicit_ruleset_creation = false;
    string ruleset_name;
    cmd_getval(g_ceph_context, cmdmap, "ruleset", ruleset_name);
    string erasure_code_profile;
    cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);

    if (pool_type == pg_pool_t::TYPE_ERASURE) {
      if (erasure_code_profile == "")
	erasure_code_profile = "default";
      //handle the erasure code profile
      if (erasure_code_profile == "default") {
	if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
	  if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
	    dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
	    goto wait;
	  }

	  // install the built-in default profile in this round and retry
	  // the pool creation once it has committed
	  map<string,string> profile_map;
	  err = osdmap.get_erasure_code_profile_default(g_ceph_context,
						      profile_map,
						      &ss);
	  if (err)
	    goto reply;
	  dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
	  pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
	  goto wait;
	}
      }
      if (ruleset_name == "") {
	implicit_ruleset_creation = true;
	if (erasure_code_profile == "default") {
	  ruleset_name = "erasure-code";
	} else {
	  dout(1) << "implicitly use ruleset named after the pool: "
		<< poolstr << dendl;
	  ruleset_name = poolstr;
	}
      }
    } else {
      //NOTE:for replicated pool,cmd_map will put ruleset_name to erasure_code_profile field
      ruleset_name = erasure_code_profile;
    }

    if (!implicit_ruleset_creation && ruleset_name != "") {
      int ruleset;
      err = get_crush_ruleset(ruleset_name, &ruleset, &ss);
      if (err == -EAGAIN) {
	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
	return true;
      }
      if (err)
	goto reply;
    }

    int64_t expected_num_objects;
    cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
    if (expected_num_objects < 0) {
      ss << "'expected_num_objects' must be non-negative";
      err = -EINVAL;
      goto reply;
    }

    // fast_read: -1 = pool-type default, 0 = off, >0 = on
    int64_t fast_read_param;
    cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
    FastReadType fast_read = FAST_READ_DEFAULT;
    if (fast_read_param == 0)
      fast_read = FAST_READ_OFF;
    else if (fast_read_param > 0)
      fast_read = FAST_READ_ON;

    err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
			   -1, // default crush rule
			   ruleset_name,
			   pg_num, pgp_num,
			   erasure_code_profile, pool_type,
                           (uint64_t)expected_num_objects,
                           fast_read,
			   &ss);
    if (err < 0) {
      switch(err) {
      case -EEXIST:
	ss << "pool '" << poolstr << "' already exists";
	break;
      case -EAGAIN:
	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
	return true;
      case -ERANGE:
        goto reply;
      default:
	goto reply;
	break;
      }
    } else {
      ss << "pool '" << poolstr << "' created";
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
					      get_last_committed() + 1));
    return true;

  } else if (prefix == "osd pool delete" ||
             prefix == "osd pool rm") {
    // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
    // Destroys a pool and all of its data; requires the name twice plus the
    // confirmation flag.  Deleting a nonexistent pool is a success (err=0)
    // for idempotency.
    string poolstr, poolstr2, sure;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "pool '" << poolstr << "' does not exist";
      err = 0;
      goto reply;
    }

    // test-only variant that bypasses the mon_allow_pool_delete "fake" guard
    bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
    if (poolstr2 != poolstr ||
	(sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
      ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
	 << ".  If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
	 << "followed by --yes-i-really-really-mean-it.";
      err = -EPERM;
      goto reply;
    }
    err = _prepare_remove_pool(pool, &ss, force_no_fake);
    if (err == -EAGAIN) {
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    if (err < 0)
      goto reply;
    goto update;
  } else if (prefix == "osd pool rename") {
    // Handler for "osd pool rename <srcpool> <destpool>".
    string srcpoolstr, destpoolstr;
    cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
    cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
    int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
    int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());

    if (pool_src < 0) {
      if (pool_dst >= 0) {
        // src pool doesn't exist, dst pool does exist: to ensure idempotency
        // of operations, assume this rename succeeded, as it is not changing
        // the current state.  Make sure we output something understandable
        // for whoever is issuing the command, if they are paying attention,
        // in case it was not intentional; or to avoid a "wtf?" and a bug
        // report in case it was intentional, while expecting a failure.
        ss << "pool '" << srcpoolstr << "' does not exist; pool '"
          << destpoolstr << "' does -- assuming successful rename";
        err = 0;
      } else {
        ss << "unrecognized pool '" << srcpoolstr << "'";
        err = -ENOENT;
      }
      goto reply;
    } else if (pool_dst >= 0) {
      // source pool exists and so does the destination pool
      ss << "pool '" << destpoolstr << "' already exists";
      err = -EEXIST;
      goto reply;
    }

    int ret = _prepare_rename_pool(pool_src, destpoolstr);
    if (ret == 0) {
      ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
    } else {
      ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
        << cpp_strerror(ret);
    }
    // note: reply carries _prepare_rename_pool's return code, not err
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
					      get_last_committed() + 1));
    return true;

  } else if (prefix == "osd pool set") {
    // Handler for "osd pool set <pool> <var> <val>": all per-pool tunables
    // are validated and staged by prepare_command_pool_set().
    err = prepare_command_pool_set(cmdmap, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
						   get_last_committed() + 1));
    return true;
  } else if (prefix == "osd tier add") {
    // Handler for "osd tier add <pool> <tierpool> [--force-nonempty]":
    // attach tierpool as a cache tier of pool.
    err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err)
      goto reply;
    string poolstr;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
    if (pool_id < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    string tierpoolstr;
    cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
    int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
    if (tierpool_id < 0) {
      ss << "unrecognized pool '" << tierpoolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
    assert(p);
    const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
    assert(tp);

    if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
      goto reply;
    }

    // make sure new tier is empty
    string force_nonempty;
    cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
    const pool_stat_t& tier_stats =
      mon->pgmon()->pg_map.get_pg_pool_sum_stat(tierpool_id);
    if (tier_stats.stats.sum.num_objects != 0 &&
	force_nonempty != "--force-nonempty") {
      ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
      err = -ENOTEMPTY;
      goto reply;
    }
    if (tp->ec_pool()) {
      ss << "tier pool '" << tierpoolstr
	 << "' is an ec pool, which cannot be a tier";
      err = -ENOTSUP;
      goto reply;
    }
    // snapshot state on the tier would be broken by tiering; only a debug
    // option combined with --force-nonempty can override
    if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
	((force_nonempty != "--force-nonempty") ||
	 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
      ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
      err = -ENOTEMPTY;
      goto reply;
    }
    // go
    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
    pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
    if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
      // already staged this round; retry after it commits
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    np->tiers.insert(tierpool_id);
    np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
    ntp->tier_of = pool_id;
    ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
					      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd tier remove" ||
             prefix == "osd tier rm") {
    // Handler for "osd tier remove/rm <pool> <tierpool>": detach a cache
    // tier from its base pool.  Refuses while the tier is still the overlay.
    string poolstr;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
    if (pool_id < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    string tierpoolstr;
    cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
    int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
    if (tierpool_id < 0) {
      ss << "unrecognized pool '" << tierpoolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
    assert(p);
    const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
    assert(tp);

    if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
      goto reply;
    }

    if (p->tiers.count(tierpool_id) == 0) {
      // idempotent: already not a tier
      ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
      err = 0;
      goto reply;
    }
    if (tp->tier_of != pool_id) {
      ss << "tier pool '" << tierpoolstr << "' is a tier of '"
         << osdmap.get_pool_name(tp->tier_of) << "': "
         // be scary about it; this is an inconsistency and bells must go off
         << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
      err = -EINVAL;
      goto reply;
    }
    if (p->read_tier == tierpool_id) {
      ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
      err = -EBUSY;
      goto reply;
    }
    // go
    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
    pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
    if (np->tiers.count(tierpool_id) == 0 ||
	ntp->tier_of != pool_id ||
	np->read_tier == tierpool_id) {
      // pending round already diverges from the committed state; retry
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    np->tiers.erase(tierpool_id);
    ntp->clear_tier();
    ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
					      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd tier set-overlay") {
    // Handler for "osd tier set-overlay <pool> <overlaypool>": route client
    // reads and writes for the base pool through an attached tier.
    err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err)
      goto reply;
    string poolstr;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
    if (pool_id < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    string overlaypoolstr;
    cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
    int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
    if (overlaypool_id < 0) {
      ss << "unrecognized pool '" << overlaypoolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
    assert(p);
    const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
    assert(overlay_p);
    // the overlay must already be attached as a tier of this pool
    if (p->tiers.count(overlaypool_id) == 0) {
      ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (p->read_tier == overlaypool_id) {
      // idempotent: already the overlay
      err = 0;
      ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
      goto reply;
    }
    if (p->has_read_tier()) {
      ss << "pool '" << poolstr << "' has overlay '"
	 << osdmap.get_pool_name(p->read_tier)
	 << "'; please remove-overlay first";
      err = -EINVAL;
      goto reply;
    }

    // go
    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
    np->read_tier = overlaypool_id;
    np->write_tier = overlaypool_id;
    // force clients to resend in-flight ops so they pick up the new routing
    np->set_last_force_op_resend(pending_inc.epoch);
    pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
    noverlay_p->set_last_force_op_resend(pending_inc.epoch);
    ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
    if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
      ss <<" (WARNING: overlay pool cache_mode is still NONE)";
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
					      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd tier remove-overlay" ||
             prefix == "osd tier rm-overlay") {
    // Handler for "osd tier remove-overlay/rm-overlay <pool>": stop routing
    // the base pool's I/O through its overlay tier.
    string poolstr;
    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
    if (pool_id < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
    assert(p);
    if (!p->has_read_tier()) {
      // idempotent: no overlay to remove
      err = 0;
      ss << "there is now (or already was) no overlay for '" << poolstr << "'";
      goto reply;
    }

    if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
      goto reply;
    }

    // go
    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
    // bump last_force_op_resend on the tier pools too so clients redirect
    // their in-flight ops
    if (np->has_read_tier()) {
      const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
      pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
      nop->set_last_force_op_resend(pending_inc.epoch);
    }
    if (np->has_write_tier()) {
      const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
      pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
      nop->set_last_force_op_resend(pending_inc.epoch);
    }
    np->clear_read_tier();
    np->clear_write_tier();
    np->set_last_force_op_resend(pending_inc.epoch);
    ss << "there is now (or already was) no overlay for '" << poolstr << "'";
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
					      get_last_committed() + 1));
    return true;
8677 } else if (prefix == "osd tier cache-mode") {
8678 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8679 if (err == -EAGAIN)
8680 goto wait;
8681 if (err)
8682 goto reply;
8683 string poolstr;
8684 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
8685 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
8686 if (pool_id < 0) {
8687 ss << "unrecognized pool '" << poolstr << "'";
8688 err = -ENOENT;
8689 goto reply;
8690 }
8691 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
8692 assert(p);
8693 if (!p->is_tier()) {
8694 ss << "pool '" << poolstr << "' is not a tier";
8695 err = -EINVAL;
8696 goto reply;
8697 }
8698 string modestr;
8699 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
8700 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
8701 if (mode < 0) {
8702 ss << "'" << modestr << "' is not a valid cache mode";
8703 err = -EINVAL;
8704 goto reply;
8705 }
8706
8707 string sure;
8708 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
8709 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
8710 mode != pg_pool_t::CACHEMODE_NONE &&
8711 mode != pg_pool_t::CACHEMODE_PROXY &&
8712 mode != pg_pool_t::CACHEMODE_READPROXY) &&
8713 sure != "--yes-i-really-mean-it") {
8714 ss << "'" << modestr << "' is not a well-supported cache mode and may "
8715 << "corrupt your data. pass --yes-i-really-mean-it to force.";
8716 err = -EPERM;
8717 goto reply;
8718 }
8719
8720 // pool already has this cache-mode set and there are no pending changes
8721 if (p->cache_mode == mode &&
8722 (pending_inc.new_pools.count(pool_id) == 0 ||
8723 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
8724 ss << "set cache-mode for pool '" << poolstr << "'"
8725 << " to " << pg_pool_t::get_cache_mode_name(mode);
8726 err = 0;
8727 goto reply;
8728 }
8729
8730 /* Mode description:
8731 *
8732 * none: No cache-mode defined
8733 * forward: Forward all reads and writes to base pool
8734 * writeback: Cache writes, promote reads from base pool
8735 * readonly: Forward writes to base pool
8736 * readforward: Writes are in writeback mode, Reads are in forward mode
8737 * proxy: Proxy all reads and writes to base pool
8738 * readproxy: Writes are in writeback mode, Reads are in proxy mode
8739 *
8740 * Hence, these are the allowed transitions:
8741 *
8742 * none -> any
8743 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
8744 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
8745 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
8746 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
8747 * writeback -> readforward || readproxy || forward || proxy
8748 * readonly -> any
8749 */
8750
    // We check if the transition is valid against the current pool mode, as
    // it is the only committed state thus far. We will blatantly squash
    // whatever mode is on the pending state.
8754
8755 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
8756 (mode != pg_pool_t::CACHEMODE_FORWARD &&
8757 mode != pg_pool_t::CACHEMODE_PROXY &&
8758 mode != pg_pool_t::CACHEMODE_READFORWARD &&
8759 mode != pg_pool_t::CACHEMODE_READPROXY)) {
8760 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
8761 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
8762 << "' pool; only '"
8763 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
8764 << "','"
8765 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
8766 << "','"
8767 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
8768 << "','"
8769 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
8770 << "' allowed.";
8771 err = -EINVAL;
8772 goto reply;
8773 }
8774 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
8775 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
8776 mode != pg_pool_t::CACHEMODE_FORWARD &&
8777 mode != pg_pool_t::CACHEMODE_PROXY &&
8778 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
8779
8780 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
8781 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
8782 mode != pg_pool_t::CACHEMODE_FORWARD &&
8783 mode != pg_pool_t::CACHEMODE_READFORWARD &&
8784 mode != pg_pool_t::CACHEMODE_PROXY)) ||
8785
8786 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
8787 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
8788 mode != pg_pool_t::CACHEMODE_FORWARD &&
8789 mode != pg_pool_t::CACHEMODE_READFORWARD &&
8790 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
8791
8792 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
8793 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
8794 mode != pg_pool_t::CACHEMODE_READFORWARD &&
8795 mode != pg_pool_t::CACHEMODE_PROXY &&
8796 mode != pg_pool_t::CACHEMODE_READPROXY))) {
8797
8798 const pool_stat_t& tier_stats =
8799 mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
8800
8801 if (tier_stats.stats.sum.num_objects_dirty > 0) {
8802 ss << "unable to set cache-mode '"
8803 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
8804 << "': dirty objects found";
8805 err = -EBUSY;
8806 goto reply;
8807 }
8808 }
8809 // go
8810 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
8811 np->cache_mode = mode;
8812 // set this both when moving to and from cache_mode NONE. this is to
8813 // capture legacy pools that were set up before this flag existed.
8814 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
8815 ss << "set cache-mode for pool '" << poolstr
8816 << "' to " << pg_pool_t::get_cache_mode_name(mode);
8817 if (mode == pg_pool_t::CACHEMODE_NONE) {
8818 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
8819 assert(base_pool);
8820 if (base_pool->read_tier == pool_id ||
8821 base_pool->write_tier == pool_id)
8822 ss <<" (WARNING: pool is still configured as read or write tier)";
8823 }
8824 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8825 get_last_committed() + 1));
8826 return true;
8827 } else if (prefix == "osd tier add-cache") {
8828 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8829 if (err == -EAGAIN)
8830 goto wait;
8831 if (err)
8832 goto reply;
8833 string poolstr;
8834 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
8835 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
8836 if (pool_id < 0) {
8837 ss << "unrecognized pool '" << poolstr << "'";
8838 err = -ENOENT;
8839 goto reply;
8840 }
8841 string tierpoolstr;
8842 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
8843 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
8844 if (tierpool_id < 0) {
8845 ss << "unrecognized pool '" << tierpoolstr << "'";
8846 err = -ENOENT;
8847 goto reply;
8848 }
8849 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
8850 assert(p);
8851 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
8852 assert(tp);
8853
8854 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
8855 goto reply;
8856 }
8857
8858 int64_t size = 0;
8859 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
8860 ss << "unable to parse 'size' value '"
8861 << cmd_vartype_stringify(cmdmap["size"]) << "'";
8862 err = -EINVAL;
8863 goto reply;
8864 }
8865 // make sure new tier is empty
8866 const pool_stat_t& tier_stats =
8867 mon->pgmon()->pg_map.get_pg_pool_sum_stat(tierpool_id);
8868 if (tier_stats.stats.sum.num_objects != 0) {
8869 ss << "tier pool '" << tierpoolstr << "' is not empty";
8870 err = -ENOTEMPTY;
8871 goto reply;
8872 }
8873 string modestr = g_conf->osd_tier_default_cache_mode;
8874 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
8875 if (mode < 0) {
8876 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
8877 err = -EINVAL;
8878 goto reply;
8879 }
8880 HitSet::Params hsp;
8881 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
8882 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8883 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
8884 hsp = HitSet::Params(bsp);
8885 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
8886 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
8887 }
8888 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
8889 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
8890 } else {
8891 ss << "osd tier cache default hit set type '" <<
8892 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
8893 err = -EINVAL;
8894 goto reply;
8895 }
8896 // go
8897 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
8898 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
8899 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
8900 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8901 return true;
8902 }
8903 np->tiers.insert(tierpool_id);
8904 np->read_tier = np->write_tier = tierpool_id;
8905 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
8906 np->set_last_force_op_resend(pending_inc.epoch);
8907 ntp->set_last_force_op_resend(pending_inc.epoch);
8908 ntp->tier_of = pool_id;
8909 ntp->cache_mode = mode;
8910 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
8911 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
8912 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
8913 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
8914 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
8915 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
8916 ntp->hit_set_params = hsp;
8917 ntp->target_max_bytes = size;
8918 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
8919 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8920 get_last_committed() + 1));
8921 return true;
8922 } else if (prefix == "osd pool set-quota") {
8923 string poolstr;
8924 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
8925 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
8926 if (pool_id < 0) {
8927 ss << "unrecognized pool '" << poolstr << "'";
8928 err = -ENOENT;
8929 goto reply;
8930 }
8931
8932 string field;
8933 cmd_getval(g_ceph_context, cmdmap, "field", field);
8934 if (field != "max_objects" && field != "max_bytes") {
8935 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
8936 err = -EINVAL;
8937 goto reply;
8938 }
8939
8940 // val could contain unit designations, so we treat as a string
8941 string val;
8942 cmd_getval(g_ceph_context, cmdmap, "val", val);
8943 stringstream tss;
8944 int64_t value = unit_to_bytesize(val, &tss);
8945 if (value < 0) {
8946 ss << "error parsing value '" << value << "': " << tss.str();
8947 err = value;
8948 goto reply;
8949 }
8950
8951 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
8952 if (field == "max_objects") {
8953 pi->quota_max_objects = value;
8954 } else if (field == "max_bytes") {
8955 pi->quota_max_bytes = value;
8956 } else {
8957 assert(0 == "unrecognized option");
8958 }
8959 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
8960 rs = ss.str();
8961 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8962 get_last_committed() + 1));
8963 return true;
8964
8965 } else if (prefix == "osd reweight-by-pg" ||
8966 prefix == "osd reweight-by-utilization" ||
8967 prefix == "osd test-reweight-by-pg" ||
8968 prefix == "osd test-reweight-by-utilization") {
8969 bool by_pg =
8970 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
8971 bool dry_run =
8972 prefix == "osd test-reweight-by-pg" ||
8973 prefix == "osd test-reweight-by-utilization";
8974 int64_t oload;
8975 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
8976 set<int64_t> pools;
8977 vector<string> poolnamevec;
8978 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
8979 for (unsigned j = 0; j < poolnamevec.size(); j++) {
8980 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
8981 if (pool < 0) {
8982 ss << "pool '" << poolnamevec[j] << "' does not exist";
8983 err = -ENOENT;
8984 goto reply;
8985 }
8986 pools.insert(pool);
8987 }
8988 double max_change = g_conf->mon_reweight_max_change;
8989 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
8990 if (max_change <= 0.0) {
8991 ss << "max_change " << max_change << " must be positive";
8992 err = -EINVAL;
8993 goto reply;
8994 }
8995 int64_t max_osds = g_conf->mon_reweight_max_osds;
8996 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
8997 if (max_osds <= 0) {
8998 ss << "max_osds " << max_osds << " must be positive";
8999 err = -EINVAL;
9000 goto reply;
9001 }
9002 string no_increasing;
9003 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
9004 string out_str;
9005 mempool::osdmap::map<int32_t, uint32_t> new_weights;
9006 err = reweight::by_utilization(osdmap,
9007 mon->pgmon()->pg_map,
9008 oload,
9009 max_change,
9010 max_osds,
9011 by_pg,
9012 pools.empty() ? NULL : &pools,
9013 no_increasing == "--no-increasing",
9014 &new_weights,
9015 &ss, &out_str, f.get());
9016 if (err >= 0) {
9017 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
9018 }
9019 if (f)
9020 f->flush(rdata);
9021 else
9022 rdata.append(out_str);
9023 if (err < 0) {
9024 ss << "FAILED reweight-by-pg";
9025 } else if (err == 0 || dry_run) {
9026 ss << "no change";
9027 } else {
9028 ss << "SUCCESSFUL reweight-by-pg";
9029 pending_inc.new_weight = std::move(new_weights);
9030 wait_for_finished_proposal(
9031 op,
9032 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
9033 return true;
9034 }
9035 } else {
9036 err = -EINVAL;
9037 }
9038
9039 reply:
9040 getline(ss, rs);
9041 if (err < 0 && rs.length() == 0)
9042 rs = cpp_strerror(err);
9043 mon->reply_command(op, err, rs, rdata, get_last_committed());
9044 return ret;
9045
9046 update:
9047 getline(ss, rs);
9048 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9049 get_last_committed() + 1));
9050 return true;
9051
9052 wait:
9053 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9054 return true;
9055}
9056
9057bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
9058{
9059 op->mark_osdmon_event(__func__);
9060 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
9061
9062 if (m->fsid != mon->monmap->fsid) {
9063 dout(0) << __func__ << " drop message on fsid " << m->fsid
9064 << " != " << mon->monmap->fsid << " for " << *m << dendl;
9065 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
9066 return true;
9067 }
9068
9069 if (m->op == POOL_OP_CREATE)
9070 return preprocess_pool_op_create(op);
9071
9072 if (!osdmap.get_pg_pool(m->pool)) {
9073 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
9074 _pool_op_reply(op, 0, osdmap.get_epoch());
9075 return true;
9076 }
9077
9078 // check if the snap and snapname exist
9079 bool snap_exists = false;
9080 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
9081 if (p->snap_exists(m->name.c_str()))
9082 snap_exists = true;
9083
9084 switch (m->op) {
9085 case POOL_OP_CREATE_SNAP:
9086 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
9087 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
9088 return true;
9089 }
9090 if (snap_exists) {
9091 _pool_op_reply(op, 0, osdmap.get_epoch());
9092 return true;
9093 }
9094 return false;
9095 case POOL_OP_CREATE_UNMANAGED_SNAP:
9096 if (p->is_pool_snaps_mode()) {
9097 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
9098 return true;
9099 }
9100 return false;
9101 case POOL_OP_DELETE_SNAP:
9102 if (p->is_unmanaged_snaps_mode()) {
9103 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
9104 return true;
9105 }
9106 if (!snap_exists) {
9107 _pool_op_reply(op, 0, osdmap.get_epoch());
9108 return true;
9109 }
9110 return false;
9111 case POOL_OP_DELETE_UNMANAGED_SNAP:
9112 if (p->is_pool_snaps_mode()) {
9113 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
9114 return true;
9115 }
9116 if (p->is_removed_snap(m->snapid)) {
9117 _pool_op_reply(op, 0, osdmap.get_epoch());
9118 return true;
9119 }
9120 return false;
9121 case POOL_OP_DELETE:
9122 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
9123 _pool_op_reply(op, 0, osdmap.get_epoch());
9124 return true;
9125 }
9126 return false;
9127 case POOL_OP_AUID_CHANGE:
9128 return false;
9129 default:
9130 ceph_abort();
9131 break;
9132 }
9133
9134 return false;
9135}
9136
9137bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
9138{
9139 op->mark_osdmon_event(__func__);
9140 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
9141 MonSession *session = m->get_session();
9142 if (!session) {
9143 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
9144 return true;
9145 }
9146 if (!session->is_capable("osd", MON_CAP_W)) {
9147 dout(5) << "attempt to create new pool without sufficient auid privileges!"
9148 << "message: " << *m << std::endl
9149 << "caps: " << session->caps << dendl;
9150 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
9151 return true;
9152 }
9153
9154 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
9155 if (pool >= 0) {
9156 _pool_op_reply(op, 0, osdmap.get_epoch());
9157 return true;
9158 }
9159
9160 return false;
9161}
9162
// Apply a pool op (snap create/delete, unmanaged-snap create/delete,
// auid change) to the pending map state.  Whole-pool CREATE/DELETE are
// dispatched to dedicated helpers.  Returns true when a proposal was
// queued (the client reply is deferred via C_PoolOp until it commits);
// false when we replied immediately without touching pending state.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  // pool must exist in the committed map
  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: checks against the *committed* pool state that let us
  // answer without proposing anything (errors and idempotent no-ops).
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snapshots are not allowed on cache tiers
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap / deleting a missing snap: idempotent
      // success; otherwise fall out of the switch and do the real work.
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool-managed snap ops are invalid in unmanaged snaps mode
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  // (start from the pending copy if one exists, so we layer this op on
  // top of other not-yet-committed changes)
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked here against the *projected* state, which may differ
  // from the committed state consulted above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: mutate the projected pool copy.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      uint64_t snapid;
      // add_unmanaged_snap fills in the newly allocated snapid, which is
      // returned to the client in the reply payload
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the modified pool for the next map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with any payload) once the pending increment commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
9306
9307bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
9308{
9309 op->mark_osdmon_event(__func__);
9310 int err = prepare_new_pool(op);
9311 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
9312 return true;
9313}
9314
9315int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
9316 ostream *ss)
9317{
9318 const string& poolstr = osdmap.get_pool_name(pool_id);
9319
9320 // If the Pool is in use by CephFS, refuse to delete it
9321 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
9322 if (pending_fsmap.pool_in_use(pool_id)) {
9323 *ss << "pool '" << poolstr << "' is in use by CephFS";
9324 return -EBUSY;
9325 }
9326
9327 if (pool.tier_of >= 0) {
9328 *ss << "pool '" << poolstr << "' is a tier of '"
9329 << osdmap.get_pool_name(pool.tier_of) << "'";
9330 return -EBUSY;
9331 }
9332 if (!pool.tiers.empty()) {
9333 *ss << "pool '" << poolstr << "' has tiers";
9334 for(auto tier : pool.tiers) {
9335 *ss << " " << osdmap.get_pool_name(tier);
9336 }
9337 return -EBUSY;
9338 }
9339
9340 if (!g_conf->mon_allow_pool_delete) {
9341 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
9342 return -EPERM;
9343 }
9344
9345 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
9346 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
9347 return -EPERM;
9348 }
9349
9350 *ss << "pool '" << poolstr << "' removed";
9351 return 0;
9352}
9353
9354/**
9355 * Check if it is safe to add a tier to a base pool
9356 *
9357 * @return
9358 * True if the operation should proceed, false if we should abort here
9359 * (abort doesn't necessarily mean error, could be idempotency)
9360 */
9361bool OSDMonitor::_check_become_tier(
9362 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
9363 const int64_t base_pool_id, const pg_pool_t *base_pool,
9364 int *err,
9365 ostream *ss) const
9366{
9367 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
9368 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
9369
9370 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
9371 if (pending_fsmap.pool_in_use(tier_pool_id)) {
9372 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
9373 *err = -EBUSY;
9374 return false;
9375 }
9376
9377 if (base_pool->tiers.count(tier_pool_id)) {
9378 assert(tier_pool->tier_of == base_pool_id);
9379 *err = 0;
9380 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
9381 << base_pool_name << "'";
9382 return false;
9383 }
9384
9385 if (base_pool->is_tier()) {
9386 *ss << "pool '" << base_pool_name << "' is already a tier of '"
9387 << osdmap.get_pool_name(base_pool->tier_of) << "', "
9388 << "multiple tiers are not yet supported.";
9389 *err = -EINVAL;
9390 return false;
9391 }
9392
9393 if (tier_pool->has_tiers()) {
9394 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
9395 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
9396 it != tier_pool->tiers.end(); ++it)
9397 *ss << "'" << osdmap.get_pool_name(*it) << "',";
9398 *ss << " multiple tiers are not yet supported.";
9399 *err = -EINVAL;
9400 return false;
9401 }
9402
9403 if (tier_pool->is_tier()) {
9404 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
9405 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
9406 *err = -EINVAL;
9407 return false;
9408 }
9409
9410 *err = 0;
9411 return true;
9412}
9413
9414
9415/**
9416 * Check if it is safe to remove a tier from this base pool
9417 *
9418 * @return
9419 * True if the operation should proceed, false if we should abort here
9420 * (abort doesn't necessarily mean error, could be idempotency)
9421 */
9422bool OSDMonitor::_check_remove_tier(
9423 const int64_t base_pool_id, const pg_pool_t *base_pool,
9424 const pg_pool_t *tier_pool,
9425 int *err, ostream *ss) const
9426{
9427 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
9428
9429 // Apply CephFS-specific checks
9430 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
9431 if (pending_fsmap.pool_in_use(base_pool_id)) {
9432 if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
9433 // If the underlying pool is erasure coded, we can't permit the
9434 // removal of the replicated tier that CephFS relies on to access it
9435 *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
9436 *err = -EBUSY;
9437 return false;
9438 }
9439
9440 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
9441 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
9442 "tier is still in use as a writeback cache. Change the cache "
9443 "mode and flush the cache before removing it";
9444 *err = -EBUSY;
9445 return false;
9446 }
9447 }
9448
9449 *err = 0;
9450 return true;
9451}
9452
9453int OSDMonitor::_prepare_remove_pool(
9454 int64_t pool, ostream *ss, bool no_fake)
9455{
9456 dout(10) << "_prepare_remove_pool " << pool << dendl;
9457 const pg_pool_t *p = osdmap.get_pg_pool(pool);
9458 int r = _check_remove_pool(pool, *p, ss);
9459 if (r < 0)
9460 return r;
9461
9462 auto new_pool = pending_inc.new_pools.find(pool);
9463 if (new_pool != pending_inc.new_pools.end()) {
9464 // if there is a problem with the pending info, wait and retry
9465 // this op.
9466 const auto& p = new_pool->second;
9467 int r = _check_remove_pool(pool, p, ss);
9468 if (r < 0)
9469 return -EAGAIN;
9470 }
9471
9472 if (pending_inc.old_pools.count(pool)) {
9473 dout(10) << "_prepare_remove_pool " << pool << " already pending removal"
9474 << dendl;
9475 return 0;
9476 }
9477
9478 if (g_conf->mon_fake_pool_delete && !no_fake) {
9479 string old_name = osdmap.get_pool_name(pool);
9480 string new_name = old_name + "." + stringify(pool) + ".DELETED";
9481 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
9482 << old_name << " -> " << new_name << dendl;
9483 pending_inc.new_pool_names[pool] = new_name;
9484 return 0;
9485 }
9486
9487 // remove
9488 pending_inc.old_pools.insert(pool);
9489
9490 // remove any pg_temp mappings for this pool too
9491 for (auto p = osdmap.pg_temp->begin();
9492 p != osdmap.pg_temp->end();
9493 ++p) {
9494 if (p->first.pool() == (uint64_t)pool) {
9495 dout(10) << "_prepare_remove_pool " << pool << " removing obsolete pg_temp "
9496 << p->first << dendl;
9497 pending_inc.new_pg_temp[p->first].clear();
9498 }
9499 }
9500 for (auto p = osdmap.primary_temp->begin();
9501 p != osdmap.primary_temp->end();
9502 ++p) {
9503 if (p->first.pool() == (uint64_t)pool) {
9504 dout(10) << "_prepare_remove_pool " << pool
9505 << " removing obsolete primary_temp" << p->first << dendl;
9506 pending_inc.new_primary_temp[p->first] = -1;
9507 }
9508 }
9509 return 0;
9510}
9511
9512int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
9513{
9514 dout(10) << "_prepare_rename_pool " << pool << dendl;
9515 if (pending_inc.old_pools.count(pool)) {
9516 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
9517 return -ENOENT;
9518 }
9519 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
9520 p != pending_inc.new_pool_names.end();
9521 ++p) {
9522 if (p->second == newname && p->first != pool) {
9523 return -EEXIST;
9524 }
9525 }
9526
9527 pending_inc.new_pool_names[pool] = newname;
9528 return 0;
9529}
9530
9531bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
9532{
9533 op->mark_osdmon_event(__func__);
9534 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
9535 ostringstream ss;
9536 int ret = _prepare_remove_pool(m->pool, &ss, false);
9537 if (ret == -EAGAIN) {
9538 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9539 return true;
9540 }
9541 if (ret < 0)
9542 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
9543 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
9544 pending_inc.epoch));
9545 return true;
9546}
9547
9548void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
9549 int ret, epoch_t epoch, bufferlist *blp)
9550{
9551 op->mark_osdmon_event(__func__);
9552 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
9553 dout(20) << "_pool_op_reply " << ret << dendl;
9554 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
9555 ret, epoch, get_last_committed(), blp);
9556 mon->send_reply(op, reply);
9557}