// Source: git.proxmox.com — ceph.git / ceph/src/mon/OSDMonitor.cc
// (sources updated to v12.2.1)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <locale>
22 #include <sstream>
23
24 #include "mon/OSDMonitor.h"
25 #include "mon/Monitor.h"
26 #include "mon/MDSMonitor.h"
27 #include "mon/PGMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreated.h"
51 #include "messages/MOSDPGTemp.h"
52 #include "messages/MMonCommand.h"
53 #include "messages/MRemoveSnaps.h"
54 #include "messages/MOSDScrub.h"
55 #include "messages/MRoute.h"
56
57 #include "common/TextTable.h"
58 #include "common/Timer.h"
59 #include "common/ceph_argparse.h"
60 #include "common/perf_counters.h"
61 #include "common/strtol.h"
62
63 #include "common/config.h"
64 #include "common/errno.h"
65
66 #include "erasure-code/ErasureCodePlugin.h"
67 #include "compressor/Compressor.h"
68 #include "common/Checksummer.h"
69
70 #include "include/compat.h"
71 #include "include/assert.h"
72 #include "include/stringify.h"
73 #include "include/util.h"
74 #include "common/cmdparse.h"
75 #include "include/str_list.h"
76 #include "include/str_map.h"
77 #include "include/scope_guard.h"
78
79 #include "json_spirit/json_spirit_reader.h"
80
81 #include <boost/algorithm/string/predicate.hpp>
82
#define dout_subsys ceph_subsys_mon
// Store prefix under which the creating-pgs bookkeeping is persisted
// (see get_store_prefixes() / update_from_paxos()).
#define OSD_PG_CREATING_PREFIX "osd_pg_creating"

namespace {

// Limits on per-pool application metadata (count of applications, keys
// per application, and key/value string length).  NOTE(review): presumably
// enforced by the pool-application command handlers; the enforcement site
// is not visible in this chunk — confirm before relying on these here.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;

} // anonymous namespace
93
94 void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
95 {
96 if (epoch_by_pg.size() <= ps) {
97 epoch_by_pg.resize(ps + 1, 0);
98 }
99 const auto old_lec = epoch_by_pg[ps];
100 if (old_lec >= last_epoch_clean) {
101 // stale lec
102 return;
103 }
104 epoch_by_pg[ps] = last_epoch_clean;
105 if (last_epoch_clean < floor) {
106 floor = last_epoch_clean;
107 } else if (last_epoch_clean > floor) {
108 if (old_lec == floor) {
109 // probably should increase floor?
110 auto new_floor = std::min_element(std::begin(epoch_by_pg),
111 std::end(epoch_by_pg));
112 floor = *new_floor;
113 }
114 }
115 if (ps != next_missing) {
116 return;
117 }
118 for (; next_missing < epoch_by_pg.size(); next_missing++) {
119 if (epoch_by_pg[next_missing] == 0) {
120 break;
121 }
122 }
123 }
124
// Forget all last-epoch-clean tracking for a (deleted) pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
129
130 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
131 {
132 auto& lec = report_by_pool[pg.pool()];
133 return lec.report(pg.ps(), last_epoch_clean);
134 }
135
// Compute a lower bound on "every pg in every pool of 'latest' was clean
// through this epoch".  Returns 0 (no bound available) if any pool has
// never reported or still has pgs that have never reported clean;
// otherwise the minimum floor over all pools, capped by latest's epoch.
epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
{
  auto floor = latest.get_epoch();
  for (auto& pool : latest.get_pools()) {
    auto reported = report_by_pool.find(pool.first);
    if (reported == report_by_pool.end()) {
      // this pool has never reported; cannot bound it
      return 0;
    }
    if (reported->second.next_missing < pool.second.get_pg_num()) {
      // at least one pg of this pool has never reported clean
      return 0;
    }
    if (reported->second.floor < floor) {
      floor = reported->second.floor;
    }
  }
  return floor;
}
153
154
// Completion callback for an async osdmap mapping job: on success, log
// how long the mapping took, then refresh the creating-pgs state and
// notify pg-create subscribers.
struct C_UpdateCreatingPGs : public Context {
  OSDMonitor *osdmon;
  utime_t start;   // when this context was created, for the timing log line
  epoch_t epoch;   // osdmap epoch the mapping job was started against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
171
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
		<< "(" << mon->get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
179
// Construct the OSD paxos service.  The incremental and full osdmap
// caches are sized by mon_osd_cache_size; the mapper shares the
// monitor's CPU thread pool.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf->mon_osd_cache_size),
   full_osd_cache(g_conf->mon_osd_cache_size),
   last_attempted_minwait_time(utime_t()),
   mapper(mn->cct, &mn->cpu_tp),
   op_tracker(cct, true, 1)
{}
193
// True if the pending incremental carries a new crush map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
198
// The crush map of the committed osdmap (ignores any pending change).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
203
204 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
205 {
206 bufferlist bl;
207 if (pending_inc.crush.length())
208 bl = pending_inc.crush;
209 else
210 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
211
212 bufferlist::iterator p = bl.begin();
213 newcrush.decode(p);
214 }
215
// Build the very first osdmap (epoch 1) for a brand-new cluster — either
// from a map stashed under "mkfs"/"osdmap" or a freshly built simple map —
// and encode it into the pending incremental as a full map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // a seed osdmap was provided at mkfs time; adopt it (fixing the fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  // new cluster should require latest by default
  if (g_conf->mon_debug_no_require_luminous) {
    newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
    derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
  } else {
    newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    newmap.flags |=
      CEPH_OSDMAP_RECOVERY_DELETES |
      CEPH_OSDMAP_PURGED_SNAPDIRS;
    // ratios may be configured as percentages (> 1.0); normalize to fractions
    newmap.full_ratio = g_conf->mon_osd_full_ratio;
    if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
    newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
    newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
    if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
    int r = ceph_release_from_name(
      g_conf->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  newmap.encode(pending_inc.fullmap,
		mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
266
267 void OSDMonitor::get_store_prefixes(std::set<string>& s)
268 {
269 s.insert(service_name);
270 s.insert(OSD_PG_CREATING_PREFIX);
271 }
272
// Bring the in-memory osdmap up to date with the latest paxos-committed
// state: repair/locate the latest stored full map if needed, load it,
// then replay incrementals up to get_last_committed(), persisting a full
// map per epoch as we go, and finally refresh derived state (pg service
// selection, down_pending_out, subscriptions, msgr features, mapping job).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // a mapping job computed against an older epoch is useless; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    latest_full = 0;
    // scan backwards for the newest epoch that has a stored full map
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  // fast-forward to the newest stored full map before replaying incrementals
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    // load the persisted creating-pgs state (kept in its own store prefix)
    bufferlist bl;
    if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
      auto p = bl.begin();
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      creating_pgs.decode(p);
      dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	      << creating_pgs.last_scan_epoch
	      << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
    } else {
      dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	      << dendl;
    }
  }

  // make sure we're using the right pg service.. remove me post-luminous!
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    dout(10) << __func__ << " pgservice is mgrstat" << dendl;
    mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
  } else {
    dout(10) << __func__ << " pgservice is pg" << dendl;
    mon->pgservice = mon->pgmon()->get_pg_stat_service();
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent. Reloading here will bring us back into
	// sync with the primary for this and all future maps. OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;
	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);
      }
    } else {
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // epoch 1 committed: the mkfs seed map is no longer needed
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // make sure we're using the right pg service.. remove me post-luminous!
    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " pgservice is mgrstat" << dendl;
      mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
    } else {
      dout(10) << __func__ << " pgservice is pg" << dendl;
      mon->pgservice = mon->pgmon()->get_pg_stat_service();
    }

    // flush the accumulated transaction once it grows large enough
    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    if (mon->monmap->get_required_features().contains_all(
	  ceph::features::mon::FEATURE_LUMINOUS)) {
      for (const auto &osd_state : inc.new_state) {
	if (osd_state.second & CEPH_OSD_UP) {
	  // could be marked up *or* down, but we're too lazy to check which
	  last_osd_report.erase(osd_state.first);
	}
	if (osd_state.second & CEPH_OSD_EXISTS) {
	  // could be created *or* destroyed, but we can safely drop it
	  osd_epochs.erase(osd_state.first);
	}
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // rebuild the down -> pending-out tracking against the new map
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
506
507 void OSDMonitor::start_mapping()
508 {
509 // initiate mapping job
510 if (mapping_job) {
511 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
512 << dendl;
513 mapping_job->abort();
514 }
515 if (!osdmap.get_pools().empty()) {
516 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
517 mapping_job = mapping.start_update(osdmap, mapper,
518 g_conf->mon_osd_mapping_pgs_per_chunk);
519 dout(10) << __func__ << " started mapping job " << mapping_job.get()
520 << " at " << fin->start << dendl;
521 mapping_job->set_finish_event(fin);
522 } else {
523 dout(10) << __func__ << " no pools, no mapping job" << dendl;
524 mapping_job = nullptr;
525 }
526 }
527
528 void OSDMonitor::update_msgr_features()
529 {
530 set<int> types;
531 types.insert((int)entity_name_t::TYPE_OSD);
532 types.insert((int)entity_name_t::TYPE_CLIENT);
533 types.insert((int)entity_name_t::TYPE_MDS);
534 types.insert((int)entity_name_t::TYPE_MON);
535 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
536 uint64_t mask;
537 uint64_t features = osdmap.get_features(*q, &mask);
538 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
539 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
540 Messenger::Policy p = mon->messenger->get_policy(*q);
541 p.features_required = (p.features_required & ~mask) | features;
542 mon->messenger->set_policy(*q, p);
543 }
544 }
545 }
546
547 void OSDMonitor::on_active()
548 {
549 update_logger();
550
551 if (mon->is_leader()) {
552 mon->clog->debug() << "osdmap " << osdmap;
553 } else {
554 list<MonOpRequestRef> ls;
555 take_all_failures(ls);
556 while (!ls.empty()) {
557 MonOpRequestRef op = ls.front();
558 op->mark_osdmon_event(__func__);
559 dispatch(op);
560 ls.pop_front();
561 }
562 }
563 start_mapping();
564 }
565
// Called when the monitor restarts its services (e.g. after an election
// bootstrap): drop stale osd-report timestamps and, on the leader,
// repair crush maps whose legacy rule ids do not match their rulesets.
void OSDMonitor::on_restart()
{
  // timestamps predate the restart; start fresh
  last_osd_report.clear();

  if (mon->is_leader()) {
    // fix ruleset != ruleid
    if (osdmap.crush->has_legacy_rulesets() &&
	!osdmap.crush->has_multirule_rulesets()) {
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);
      int r = newcrush.renumber_rules_by_ruleset();
      if (r >= 0) {
	dout(1) << __func__ << " crush map has ruleset != rule id; fixing" << dendl;
	pending_inc.crush.clear();
	newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
      } else {
	// renumbering failed (e.g. conflicting ids); leave the map as-is
	dout(10) << __func__ << " unable to renumber rules by ruleset" << dendl;
      }
    }
  }
}
587
588 void OSDMonitor::on_shutdown()
589 {
590 dout(10) << __func__ << dendl;
591 if (mapping_job) {
592 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
593 << dendl;
594 mapping_job->abort();
595 }
596
597 // discard failure info, waiters
598 list<MonOpRequestRef> ls;
599 take_all_failures(ls);
600 ls.clear();
601 }
602
603 void OSDMonitor::update_logger()
604 {
605 dout(10) << "update_logger" << dendl;
606
607 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
608 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
609 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
610 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
611 }
612
// Begin a fresh pending incremental for epoch osdmap.epoch+1: clean up
// stale pg_temp/primary_temp entries and make sure the full-ratio fields
// are populated (migrating them from PGMap when upgrading from
// pre-luminous maps, normalizing percentage-style config values).
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // clean up pg_temp, primary_temp
  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
  dout(10) << "create_pending did clean_temps" << dendl;

  // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
  // instead of osd_backfill_full_ratio config
  if (osdmap.backfillfull_ratio <= 0) {
    pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (pending_inc.new_backfillfull_ratio > 1.0)
      pending_inc.new_backfillfull_ratio /= 100;  // configured as a percentage
    dout(1) << __func__ << " setting backfillfull_ratio = "
	    << pending_inc.new_backfillfull_ratio << dendl;
  }
  if (osdmap.get_epoch() > 0 &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // transition full ratios from PGMap to OSDMap (on upgrade)
    float full_ratio = mon->pgservice->get_full_ratio();
    float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
    if (osdmap.full_ratio != full_ratio) {
      dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
	       << " -> " << full_ratio << " (from pgmap)" << dendl;
      pending_inc.new_full_ratio = full_ratio;
    }
    if (osdmap.nearfull_ratio != nearfull_ratio) {
      dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
	       << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
      pending_inc.new_nearfull_ratio = nearfull_ratio;
    }
  } else {
    // safety check (this shouldn't really happen)
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
	pending_inc.new_full_ratio /= 100;  // configured as a percentage
      dout(1) << __func__ << " setting full_ratio = "
	      << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
	pending_inc.new_nearfull_ratio /= 100;  // configured as a percentage
      dout(1) << __func__ << " setting nearfull_ratio = "
	      << pending_inc.new_nearfull_ratio << dendl;
    }
  }
}
666
// Fold the effects of incremental 'inc' into a snapshot of creating_pgs:
// queue pgs for new pools, drop pgs of deleted pools, retire pgs reported
// created, then move queued pgs into the active creating set (bounded by
// mon_osd_max_creating_pgs).  Returns the updated snapshot; the caller
// persists it.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // take a consistent copy; creating_pgs is shared with other paths
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      // pre-luminous: pgmap still tracks creating pgs; pick them up from there
      auto added =
	mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
					       osdmap.get_pools(),
					       &pending_creatings);
      dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
    }
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // process queue
  unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  // promote queued pg ranges into the active creating set, pool by pool,
  // until we hit the cap or the queue is drained
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    int n = MIN(max - pending_creatings.pgs.size(),
		p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
						    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
761
// Decide which pg_temp mappings to pre-populate ("prime") in the pending
// incremental.  If the change is broad — new crush map, newly-up osds,
// weight increases, or an estimated pg count above the configured
// fraction — prime everything via a parallel job; otherwise prime only
// the pgs touching the affected osds, bounded in wall-clock time by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds whose up/down state is flipping in this proposal
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the per-osd workload; if it is a large fraction of all
    // pgs, the full parallel job is cheaper than targeted priming
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // the map the cluster will see once pending_inc commits
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;  // already primed via another osd
	}
	prime_pg_temp(next, pgid);
	// check the clock only every 'chunk' pgs to keep overhead low
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
864
865 void OSDMonitor::prime_pg_temp(
866 const OSDMap& next,
867 pg_t pgid)
868 {
869 if (mon->monmap->get_required_features().contains_all(
870 ceph::features::mon::FEATURE_LUMINOUS)) {
871 // TODO: remove this creating_pgs direct access?
872 if (creating_pgs.pgs.count(pgid)) {
873 return;
874 }
875 } else {
876 if (mon->pgservice->is_creating_pg(pgid)) {
877 return;
878 }
879 }
880 if (!osdmap.pg_exists(pgid)) {
881 return;
882 }
883
884 vector<int> up, acting;
885 mapping.get(pgid, &up, nullptr, &acting, nullptr);
886
887 vector<int> next_up, next_acting;
888 int next_up_primary, next_acting_primary;
889 next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
890 &next_acting, &next_acting_primary);
891 if (acting == next_acting && next_up != next_acting)
892 return; // no change since last epoch
893
894 if (acting.empty())
895 return; // if previously empty now we can be no worse off
896 const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
897 if (pool && acting.size() < pool->min_size)
898 return; // can be no worse off than before
899
900 if (next_up == next_acting) {
901 acting.clear();
902 dout(20) << __func__ << "next_up === next_acting now, clear pg_temp"
903 << dendl;
904 }
905
906 dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
907 << " -> " << next_up << "/" << next_acting
908 << ", priming " << acting
909 << dendl;
910 {
911 Mutex::Locker l(prime_pg_temp_lock);
912 // do not touch a mapping if a change is pending
913 pending_inc.new_pg_temp.emplace(
914 pgid,
915 mempool::osdmap::vector<int>(acting.begin(), acting.end()));
916 }
917 }
918
919 /**
920 * @note receiving a transaction in this function gives a fair amount of
921 * freedom to the service implementation if it does need it. It shouldn't.
922 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
  assert(r == 0);

  // If a background mapping job finished for this epoch, use it to prime
  // pg_temp entries; otherwise skip priming (an aborted or stale job is
  // only an optimization we can live without).
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " is prior epoch "
	      << mapping.get_epoch() << dendl;
    } else {
      if (g_conf->mon_osd_prime_pg_temp) {
	maybe_prime_pg_temp();
      }
    }
  } else if (g_conf->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
	    << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      ++p;
    }
  }

  bufferlist bl;

  {
    // Build a throwaway post-incremental map so we can derive updates
    // (flags, min-compat, upgrade conversions) that depend on the *next*
    // map's state.  Changes are folded back into pending_inc.
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // set or clear full/nearfull?
      int full, backfill, nearfull;
      tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
      if (full > 0) {
	if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
	  dout(10) << __func__ << " setting full flag" << dendl;
	  add_flag(CEPH_OSDMAP_FULL);
	  remove_flag(CEPH_OSDMAP_NEARFULL);  // FULL supersedes NEARFULL
	}
      } else {
	if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
	  dout(10) << __func__ << " clearing full flag" << dendl;
	  remove_flag(CEPH_OSDMAP_FULL);
	}
	if (nearfull > 0) {
	  if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
	    dout(10) << __func__ << " setting nearfull flag" << dendl;
	    add_flag(CEPH_OSDMAP_NEARFULL);
	  }
	} else {
	  if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
	    dout(10) << __func__ << " clearing nearfull flag" << dendl;
	    remove_flag(CEPH_OSDMAP_NEARFULL);
	  }
	}
      }

      // min_compat_client?  0 means "never set"; initialize it to what the
      // current map already requires.
      if (tmp.require_min_compat_client == 0) {
	auto mv = tmp.get_min_compat_client();
	dout(1) << __func__ << " setting require_min_compat_client to currently "
		<< "required " << ceph_release_name(mv) << dendl;
	mon->clog->info() << "setting require_min_compat_client to currently "
			  << "required " << ceph_release_name(mv);
	pending_inc.new_require_min_compat_client = mv;
      }

      // One-time upgrade transition: the committed map is pre-luminous but
      // the pending one is luminous, so this branch runs exactly once.
      if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
	// convert ec profile ruleset-* -> crush-*
	for (auto& p : tmp.erasure_code_profiles) {
	  bool changed = false;
	  map<string,string> newprofile;
	  for (auto& q : p.second) {
	    if (q.first.find("ruleset-") == 0) {
	      string key = "crush-";
	      key += q.first.substr(8);
	      newprofile[key] = q.second;
	      changed = true;
	      dout(20) << " updating ec profile " << p.first
		       << " key " << q.first << " -> " << key << dendl;
	    } else {
	      newprofile[q.first] = q.second;
	    }
	  }
	  if (changed) {
	    dout(10) << " updated ec profile " << p.first << ": "
		     << newprofile << dendl;
	    pending_inc.new_erasure_code_profiles[p.first] = newprofile;
	  }
	}

	// auto-enable pool applications upon upgrade
	// NOTE: this can be removed post-Luminous assuming upgrades need to
	// proceed through Luminous
	for (auto &pool_pair : tmp.pools) {
	  int64_t pool_id = pool_pair.first;
	  // deliberate copy: the (possibly) modified pool goes into
	  // pending_inc.new_pools below
	  pg_pool_t pg_pool = pool_pair.second;
	  if (pg_pool.is_tier()) {
	    continue;
	  }

	  std::string pool_name = tmp.get_pool_name(pool_id);
	  uint32_t match_count = 0;

	  // CephFS
	  FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
	  if (pending_fsmap.pool_in_use(pool_id)) {
	    dout(10) << __func__ << " auto-enabling CephFS on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_CEPHFS, {}});
	    ++match_count;
	  }

	  // RBD heuristics (default OpenStack pool names from docs and
	  // ceph-ansible)
	  if (boost::algorithm::contains(pool_name, "rbd") ||
	      pool_name == "images" || pool_name == "volumes" ||
	      pool_name == "backups" || pool_name == "vms") {
	    dout(10) << __func__ << " auto-enabling RBD on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RBD, {}});
	    ++match_count;
	  }

	  // RGW heuristics
	  if (boost::algorithm::contains(pool_name, ".rgw") ||
	      boost::algorithm::contains(pool_name, ".log") ||
	      boost::algorithm::contains(pool_name, ".intent-log") ||
	      boost::algorithm::contains(pool_name, ".usage") ||
	      boost::algorithm::contains(pool_name, ".users")) {
	    dout(10) << __func__ << " auto-enabling RGW on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RGW, {}});
	    ++match_count;
	  }

	  // OpenStack gnocchi (from ceph-ansible)
	  if (pool_name == "metrics" && match_count == 0) {
	    dout(10) << __func__ << " auto-enabling OpenStack Gnocchi on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert({"openstack_gnocchi", {}});
	    ++match_count;
	  }

	  // Only commit when exactly one heuristic matched; an ambiguous
	  // (>1) match on a non-empty pool is just reported, not applied.
	  if (match_count == 1) {
	    pg_pool.last_change = pending_inc.epoch;
	    pending_inc.new_pools[pool_id] = pg_pool;
	  } else if (match_count > 1) {
	    auto pstat = mon->pgservice->get_pool_stat(pool_id);
	    if (pstat != nullptr && pstat->stats.sum.num_objects > 0) {
	      mon->clog->info() << "unable to auto-enable application for pool "
				<< "'" << pool_name << "'";
	    }
	  }
	}
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features = mon->get_quorum_con_features();

  // encode full map and determine its crc
  // (tmp is declared outside the scope so check_health() below can use it)
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features: don't advertise features newer than
    // what the post-incremental map actually requires
    if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
	       << dendl;
      features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
    }
    if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
      dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
	       << "MSG_ADDR2" << dendl;
      features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
		    CEPH_FEATURE_MSG_ADDR2);
    }
    if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
      dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
      features &= ~CEPH_FEATURE_SERVER_JEWEL;
    }
    dout(10) << __func__ << " encoding full map with " << features << dendl;

    bufferlist fullbl;
    ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn. note that old monitors will
    // overwrite this. new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  assert(get_last_committed() + 1 == pending_inc.epoch);
  ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // and pg creating, also!
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    auto pending_creatings = update_pending_pgs(pending_inc);
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(7) << __func__ << " in the middle of upgrading, "
	      << " trimming pending creating_pgs using pgmap" << dendl;
      mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
    }
    bufferlist creatings_bl;
    ::encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1215
1216 void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
1217 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
1218 {
1219 auto p = creating_pgs->pgs.begin();
1220 while (p != creating_pgs->pgs.end()) {
1221 auto q = pg_stat.find(p->first);
1222 if (q != pg_stat.end() &&
1223 !(q->second.state & PG_STATE_CREATING)) {
1224 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1225 << dendl;
1226 p = creating_pgs->pgs.erase(p);
1227 } else {
1228 ++p;
1229 }
1230 }
1231 }
1232
1233 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1234 {
1235 bufferlist bl;
1236 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1237 if (r < 0)
1238 return r;
1239 try {
1240 bufferlist::iterator p = bl.begin();
1241 ::decode(m, p);
1242 }
1243 catch (buffer::error& e) {
1244 if (err)
1245 *err << "osd." << osd << " metadata is corrupt";
1246 return -EIO;
1247 }
1248 return 0;
1249 }
1250
1251 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
1252 {
1253 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1254 if (osdmap.is_up(osd)) {
1255 map<string,string> meta;
1256 load_metadata(osd, meta, nullptr);
1257 auto p = meta.find(field);
1258 if (p == meta.end()) {
1259 (*out)["unknown"]++;
1260 } else {
1261 (*out)[p->second]++;
1262 }
1263 }
1264 }
1265 }
1266
1267 void OSDMonitor::count_metadata(const string& field, Formatter *f)
1268 {
1269 map<string,int> by_val;
1270 count_metadata(field, &by_val);
1271 f->open_object_section(field.c_str());
1272 for (auto& p : by_val) {
1273 f->dump_int(p.first.c_str(), p.second);
1274 }
1275 f->close_section();
1276 }
1277
1278 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1279 {
1280 map<string, string> metadata;
1281 int r = load_metadata(osd, metadata, nullptr);
1282 if (r < 0)
1283 return r;
1284
1285 auto it = metadata.find("osd_objectstore");
1286 if (it == metadata.end())
1287 return -ENOENT;
1288 *type = it->second;
1289 return 0;
1290 }
1291
1292 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1293 const pg_pool_t &pool,
1294 ostream *err)
1295 {
1296 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1297 // since filestore osds could always join the pool later
1298 set<int> checked_osds;
1299 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1300 vector<int> up, acting;
1301 pg_t pgid(ps, pool_id, -1);
1302 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1303 for (int osd : up) {
1304 if (checked_osds.find(osd) != checked_osds.end())
1305 continue;
1306 string objectstore_type;
1307 int r = get_osd_objectstore_type(osd, &objectstore_type);
1308 // allow with missing metadata, e.g. due to an osd never booting yet
1309 if (r < 0 || objectstore_type == "bluestore") {
1310 checked_osds.insert(osd);
1311 continue;
1312 }
1313 *err << "osd." << osd << " uses " << objectstore_type;
1314 return false;
1315 }
1316 }
1317 return true;
1318 }
1319
1320 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1321 {
1322 map<string,string> m;
1323 if (int r = load_metadata(osd, m, err))
1324 return r;
1325 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1326 f->dump_string(p->first.c_str(), p->second);
1327 return 0;
1328 }
1329
1330 void OSDMonitor::print_nodes(Formatter *f)
1331 {
1332 // group OSDs by their hosts
1333 map<string, list<int> > osds; // hostname => osd
1334 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1335 map<string, string> m;
1336 if (load_metadata(osd, m, NULL)) {
1337 continue;
1338 }
1339 map<string, string>::iterator hostname = m.find("hostname");
1340 if (hostname == m.end()) {
1341 // not likely though
1342 continue;
1343 }
1344 osds[hostname->second].push_back(osd);
1345 }
1346
1347 dump_services(f, osds, "osd");
1348 }
1349
1350 void OSDMonitor::share_map_with_random_osd()
1351 {
1352 if (osdmap.get_num_up_osds() == 0) {
1353 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1354 return;
1355 }
1356
1357 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1358 if (!s) {
1359 dout(10) << __func__ << " no up osd on our session map" << dendl;
1360 return;
1361 }
1362
1363 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
1364 // whatev, they'll request more if they need it
1365 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
1366 s->con->send_message(m);
1367 // NOTE: do *not* record osd has up to this epoch (as we do
1368 // elsewhere) as they may still need to request older values.
1369 }
1370
version_t OSDMonitor::get_trim_to()
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  // Never trim while any pg creation is still in flight; the osds may
  // still need the maps the creations were issued against.
  epoch_t floor;
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    {
      // TODO: Get this hidden in PGStatService
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      if (!creating_pgs.pgs.empty()) {
	return 0;
      }
    }
    floor = get_min_last_epoch_clean();
  } else {
    // pre-luminous: defer to the (possibly not yet readable) pgmap service
    if (!mon->pgservice->is_readable())
      return 0;
    if (mon->pgservice->have_creating_pgs()) {
      return 0;
    }
    floor = mon->pgservice->get_min_last_epoch_clean();
  }
  {
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // operator override: force the floor to a configured epoch (only if it
    // is actually below what we have committed)
    if (g_conf->mon_osd_force_trim_to > 0 &&
	g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs maps around; the inner
    // check guards against unsigned underflow when few epochs exist
    unsigned min = g_conf->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim target if it is ahead of what's already trimmed
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1416
1417 epoch_t OSDMonitor::get_min_last_epoch_clean() const
1418 {
1419 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1420 // also scan osd epochs
1421 // don't trim past the oldest reported osd epoch
1422 for (auto& osd_epoch : osd_epochs) {
1423 if (osd_epoch.second < floor) {
1424 floor = osd_epoch.second;
1425 }
1426 }
1427 return floor;
1428 }
1429
1430 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1431 version_t first)
1432 {
1433 dout(10) << __func__ << " including full map for e " << first << dendl;
1434 bufferlist bl;
1435 get_version_full(first, bl);
1436 put_version_full(tx, first, bl);
1437 }
1438
1439 // -------------
1440
1441 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
1442 {
1443 op->mark_osdmon_event(__func__);
1444 Message *m = op->get_req();
1445 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
1446
1447 switch (m->get_type()) {
1448 // READs
1449 case MSG_MON_COMMAND:
1450 return preprocess_command(op);
1451 case CEPH_MSG_MON_GET_OSDMAP:
1452 return preprocess_get_osdmap(op);
1453
1454 // damp updates
1455 case MSG_OSD_MARK_ME_DOWN:
1456 return preprocess_mark_me_down(op);
1457 case MSG_OSD_FULL:
1458 return preprocess_full(op);
1459 case MSG_OSD_FAILURE:
1460 return preprocess_failure(op);
1461 case MSG_OSD_BOOT:
1462 return preprocess_boot(op);
1463 case MSG_OSD_ALIVE:
1464 return preprocess_alive(op);
1465 case MSG_OSD_PG_CREATED:
1466 return preprocess_pg_created(op);
1467 case MSG_OSD_PGTEMP:
1468 return preprocess_pgtemp(op);
1469 case MSG_OSD_BEACON:
1470 return preprocess_beacon(op);
1471
1472 case CEPH_MSG_POOLOP:
1473 return preprocess_pool_op(op);
1474
1475 case MSG_REMOVE_SNAPS:
1476 return preprocess_remove_snaps(op);
1477
1478 default:
1479 ceph_abort();
1480 return true;
1481 }
1482 }
1483
1484 bool OSDMonitor::prepare_update(MonOpRequestRef op)
1485 {
1486 op->mark_osdmon_event(__func__);
1487 Message *m = op->get_req();
1488 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
1489
1490 switch (m->get_type()) {
1491 // damp updates
1492 case MSG_OSD_MARK_ME_DOWN:
1493 return prepare_mark_me_down(op);
1494 case MSG_OSD_FULL:
1495 return prepare_full(op);
1496 case MSG_OSD_FAILURE:
1497 return prepare_failure(op);
1498 case MSG_OSD_BOOT:
1499 return prepare_boot(op);
1500 case MSG_OSD_ALIVE:
1501 return prepare_alive(op);
1502 case MSG_OSD_PG_CREATED:
1503 return prepare_pg_created(op);
1504 case MSG_OSD_PGTEMP:
1505 return prepare_pgtemp(op);
1506 case MSG_OSD_BEACON:
1507 return prepare_beacon(op);
1508
1509 case MSG_MON_COMMAND:
1510 return prepare_command(op);
1511
1512 case CEPH_MSG_POOLOP:
1513 return prepare_pool_op(op);
1514
1515 case MSG_REMOVE_SNAPS:
1516 return prepare_remove_snaps(op);
1517
1518
1519 default:
1520 ceph_abort();
1521 }
1522
1523 return false;
1524 }
1525
// Decide whether the staged pending_inc should be proposed now, and with
// what delay.  NOTE(review): despite the name this is not a pure predicate —
// it applies any batched osd weight adjustments and updates
// last_attempted_minwait_time as side effects.
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?  only once a weight has been gathered for every
  // osd slot; the batch is consumed (cleared) here.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  // propose as fast as possible if updating up_thru or pg_temp
  // want to merge OSDMap changes as much as possible
  if ((pending_inc.new_primary_temp.size() == 1
      || pending_inc.new_up_thru.size() == 1)
      && pending_inc.new_state.size() < 2) {
    dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;

    utime_t now = ceph_clock_now();
    // rate-limit the fast path: at most one min-wait proposal per
    // paxos_propose_interval, and not right on the heels of a commit
    if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
	&& now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
      delay = g_conf->paxos_min_wait;
      last_attempted_minwait_time = now;
      return true;
    }
  }

  // otherwise fall back to the generic pacing policy
  return PaxosService::should_propose(delay);
}
1562
1563
1564
1565 // ---------------------------
1566 // READs
1567
1568 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
1569 {
1570 op->mark_osdmon_event(__func__);
1571 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
1572 dout(10) << __func__ << " " << *m << dendl;
1573 MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
1574 epoch_t first = get_first_committed();
1575 epoch_t last = osdmap.get_epoch();
1576 int max = g_conf->osd_map_message_max;
1577 for (epoch_t e = MAX(first, m->get_full_first());
1578 e <= MIN(last, m->get_full_last()) && max > 0;
1579 ++e, --max) {
1580 int r = get_version_full(e, reply->maps[e]);
1581 assert(r >= 0);
1582 }
1583 for (epoch_t e = MAX(first, m->get_inc_first());
1584 e <= MIN(last, m->get_inc_last()) && max > 0;
1585 ++e, --max) {
1586 int r = get_version(e, reply->incremental_maps[e]);
1587 assert(r >= 0);
1588 }
1589 reply->oldest_map = first;
1590 reply->newest_map = last;
1591 mon->send_reply(op, reply);
1592 return true;
1593 }
1594
1595
1596 // ---------------------------
1597 // UPDATEs
1598
1599 // failure --
1600
1601 bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1602 // check permissions
1603 MonSession *session = m->get_session();
1604 if (!session)
1605 return true;
1606 if (!session->is_capable("osd", MON_CAP_X)) {
1607 dout(0) << "got MOSDFailure from entity with insufficient caps "
1608 << session->caps << dendl;
1609 return true;
1610 }
1611 if (fsid != mon->monmap->fsid) {
1612 dout(0) << "check_source: on fsid " << fsid
1613 << " != " << mon->monmap->fsid << dendl;
1614 return true;
1615 }
1616 return false;
1617 }
1618
1619
// Filter failure reports before they reach the paxos write path.
// Returns true when the report is fully handled (dropped / answered) here,
// false when prepare_failure() should process it.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, has a stale address, or is itself down while
      // claiming someone else failed; catch it up with newer maps instead
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_inst(badboy) != m->get_target()) {
    // report is about an older incarnation of this osd id
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
	    << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?  (target came back up after the reporter's epoch)
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  // genuinely new report; let prepare_failure() take it
  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  return true;
}
1679
1680 class C_AckMarkedDown : public C_MonOp {
1681 OSDMonitor *osdmon;
1682 public:
1683 C_AckMarkedDown(
1684 OSDMonitor *osdmon,
1685 MonOpRequestRef op)
1686 : C_MonOp(op), osdmon(osdmon) {}
1687
1688 void _finish(int) override {
1689 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1690 osdmon->mon->send_reply(
1691 op,
1692 new MOSDMarkMeDown(
1693 m->fsid,
1694 m->get_target(),
1695 m->get_epoch(),
1696 false)); // ACK itself does not request an ack
1697 }
1698 ~C_AckMarkedDown() override {
1699 }
1700 };
1701
// Filter an osd's request to mark itself down.  Returns false to pass the
// request on to prepare_mark_me_down(); true when it was rejected here —
// in which case we still send the ack (if requested) so the osd can shut
// down without waiting.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int requesting_down = m->get_target().name.num();
  int from = m->get_orig_source().num();

  // check permissions
  if (check_source(m, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must exist, be up, and match the address it claims
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addr(from) != m->get_target().addr) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(requesting_down))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
  return false;

 reply:
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
1740
1741 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
1742 {
1743 op->mark_osdmon_event(__func__);
1744 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1745 int target_osd = m->get_target().name.num();
1746
1747 assert(osdmap.is_up(target_osd));
1748 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
1749
1750 mon->clog->info() << "osd." << target_osd << " marked itself down";
1751 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1752 if (m->request_ack)
1753 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
1754 return true;
1755 }
1756
1757 bool OSDMonitor::can_mark_down(int i)
1758 {
1759 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
1760 dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
1761 << " down" << dendl;
1762 return false;
1763 }
1764
1765 if (osdmap.is_nodown(i)) {
1766 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
1767 << "will not mark it down" << dendl;
1768 return false;
1769 }
1770
1771 int num_osds = osdmap.get_num_osds();
1772 if (num_osds == 0) {
1773 dout(5) << __func__ << " no osds" << dendl;
1774 return false;
1775 }
1776 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
1777 float up_ratio = (float)up / (float)num_osds;
1778 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
1779 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
1780 << g_conf->mon_osd_min_up_ratio
1781 << ", will not mark osd." << i << " down" << dendl;
1782 return false;
1783 }
1784 return true;
1785 }
1786
1787 bool OSDMonitor::can_mark_up(int i)
1788 {
1789 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
1790 dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
1791 << " up" << dendl;
1792 return false;
1793 }
1794
1795 if (osdmap.is_noup(i)) {
1796 dout(5) << __func__ << " osd." << i << " is marked as noup, "
1797 << "will not mark it up" << dendl;
1798 return false;
1799 }
1800
1801 return true;
1802 }
1803
1804 /**
1805 * @note the parameter @p i apparently only exists here so we can output the
1806 * osd's id on messages.
1807 */
1808 bool OSDMonitor::can_mark_out(int i)
1809 {
1810 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
1811 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
1812 return false;
1813 }
1814
1815 if (osdmap.is_noout(i)) {
1816 dout(5) << __func__ << " osd." << i << " is marked as noout, "
1817 << "will not mark it out" << dendl;
1818 return false;
1819 }
1820
1821 int num_osds = osdmap.get_num_osds();
1822 if (num_osds == 0) {
1823 dout(5) << __func__ << " no osds" << dendl;
1824 return false;
1825 }
1826 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
1827 float in_ratio = (float)in / (float)num_osds;
1828 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
1829 if (i >= 0)
1830 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1831 << g_conf->mon_osd_min_in_ratio
1832 << ", will not mark osd." << i << " out" << dendl;
1833 else
1834 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1835 << g_conf->mon_osd_min_in_ratio
1836 << ", will not mark osds out" << dendl;
1837 return false;
1838 }
1839
1840 return true;
1841 }
1842
1843 bool OSDMonitor::can_mark_in(int i)
1844 {
1845 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
1846 dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
1847 << " in" << dendl;
1848 return false;
1849 }
1850
1851 if (osdmap.is_noin(i)) {
1852 dout(5) << __func__ << " osd." << i << " is marked as noin, "
1853 << "will not mark it in" << dendl;
1854 return false;
1855 }
1856
1857 return true;
1858 }
1859
1860 bool OSDMonitor::check_failures(utime_t now)
1861 {
1862 bool found_failure = false;
1863 for (map<int,failure_info_t>::iterator p = failure_info.begin();
1864 p != failure_info.end();
1865 ++p) {
1866 if (can_mark_down(p->first)) {
1867 found_failure |= check_failure(now, p->first, p->second);
1868 }
1869 }
1870 return found_failure;
1871 }
1872
// Decide whether the accumulated reports against target_osd justify
// marking it down now.  The heartbeat grace is optionally stretched by
// the historical "lagginess" of both the target and its reporters so
// that chronically slow osds are not flapped.  Returns true when the
// failure has been (or already was) staged in pending_inc.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // exponential decay constant so the laggy contribution halves every
    // mon_osd_laggy_halflife seconds
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
	     << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf->mon_osd_adjust_heartbeat_grace) {
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // average the reporters' laggy contribution rather than summing it
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
	   << dendl;

  // mark down only when the failure has persisted past the (stretched)
  // grace AND enough *distinct* failure domains have reported it
  if (failed_for >= grace &&
      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
	    << " down" << dendl;
    // new_state is an xor mask: CEPH_OSD_UP here flips the osd to down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
		      << osdmap.crush->get_full_location_ordered_string(
			target_osd)
		      << ") ("
		      << (int)reporters_by_subtree.size()
		      << " reporters from different "
		      << reporter_subtree_level << " after "
		      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
1960
1961 void OSDMonitor::force_failure(int target_osd, int by)
1962 {
1963 // already pending failure?
1964 if (pending_inc.new_state.count(target_osd) &&
1965 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
1966 dout(10) << " already pending failure" << dendl;
1967 return;
1968 }
1969
1970 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
1971 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1972
1973 mon->clog->info() << "osd." << target_osd << " failed ("
1974 << osdmap.crush->get_full_location_ordered_string(target_osd)
1975 << ") (connection refused reported by osd." << by << ")";
1976 return;
1977 }
1978
// Prepare (write) phase handler for MOSDFailure: either record a new
// failure report against the target osd, or cancel a previous report.
// Returns true iff a pending map change should be proposed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure " << m->get_target()
          << " from " << m->get_orig_source_inst()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  // the preprocess stage only lets through reports about an osd that is
  // still up and whose address matches; anything else here is a logic error.
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  if (m->if_osd_failed()) {
    // calculate failure time: the reporter saw the osd unresponsive for
    // m->failed_for seconds before this message arrived.
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: skip grace/reporter accounting entirely.
      mon->clog->debug() << m->get_target() << " reported immediately failed by "
            << m->get_orig_source_inst();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << m->get_target() << " reported failed by "
          << m->get_orig_source_inst();

    failure_info_t& fi = failure_info[target_osd];
    // if this reporter already had an outstanding report, retire the old
    // op so its sender is not left waiting for a reply.
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    // may queue a down-mark in pending_inc if grace / reporter-count
    // thresholds are now satisfied.
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << m->get_target() << " failure report canceled by "
          << m->get_orig_source_inst();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2041
2042 void OSDMonitor::process_failures()
2043 {
2044 map<int,failure_info_t>::iterator p = failure_info.begin();
2045 while (p != failure_info.end()) {
2046 if (osdmap.is_up(p->first)) {
2047 ++p;
2048 } else {
2049 dout(10) << "process_failures osd." << p->first << dendl;
2050 list<MonOpRequestRef> ls;
2051 p->second.take_report_messages(ls);
2052 failure_info.erase(p++);
2053
2054 while (!ls.empty()) {
2055 MonOpRequestRef o = ls.front();
2056 if (o) {
2057 o->mark_event(__func__);
2058 MOSDFailure *m = o->get_req<MOSDFailure>();
2059 send_latest(o, m->get_epoch());
2060 }
2061 ls.pop_front();
2062 }
2063 }
2064 }
2065 }
2066
2067 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2068 {
2069 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2070
2071 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2072 p != failure_info.end();
2073 ++p) {
2074 p->second.take_report_messages(ls);
2075 }
2076 failure_info.clear();
2077 }
2078
2079
2080 // boot --
2081
// Read-only phase for MOSDBoot: filter out boots we can answer or must
// drop without proposing a map change.  Returns true when the message is
// fully handled here (ignored or replied); false to fall through to
// prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // wrong cluster?
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // check if osd has required features to boot
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // release-gate checks: an osd may not join if the map already requires
  // a release feature the osd does not advertise.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because the osdmap requires"
                      << " CEPH_FEATURE_SERVER_LUMINOUS"
                      << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
      !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because the osdmap requires"
                      << " CEPH_FEATURE_SERVER_JEWEL"
                      << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
      !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because the osdmap requires"
                      << " CEPH_FEATURE_SERVER_KRAKEN"
                      << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES) &&
      !(m->osd_features & CEPH_FEATURE_OSD_RECOVERY_DELETES)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
    goto ignore;
  }

  // if any pool uses GMT hitsets, every osd (including this one) must
  // support CEPH_FEATURE_OSD_HITSET_GMT.
  if (any_of(osdmap.get_pools().begin(),
             osdmap.get_pools().end(),
             [](const std::pair<int64_t,pg_pool_t>& pool)
             { return pool.second.use_gmt_hitset; })) {
    assert(osdmap.get_num_up_osds() == 0 ||
           osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
              << m->get_orig_source_inst()
              << " doesn't announce support -- ignore" << dendl;
      goto ignore;
    }
  }

  // make sure upgrades stop at luminous
  if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    mon->clog->info() << "disallowing boot of post-luminous OSD "
                      << m->get_orig_source_inst()
                      << " because require_osd_release < luminous";
    goto ignore;
  }

  // make sure upgrades stop at jewel
  if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
      osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
    mon->clog->info() << "disallowing boot of post-jewel OSD "
                      << m->get_orig_source_inst()
                      << " because require_osd_release < jewel";
    goto ignore;
  }

  // make sure upgrades stop at hammer
  //  * HAMMER_0_94_4 is the required hammer feature
  //  * MON_METADATA is the first post-hammer feature
  if (osdmap.get_num_up_osds() > 0) {
    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
        !(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
      mon->clog->info() << "disallowing boot of post-hammer OSD "
                        << m->get_orig_source_inst()
                        << " because one or more up OSDs is pre-hammer v0.94.4";
      goto ignore;
    }
    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
        (osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
                        << m->get_orig_source_inst()
                        << " because all up OSDs are post-hammer";
      goto ignore;
    }
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst() &&
      osdmap.get_cluster_addr(from) == m->cluster_addr) {
    // yup.  duplicate boot: just reply with the latest maps.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
            << " == " << osdmap.get_inst(from) << dendl;
    _booted(op, false);
    return true;
  }

  // reject if this osd id already has a different (non-zero) fsid.
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up_from epoch: just
  // send it newer maps.
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2279
// Prepare (write) phase for MOSDBoot: queue the map changes that mark
// the osd up, record its addresses/metadata/xinfo, and defer the reply
// until the proposal commits.  Always returns true except when the osd
// id is out of range.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
          << " cluster_addr " << m->cluster_addr
          << " hb_back_addr " << m->hb_back_addr
          << " hb_front_addr " << m->hb_front_addr
          << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
            << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective old state = committed state XOR any pending state flips.
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down "
            << osdmap.get_inst(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
           osdmap.get_cluster_addr(from) != m->cluster_addr);
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot after the down-mark commits.
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
            << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
             << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?  (superblock has never seen a map)
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata: stage the osd's reported metadata for commit.
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end
               << ") -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] =
        pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics (exponentially weighted by
    // mon_osd_laggy_weight): boot_epoch == 0 means a clean restart, so
    // decay; otherwise fold in the observed down interval.
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
          xi.down_stamp.sec();
        // clamp to mon_osd_laggy_max_interval when configured (non-zero)
        if (g_conf->mon_osd_laggy_max_interval &&
            (interval > g_conf->mon_osd_laggy_max_interval)) {
          interval =  g_conf->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
          interval * g_conf->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?  restore old_weight if we auto-outed it earlier.
    if ((g_conf->mon_osd_auto_mark_auto_out_in &&
         (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        if (osdmap.osd_xinfo[from].old_weight > 0) {
          pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
          xi.old_weight = 0;
        } else {
          pending_inc.new_weight[from] = CEPH_OSD_IN;
        }
      } else {
        dout(7) << __func__ << " NOIN set, will not mark in "
                << m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait; reply with _booted() once the proposal commits.
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
2433
2434 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2435 {
2436 op->mark_osdmon_event(__func__);
2437 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2438 dout(7) << "_booted " << m->get_orig_source_inst()
2439 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2440
2441 if (logit) {
2442 mon->clog->info() << m->get_orig_source_inst() << " boot";
2443 }
2444
2445 send_latest(op, m->sb.current_epoch+1);
2446 }
2447
2448
2449 // -------------
2450 // full
2451
// Read-only phase for MOSDFull (osd reporting its full / backfillfull /
// nearfull state).  Returns true if handled here (ignored or replied),
// false to fall through to prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // only these fullness bits may be toggled via this message
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // reject unless the sender is the osd's current up instance (or, when
  // down, its most recent known instance).
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
      (osdmap.is_up(from) &&
       osdmap.get_inst(from) != m->get_orig_source_inst())) {
    dout(7) << __func__ << " ignoring full message from down "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // stringify the committed state for logging
  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // already in the requested state?  just reply with the current map.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
            << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
           << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2501
// Prepare (write) phase for MOSDFull: queue state-bit flips so the osd's
// fullness flags match what it reported, then reply once the proposal
// commits.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state XOR any bit-flips already
  // queued in the pending incremental, restricted to the fullness bits.
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any previously queued flips of these bits before recomputing
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // new_state holds bits to XOR into the committed state at commit time
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2539
2540 // -------------
2541 // alive
2542
// Read-only phase for MOSDAlive (osd requesting an up_thru bump).
// Returns true if handled here (ignored or already satisfied), false to
// fall through to prepare_alive().
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // only accept from the osd's current up instance
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.  committed up_thru already satisfies the request; just reply.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
           << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2579
2580 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2581 {
2582 op->mark_osdmon_event(__func__);
2583 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2584 int from = m->get_orig_source().num();
2585
2586 if (0) { // we probably don't care much about these
2587 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2588 }
2589
2590 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2591 << " from " << m->get_orig_source_inst() << dendl;
2592
2593 update_up_thru(from, m->version); // set to the latest map the OSD has
2594 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2595 return true;
2596 }
2597
2598 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2599 {
2600 op->mark_osdmon_event(__func__);
2601 dout(7) << "_reply_map " << e
2602 << " from " << op->get_req()->get_orig_source_inst()
2603 << dendl;
2604 send_latest(op, e);
2605 }
2606
2607 // pg_created
2608 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
2609 {
2610 op->mark_osdmon_event(__func__);
2611 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2612 dout(10) << __func__ << " " << *m << dendl;
2613 auto session = m->get_session();
2614 if (!session) {
2615 dout(10) << __func__ << ": no monitor session!" << dendl;
2616 return true;
2617 }
2618 if (!session->is_capable("osd", MON_CAP_X)) {
2619 derr << __func__ << " received from entity "
2620 << "with insufficient privileges " << session->caps << dendl;
2621 return true;
2622 }
2623 // always forward the "created!" to the leader
2624 return false;
2625 }
2626
2627 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
2628 {
2629 op->mark_osdmon_event(__func__);
2630 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2631 dout(10) << __func__ << " " << *m << dendl;
2632 auto src = m->get_orig_source();
2633 auto from = src.num();
2634 if (!src.is_osd() ||
2635 !mon->osdmon()->osdmap.is_up(from) ||
2636 m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
2637 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
2638 return false;
2639 }
2640 pending_created_pgs.push_back(m->pgid);
2641 return true;
2642 }
2643
2644 // -------------
2645 // pg_temp changes
2646
// Read-only phase for MOSDPGTemp.  Returns false (fall through to
// prepare_pgtemp) when at least one entry would change the map; true
// when the message is fully handled here (ignored, or a no-op answered
// with the current map).
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // only the osd's current up instance may propose pg_temp mappings
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?  (empty mapping clears pg_temp/primary_temp)
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    //  NOTE: we assume that this will clear pg_primary, so consider
    //        an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every entry is already in effect: reply with the current map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
2733
2734 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
2735 {
2736 epoch_t old_up_thru = osdmap.get_up_thru(from);
2737 auto ut = pending_inc.new_up_thru.find(from);
2738 if (ut != pending_inc.new_up_thru.end()) {
2739 old_up_thru = ut->second;
2740 }
2741 if (up_thru > old_up_thru) {
2742 // set up_thru too, so the osd doesn't have to ask again
2743 pending_inc.new_up_thru[from] = up_thru;
2744 }
2745 }
2746
// Prepare (write) phase for MOSDPGTemp: queue the requested pg_temp
// mappings (skipping pools that are gone or being removed), clear any
// primary_temp for those pgs, bump up_thru, and reply after commit.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // pool queued for deletion in this round: skip
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool pending removal" << dendl;
      continue;
    }
    // pool already gone from the committed map: skip
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
        pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
2782
2783
2784 // ---
2785
// Read-only phase for MRemoveSnaps.  Returns false (fall through to
// prepare) if any listed snap still needs to be recorded as removed;
// true when the whole message can be ignored.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        g_ceph_context,
        CEPH_ENTITY_TYPE_MON,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      // a snap beyond snap_seq or not yet in removed_snaps requires a
      // map update -> go to the prepare phase
      if (*p > pi->get_snap_seq() ||
          !pi->removed_snaps.contains(*p))
        return false;
    }
  }

 ignore:
  return true;
}
2826
// Prepare (write) phase for MRemoveSnaps: fold each not-yet-removed snap
// into the pending copy of its pool (removed_snaps interval set,
// snap_seq, snap_epoch).  Always returns true so the update is proposed.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      // skip snaps already removed in the committed pool or already
      // queued in the pending pool copy
      if (!pi.removed_snaps.contains(*q) &&
          (!pending_inc.new_pools.count(p->first) ||
           !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
        // get_new_pool() returns the pending copy, creating it from the
        // committed pool the first time around
        pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
        newpi->removed_snaps.insert(*q);
        dout(10) << " pool " << p->first << " removed_snaps added " << *q
                 << " (now " << newpi->removed_snaps << ")" << dendl;
        // advance snap_seq if the removed snap is past it
        if (*q > newpi->get_snap_seq()) {
          dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
          newpi->set_snap_seq(*q);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }
  return true;
}
2863
2864 // osd beacon
2865 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
2866 {
2867 op->mark_osdmon_event(__func__);
2868 auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2869 // check caps
2870 auto session = beacon->get_session();
2871 if (!session) {
2872 dout(10) << __func__ << " no monitor session!" << dendl;
2873 return true;
2874 }
2875 if (!session->is_capable("osd", MON_CAP_X)) {
2876 derr << __func__ << " received from entity "
2877 << "with insufficient privileges " << session->caps << dendl;
2878 return true;
2879 }
2880 // Always forward the beacon to the leader, even if they are the same as
2881 // the old one. The leader will mark as down osds that haven't sent
2882 // beacon for a few minutes.
2883 return false;
2884 }
2885
2886 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
2887 {
2888 op->mark_osdmon_event(__func__);
2889 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2890 const auto src = beacon->get_orig_source();
2891 dout(10) << __func__ << " " << *beacon
2892 << " from " << src << dendl;
2893 int from = src.num();
2894
2895 if (!src.is_osd() ||
2896 !osdmap.is_up(from) ||
2897 beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
2898 dout(1) << " ignoring beacon from non-active osd." << dendl;
2899 return false;
2900 }
2901
2902 last_osd_report[from] = ceph_clock_now();
2903 osd_epochs[from] = beacon->version;
2904
2905 for (const auto& pg : beacon->pgs) {
2906 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
2907 }
2908 return false;
2909 }
2910
2911 // ---------------
2912 // map helpers
2913
2914 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
2915 {
2916 op->mark_osdmon_event(__func__);
2917 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
2918 << " start " << start << dendl;
2919 if (start == 0)
2920 send_full(op);
2921 else
2922 send_incremental(op, start);
2923 }
2924
2925
2926 MOSDMap *OSDMonitor::build_latest_full()
2927 {
2928 MOSDMap *r = new MOSDMap(mon->monmap->fsid);
2929 get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
2930 r->oldest_map = get_first_committed();
2931 r->newest_map = osdmap.get_epoch();
2932 return r;
2933 }
2934
2935 MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
2936 {
2937 dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
2938 MOSDMap *m = new MOSDMap(mon->monmap->fsid);
2939 m->oldest_map = get_first_committed();
2940 m->newest_map = osdmap.get_epoch();
2941
2942 for (epoch_t e = to; e >= from && e > 0; e--) {
2943 bufferlist bl;
2944 int err = get_version(e, bl);
2945 if (err == 0) {
2946 assert(bl.length());
2947 // if (get_version(e, bl) > 0) {
2948 dout(20) << "build_incremental inc " << e << " "
2949 << bl.length() << " bytes" << dendl;
2950 m->incremental_maps[e] = bl;
2951 } else {
2952 assert(err == -ENOENT);
2953 assert(!bl.length());
2954 get_version_full(e, bl);
2955 if (bl.length() > 0) {
2956 //else if (get_version("full", e, bl) > 0) {
2957 dout(20) << "build_incremental full " << e << " "
2958 << bl.length() << " bytes" << dendl;
2959 m->maps[e] = bl;
2960 } else {
2961 ceph_abort(); // we should have all maps.
2962 }
2963 }
2964 }
2965 return m;
2966 }
2967
2968 void OSDMonitor::send_full(MonOpRequestRef op)
2969 {
2970 op->mark_osdmon_event(__func__);
2971 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
2972 mon->send_reply(op, build_latest_full());
2973 }
2974
2975 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
2976 {
2977 op->mark_osdmon_event(__func__);
2978
2979 MonSession *s = op->get_session();
2980 assert(s);
2981
2982 if (s->proxy_con &&
2983 s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
2984 // oh, we can tell the other mon to do it
2985 dout(10) << __func__ << " asking proxying mon to send_incremental from "
2986 << first << dendl;
2987 MRoute *r = new MRoute(s->proxy_tid, NULL);
2988 r->send_osdmap_first = first;
2989 s->proxy_con->send_message(r);
2990 op->mark_event("reply: send routed send_osdmap_first reply");
2991 } else {
2992 // do it ourselves
2993 send_incremental(first, s, false, op);
2994 }
2995 }
2996
// Send osdmap epochs [first .. osdmap.get_epoch()] to a session.
//
// @param first    first epoch the target needs; bumped forward if the
//                 session already has newer maps or the store was trimmed
// @param session  target session; its osd_epoch is advanced as maps go out
// @param onetime  if true (and no req), send at most one batch
// @param req      if set, reply exactly once through the routed request
//                 instead of pushing messages on the session connection
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->inst << dendl;

  // Never resend epochs the session is already known to have.
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->inst << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // The requested epochs were trimmed from the store; restart the peer
    // from a full map at the oldest committed epoch.
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;

    if (req) {
      // Routed requests get exactly one reply; we're done here.
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // Batch at most osd_map_message_max epochs per message.
    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
		       osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // Stop after one batch for routed requests and onetime subscriptions.
    if (onetime || req)
      break;
  }
}
3055
3056 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
3057 {
3058 if (inc_osd_cache.lookup(ver, &bl)) {
3059 return 0;
3060 }
3061 int ret = PaxosService::get_version(ver, bl);
3062 if (!ret) {
3063 inc_osd_cache.add(ver, bl);
3064 }
3065 return ret;
3066 }
3067
3068 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3069 {
3070 if (full_osd_cache.lookup(ver, &bl)) {
3071 return 0;
3072 }
3073 int ret = PaxosService::get_version_full(ver, bl);
3074 if (!ret) {
3075 full_osd_cache.add(ver, bl);
3076 }
3077 return ret;
3078 }
3079
3080 epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
3081 {
3082 dout(10) << "blacklist " << a << " until " << until << dendl;
3083 pending_inc.new_blacklist[a] = until;
3084 return pending_inc.epoch;
3085 }
3086
3087
// Push osdmap updates to every "osdmap" subscriber that is behind the
// current epoch.
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    // No committed map yet; nothing to send.
    return;
  }
  auto osdmap_subs = mon->session_map.subs.find("osdmap");
  if (osdmap_subs == mon->session_map.subs.end()) {
    return;
  }
  // Advance the iterator before handling each sub: check_osdmap_sub()
  // removes onetime subscriptions from the session map, which would
  // invalidate an iterator still pointing at them.
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    ++p;
    check_osdmap_sub(sub);
  }
}
3105
3106 void OSDMonitor::check_osdmap_sub(Subscription *sub)
3107 {
3108 dout(10) << __func__ << " " << sub << " next " << sub->next
3109 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
3110 if (sub->next <= osdmap.get_epoch()) {
3111 if (sub->next >= 1)
3112 send_incremental(sub->next, sub->session, sub->incremental_onetime);
3113 else
3114 sub->session->con->send_message(build_latest_full());
3115 if (sub->onetime)
3116 mon->session_map.remove_sub(sub);
3117 else
3118 sub->next = osdmap.get_epoch() + 1;
3119 }
3120 }
3121
3122 void OSDMonitor::check_pg_creates_subs()
3123 {
3124 if (!mon->monmap->get_required_features().contains_all(
3125 ceph::features::mon::FEATURE_LUMINOUS)) {
3126 // PGMonitor takes care of this in pre-luminous era.
3127 return;
3128 }
3129 if (!osdmap.get_num_up_osds()) {
3130 return;
3131 }
3132 assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
3133 mon->with_session_map([this](const MonSessionMap& session_map) {
3134 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
3135 if (pg_creates_subs == session_map.subs.end()) {
3136 return;
3137 }
3138 for (auto sub : *pg_creates_subs->second) {
3139 check_pg_creates_sub(sub);
3140 }
3141 });
3142 }
3143
3144 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
3145 {
3146 dout(20) << __func__ << " .. " << sub->session->inst << dendl;
3147 assert(sub->type == "osd_pg_creates");
3148 // only send these if the OSD is up. we will check_subs() when they do
3149 // come up so they will get the creates then.
3150 if (sub->session->inst.name.is_osd() &&
3151 mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
3152 sub->next = send_pg_creates(sub->session->inst.name.num(),
3153 sub->session->con.get(),
3154 sub->next);
3155 }
3156 }
3157
3158 void OSDMonitor::do_application_enable(int64_t pool_id,
3159 const std::string &app_name)
3160 {
3161 assert(paxos->is_plugged() && is_writeable());
3162
3163 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
3164 << dendl;
3165
3166 assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS ||
3167 pending_inc.new_require_osd_release >= CEPH_RELEASE_LUMINOUS);
3168
3169 auto pp = osdmap.get_pg_pool(pool_id);
3170 assert(pp != nullptr);
3171
3172 pg_pool_t p = *pp;
3173 if (pending_inc.new_pools.count(pool_id)) {
3174 p = pending_inc.new_pools[pool_id];
3175 }
3176
3177 p.application_metadata.insert({app_name, {}});
3178 p.last_change = pending_inc.epoch;
3179 pending_inc.new_pools[pool_id] = p;
3180 }
3181
3182 unsigned OSDMonitor::scan_for_creating_pgs(
3183 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
3184 const mempool::osdmap::set<int64_t>& removed_pools,
3185 utime_t modified,
3186 creating_pgs_t* creating_pgs) const
3187 {
3188 unsigned queued = 0;
3189 for (auto& p : pools) {
3190 int64_t poolid = p.first;
3191 const pg_pool_t& pool = p.second;
3192 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
3193 pool.get_type(), pool.get_size());
3194 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
3195 continue;
3196
3197 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
3198 const auto created = pool.get_last_change();
3199 if (last_scan_epoch && created <= last_scan_epoch) {
3200 dout(10) << __func__ << " no change in pool " << poolid
3201 << " " << pool << dendl;
3202 continue;
3203 }
3204 if (removed_pools.count(poolid)) {
3205 dout(10) << __func__ << " pool is being removed: " << poolid
3206 << " " << pool << dendl;
3207 continue;
3208 }
3209 dout(10) << __func__ << " queueing pool create for " << poolid
3210 << " " << pool << dendl;
3211 if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
3212 created, modified)) {
3213 queued++;
3214 }
3215 }
3216 return queued;
3217 }
3218
// Rebuild creating_pgs_by_osd_epoch from creating_pgs: for every pg still
// being created, determine which osd (its acting primary) should receive
// the create message and from which epoch.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    auto mapped = pg.second.first;  // default: the pg's recorded epoch
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(pgid)) {
	  if (last_acting_primary == acting_primary) {
	    // Same target as before: keep the previously recorded epoch.
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3260
// Send pending pg-create messages destined for 'osd', starting at epoch
// 'next'.  Returns the epoch the subscription is now current through
// (last sent epoch + 1), or 'next' unchanged if nothing was sent.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *m = nullptr;
  epoch_t last = 0;
  // Collect every pending create at epoch >= next into a single message.
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Lazily allocate the message only once there is something to send.
      if (!m)
	m = new MOSDPGCreate(creating_pgs_epoch);
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg);
      assert(create != creating_pgs.pgs.end());
      m->mkpg.emplace(pg, pg_create_t{create->second.first, pg, 0});
      m->ctimes.emplace(pg, create->second.second);
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  if (!m) {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }
  con->send_message(m);
  // sub is current through last + 1
  return last + 1;
}
3308
3309 // TICK
3310
3311
// Periodic leader-side osdmap maintenance: handle beacon timeouts, mark
// long-down osds out, expire blacklist entries, maintain pre-luminous
// full/nearfull flags, and propose the pending map if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // Everything below mutates pending state; only the leader does that.
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    // Luminous: mark down osds whose beacons stopped arriving.
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }
  if (!osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) &&
      osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->mgrstatmon()->is_readable() &&
      mon->mgrstatmon()->definitely_converted_snapsets()) {
    dout(1) << __func__ << " all snapsets converted, setting purged_snapdirs"
	    << dendl;
    add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS);
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (g_conf->mon_osd_down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
		       << " subtree for osd." << o << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// Two independent auto-out paths: a live osd past its grace, or a
	// destroyed osd past the destroyed-out interval.
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
			    << int(down.sec()) << " seconds)";
	} else
	  continue;  // still within grace; keep the timer running
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
      mon->pgservice->is_readable()) {
    // for pre-luminous compat only!
    if (mon->pgservice->have_full_osds()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (mon->pgservice->have_nearfull_osds()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    // Propose only when the FULL/NEARFULL bits actually changed.
    if (pending_inc.new_flags != -1 &&
	(pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
	(pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
	(pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
	      << " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3478
// Mark down any up osd that has not reported (no beacon recorded in
// last_osd_report) within mon_osd_report_timeout seconds.  Returns true
// if any osd was marked down, so the caller proposes the pending map.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;  // already down; nothing to time out
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago.  marking down" << dendl;
	// NOTE(review): CEPH_OSD_UP is written here while the surrounding
	// logs say "marking down" -- new_state presumably acts as a
	// state-flip (xor) mask; confirm against OSDMap::Incremental.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
3517
// Append osd-related health warnings/errors to 'summary' (and, when
// non-null, more verbose entries to 'detail'): down osds and down crush
// subtrees, stray crush-only ids, warning flags, legacy crush tunables,
// cache pools without hit_sets, missing upgrade flags, and full pools.
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
			    list<pair<health_status_t,string> > *detail,
			    CephContext *cct) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    set<int> osds;              // ids present in crush but not in the osdmap
    set<int> down_in_osds;      // in osds that are down
    set<int> up_in_osds;        // in osds known up (memoized by subtree scan)
    set<int> subtree_up;        // crush buckets known to contain an up osd
    unordered_map<int, set<int> > subtree_type_down;  // type -> down buckets
    unordered_map<int, int> num_osds_subtree;         // bucket -> osd count
    int max_type = osdmap.crush->get_max_type_id();

    // Scan every osd slot: collect crush-only ids, count "in" osds, and
    // for each down+in osd walk up the crush tree recording any ancestor
    // bucket that is entirely down.
    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i)) {
        if (osdmap.crush->item_exists(i)) {
          osds.insert(i);
        }
	continue;
      }
      if (osdmap.is_out(i))
        continue;
      ++num_in_osds;
      if (down_in_osds.count(i) || up_in_osds.count(i))
	continue;
      if (!osdmap.is_up(i)) {
	down_in_osds.insert(i);
	int parent_id = 0;
	int current = i;
	for (int type = 0; type <= max_type; type++) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
	  if (r == -ENOENT)
	    break;
	  // break early if this parent is already marked as up
	  if (subtree_up.count(parent_id))
	    break;
	  type = osdmap.crush->get_bucket_type(parent_id);
	  if (!osdmap.subtree_type_is_down(
		g_ceph_context, parent_id, type,
		&down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
	    break;
	  current = parent_id;
	}
      }
    }

    // calculate the number of down osds in each down subtree and
    // store it in num_osds_subtree
    for (int type = 1; type <= max_type; type++) {
      if (!osdmap.crush->get_type_name(type))
	continue;
      for (auto j = subtree_type_down[type].begin();
	   j != subtree_type_down[type].end();
	   ++j) {
	if (type == 1) {
	  // type 1 buckets hold osds directly; count the children.
          list<int> children;
          int num = osdmap.crush->get_children(*j, &children);
          num_osds_subtree[*j] = num;
        } else {
	  // higher buckets: sum the already-computed child counts.
          list<int> children;
          int num = 0;
          int num_children = osdmap.crush->get_children(*j, &children);
          if (num_children == 0)
	    continue;
          for (auto l = children.begin(); l != children.end(); ++l) {
            if (num_osds_subtree[*l] > 0) {
              num = num + num_osds_subtree[*l];
            }
          }
          num_osds_subtree[*j] = num;
        }
      }
    }
    num_down_in_osds = down_in_osds.size();
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      // summary of down subtree types and osds
      for (int type = max_type; type > 0; type--) {
	if (!osdmap.crush->get_type_name(type))
	  continue;
	if (subtree_type_down[type].size() > 0) {
	  ostringstream ss;
	  ss << subtree_type_down[type].size() << " "
	     << osdmap.crush->get_type_name(type);
	  if (subtree_type_down[type].size() > 1) {
	    ss << "s";  // pluralize the type name
	  }
	  int sum_down_osds = 0;
	  for (auto j = subtree_type_down[type].begin();
	       j != subtree_type_down[type].end();
	       ++j) {
	    sum_down_osds = sum_down_osds + num_osds_subtree[*j];
	  }
          ss << " (" << sum_down_osds << " osds) down";
	  summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
      ostringstream ss;
      ss << down_in_osds.size() << " osds down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));

      if (detail) {
	// details of down subtree types
	for (int type = max_type; type > 0; type--) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  for (auto j = subtree_type_down[type].rbegin();
	       j != subtree_type_down[type].rend();
	       ++j) {
	    ostringstream ss;
	    ss << osdmap.crush->get_type_name(type);
	    ss << " ";
	    ss << osdmap.crush->get_item_name(*j);
	    // at the top level, do not print location
	    if (type != max_type) {
	      ss << " (";
	      ss << osdmap.crush->get_full_location_ordered_string(*j);
	      ss << ")";
	    }
	    int num = num_osds_subtree[*j];
	    ss << " (" << num << " osds)";
	    ss << " is down";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
	// details of down osds
	for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
	  ostringstream ss;
	  ss << "osd." << *it << " (";
	  ss << osdmap.crush->get_full_location_ordered_string(*it);
	  ss << ") is down";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // ids found in crush but absent from the osdmap
    if (!osds.empty()) {
      ostringstream ss;
      ss << osds.size() << " osds exist in the crush map but not in the osdmap";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
        ss << " (osds: " << osds << ")";
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // note: we leave it to ceph-mgr to generate details health warnings
    // with actual osd utilizations

    // warn about flags
    uint64_t warn_flags =
      CEPH_OSDMAP_FULL |
      CEPH_OSDMAP_PAUSERD |
      CEPH_OSDMAP_PAUSEWR |
      CEPH_OSDMAP_PAUSEREC |
      CEPH_OSDMAP_NOUP |
      CEPH_OSDMAP_NODOWN |
      CEPH_OSDMAP_NOIN |
      CEPH_OSDMAP_NOOUT |
      CEPH_OSDMAP_NOBACKFILL |
      CEPH_OSDMAP_NORECOVER |
      CEPH_OSDMAP_NOSCRUB |
      CEPH_OSDMAP_NODEEP_SCRUB |
      CEPH_OSDMAP_NOTIERAGENT |
      CEPH_OSDMAP_NOREBALANCE;
    if (osdmap.test_flag(warn_flags)) {
      ostringstream ss;
      ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
	 << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      string min = osdmap.crush->get_min_required_version();
      if (min < g_conf->mon_crush_min_required_version) {
	ostringstream ss;
	ss << "crush map has legacy tunables (require " << min
	   << ", min is " << g_conf->mon_crush_min_required_version << ")";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
      if (osdmap.crush->get_straw_calc_version() == 0) {
	ostringstream ss;
	ss << "crush map has straw_calc_version=0";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // hit_set-less cache_mode?
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
	   p != osdmap.pools.end();
	   ++p) {
	const pg_pool_t& info = p->second;
	if (info.cache_mode_requires_hit_set() &&
	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
	  ++problem_cache_pools;
	  if (detail) {
	    ostringstream ss;
	    ss << "pool '" << osdmap.get_pool_name(p->first)
	       << "' with cache_mode " << info.get_cache_mode_name()
	       << " needs hit_set_type to be set but it is not";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
      }
      if (problem_cache_pools) {
	ostringstream ss;
	ss << problem_cache_pools << " cache pools are missing hit_sets";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    if (osdmap.crush->has_multirule_rulesets()) {
      ostringstream ss;
      ss << "CRUSH map contains multirule rulesets";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << "; please manually fix the map";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // Not using 'sortbitwise' and should be?
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
	(osdmap.get_up_osd_features() &
	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
      ostringstream ss;
      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // Warn if 'mon_osd_down_out_interval' is set to zero.
    // Having this option set to zero on the leader acts much like the
    // 'noout' flag. It's hard to figure out what's going wrong with clusters
    // without the 'noout' flag set but acting like that just the same, so
    // we report a HEALTH_WARN in case this option is set to zero.
    // This is an ugly hack to get the warning out, but until we find a way
    // to spread global options throughout the mon cluster and have all mons
    // using a base set of the same options, we need to work around this sort
    // of things.
    // There's also the obvious drawback that if this is set on a single
    // monitor on a 3-monitor cluster, this warning will only be shown every
    // third monitor connection.
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
        g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
        ss << "; this has the same effect as the 'noout' flag";
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // warn about upgrade flags that can be set but are not.
    if (g_conf->mon_debug_no_require_luminous) {
      // ignore these checks
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
	       osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      string msg = "all OSDs are running luminous or later but"
	" require_osd_release < luminous";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
	       osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
      string msg = "all OSDs are running kraken or later but"
	" require_osd_release < kraken";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
	       osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
      string msg = "all OSDs are running jewel or later but"
	" require_osd_release < jewel";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    }

    // per-pool full warnings
    for (auto it : osdmap.get_pools()) {
      const pg_pool_t &pool = it.second;
      if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
	const string& pool_name = osdmap.get_pool_name(it.first);
	stringstream ss;
	ss << "pool '" << pool_name << "' is full";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail)
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }
  }
}
3836
3837 void OSDMonitor::dump_info(Formatter *f)
3838 {
3839 f->open_object_section("osdmap");
3840 osdmap.dump(f);
3841 f->close_section();
3842
3843 f->open_array_section("osd_metadata");
3844 for (int i=0; i<osdmap.get_max_osd(); ++i) {
3845 if (osdmap.exists(i)) {
3846 f->open_object_section("osd");
3847 f->dump_unsigned("id", i);
3848 dump_osd_metadata(i, f, NULL);
3849 f->close_section();
3850 }
3851 }
3852 f->close_section();
3853
3854 f->dump_unsigned("osdmap_first_committed", get_first_committed());
3855 f->dump_unsigned("osdmap_last_committed", get_last_committed());
3856
3857 f->open_object_section("crushmap");
3858 osdmap.crush->dump(f);
3859 f->close_section();
3860 }
3861
namespace {
  // Keys accepted by "osd pool get <pool> <var>".  The enumerator order is
  // load-bearing only in that other code maps names to these values; keep
  // the list in sync with the ALL_CHOICES table in preprocess_command().
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Set difference: return the members of `first` that do not also appear
  // in `second`.  Neither input is modified.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining(first);
    for (const auto& choice : second)
      remaining.erase(choice);
    return remaining;
  }
}
3893
3894
3895 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
3896 {
3897 op->mark_osdmon_event(__func__);
3898 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
3899 int r = 0;
3900 bufferlist rdata;
3901 stringstream ss, ds;
3902
3903 map<string, cmd_vartype> cmdmap;
3904 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
3905 string rs = ss.str();
3906 mon->reply_command(op, -EINVAL, rs, get_last_committed());
3907 return true;
3908 }
3909
3910 MonSession *session = m->get_session();
3911 if (!session) {
3912 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
3913 return true;
3914 }
3915
3916 string prefix;
3917 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
3918
3919 string format;
3920 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
3921 boost::scoped_ptr<Formatter> f(Formatter::create(format));
3922
3923 if (prefix == "osd stat") {
3924 osdmap.print_summary(f.get(), ds, "");
3925 if (f)
3926 f->flush(rdata);
3927 else
3928 rdata.append(ds);
3929 }
3930 else if (prefix == "osd perf" ||
3931 prefix == "osd blocked-by") {
3932 r = mon->pgservice->process_pg_command(prefix, cmdmap,
3933 osdmap, f.get(), &ss, &rdata);
3934 }
3935 else if (prefix == "osd dump" ||
3936 prefix == "osd tree" ||
3937 prefix == "osd ls" ||
3938 prefix == "osd getmap" ||
3939 prefix == "osd getcrushmap" ||
3940 prefix == "osd ls-tree") {
3941 string val;
3942
3943 epoch_t epoch = 0;
3944 int64_t epochnum;
3945 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
3946 epoch = epochnum;
3947
3948 bufferlist osdmap_bl;
3949 int err = get_version_full(epoch, osdmap_bl);
3950 if (err == -ENOENT) {
3951 r = -ENOENT;
3952 ss << "there is no map for epoch " << epoch;
3953 goto reply;
3954 }
3955 assert(err == 0);
3956 assert(osdmap_bl.length());
3957
3958 OSDMap *p;
3959 if (epoch == osdmap.get_epoch()) {
3960 p = &osdmap;
3961 } else {
3962 p = new OSDMap;
3963 p->decode(osdmap_bl);
3964 }
3965
3966 auto sg = make_scope_guard([&] {
3967 if (p != &osdmap) {
3968 delete p;
3969 }
3970 });
3971
3972 if (prefix == "osd dump") {
3973 stringstream ds;
3974 if (f) {
3975 f->open_object_section("osdmap");
3976 p->dump(f.get());
3977 f->close_section();
3978 f->flush(ds);
3979 } else {
3980 p->print(ds);
3981 }
3982 rdata.append(ds);
3983 if (!f)
3984 ds << " ";
3985 } else if (prefix == "osd ls") {
3986 if (f) {
3987 f->open_array_section("osds");
3988 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3989 if (osdmap.exists(i)) {
3990 f->dump_int("osd", i);
3991 }
3992 }
3993 f->close_section();
3994 f->flush(ds);
3995 } else {
3996 bool first = true;
3997 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3998 if (osdmap.exists(i)) {
3999 if (!first)
4000 ds << "\n";
4001 first = false;
4002 ds << i;
4003 }
4004 }
4005 }
4006 rdata.append(ds);
4007 } else if (prefix == "osd tree") {
4008 vector<string> states;
4009 cmd_getval(g_ceph_context, cmdmap, "states", states);
4010 unsigned filter = 0;
4011 for (auto& s : states) {
4012 if (s == "up") {
4013 filter |= OSDMap::DUMP_UP;
4014 } else if (s == "down") {
4015 filter |= OSDMap::DUMP_DOWN;
4016 } else if (s == "in") {
4017 filter |= OSDMap::DUMP_IN;
4018 } else if (s == "out") {
4019 filter |= OSDMap::DUMP_OUT;
4020 } else if (s == "destroyed") {
4021 filter |= OSDMap::DUMP_DESTROYED;
4022 } else {
4023 ss << "unrecognized state '" << s << "'";
4024 r = -EINVAL;
4025 goto reply;
4026 }
4027 }
4028 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
4029 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4030 ss << "cannot specify both 'in' and 'out'";
4031 r = -EINVAL;
4032 goto reply;
4033 }
4034 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4035 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4036 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4037 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4038 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4039 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4040 ss << "can specify only one of 'up', 'down' and 'destroyed'";
4041 r = -EINVAL;
4042 goto reply;
4043 }
4044 if (f) {
4045 f->open_object_section("tree");
4046 p->print_tree(f.get(), NULL, filter);
4047 f->close_section();
4048 f->flush(ds);
4049 } else {
4050 p->print_tree(NULL, &ds, filter);
4051 }
4052 rdata.append(ds);
4053 } else if (prefix == "osd getmap") {
4054 rdata.append(osdmap_bl);
4055 ss << "got osdmap epoch " << p->get_epoch();
4056 } else if (prefix == "osd getcrushmap") {
4057 p->crush->encode(rdata, mon->get_quorum_con_features());
4058 ss << p->get_crush_version();
4059 } else if (prefix == "osd ls-tree") {
4060 string bucket_name;
4061 cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
4062 set<int> osds;
4063 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4064 if (r == -ENOENT) {
4065 ss << "\"" << bucket_name << "\" does not exist";
4066 goto reply;
4067 } else if (r < 0) {
4068 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4069 goto reply;
4070 }
4071
4072 if (f) {
4073 f->open_array_section("osds");
4074 for (auto &i : osds) {
4075 if (osdmap.exists(i)) {
4076 f->dump_int("osd", i);
4077 }
4078 }
4079 f->close_section();
4080 f->flush(ds);
4081 } else {
4082 bool first = true;
4083 for (auto &i : osds) {
4084 if (osdmap.exists(i)) {
4085 if (!first)
4086 ds << "\n";
4087 first = false;
4088 ds << i;
4089 }
4090 }
4091 }
4092
4093 rdata.append(ds);
4094 }
4095 } else if (prefix == "osd df") {
4096 string method;
4097 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
4098 print_osd_utilization(osdmap, mon->pgservice, ds,
4099 f.get(), method == "tree");
4100 rdata.append(ds);
4101 } else if (prefix == "osd getmaxosd") {
4102 if (f) {
4103 f->open_object_section("getmaxosd");
4104 f->dump_unsigned("epoch", osdmap.get_epoch());
4105 f->dump_int("max_osd", osdmap.get_max_osd());
4106 f->close_section();
4107 f->flush(rdata);
4108 } else {
4109 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4110 rdata.append(ds);
4111 }
4112 } else if (prefix == "osd utilization") {
4113 string out;
4114 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4115 if (f)
4116 f->flush(rdata);
4117 else
4118 rdata.append(out);
4119 r = 0;
4120 goto reply;
4121 } else if (prefix == "osd find") {
4122 int64_t osd;
4123 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4124 ss << "unable to parse osd id value '"
4125 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4126 r = -EINVAL;
4127 goto reply;
4128 }
4129 if (!osdmap.exists(osd)) {
4130 ss << "osd." << osd << " does not exist";
4131 r = -ENOENT;
4132 goto reply;
4133 }
4134 string format;
4135 cmd_getval(g_ceph_context, cmdmap, "format", format);
4136 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4137 f->open_object_section("osd_location");
4138 f->dump_int("osd", osd);
4139 f->dump_stream("ip") << osdmap.get_addr(osd);
4140 f->open_object_section("crush_location");
4141 map<string,string> loc = osdmap.crush->get_full_location(osd);
4142 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4143 f->dump_string(p->first.c_str(), p->second);
4144 f->close_section();
4145 f->close_section();
4146 f->flush(rdata);
4147 } else if (prefix == "osd metadata") {
4148 int64_t osd = -1;
4149 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
4150 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4151 ss << "unable to parse osd id value '"
4152 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4153 r = -EINVAL;
4154 goto reply;
4155 }
4156 if (osd >= 0 && !osdmap.exists(osd)) {
4157 ss << "osd." << osd << " does not exist";
4158 r = -ENOENT;
4159 goto reply;
4160 }
4161 string format;
4162 cmd_getval(g_ceph_context, cmdmap, "format", format);
4163 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4164 if (osd >= 0) {
4165 f->open_object_section("osd_metadata");
4166 f->dump_unsigned("id", osd);
4167 r = dump_osd_metadata(osd, f.get(), &ss);
4168 if (r < 0)
4169 goto reply;
4170 f->close_section();
4171 } else {
4172 r = 0;
4173 f->open_array_section("osd_metadata");
4174 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4175 if (osdmap.exists(i)) {
4176 f->open_object_section("osd");
4177 f->dump_unsigned("id", i);
4178 r = dump_osd_metadata(i, f.get(), NULL);
4179 if (r == -EINVAL || r == -ENOENT) {
4180 // Drop error, continue to get other daemons' metadata
4181 dout(4) << "No metadata for osd." << i << dendl;
4182 r = 0;
4183 } else if (r < 0) {
4184 // Unexpected error
4185 goto reply;
4186 }
4187 f->close_section();
4188 }
4189 }
4190 f->close_section();
4191 }
4192 f->flush(rdata);
4193 } else if (prefix == "osd versions") {
4194 if (!f)
4195 f.reset(Formatter::create("json-pretty"));
4196 count_metadata("ceph_version", f.get());
4197 f->flush(rdata);
4198 r = 0;
4199 } else if (prefix == "osd count-metadata") {
4200 if (!f)
4201 f.reset(Formatter::create("json-pretty"));
4202 string field;
4203 cmd_getval(g_ceph_context, cmdmap, "property", field);
4204 count_metadata(field, f.get());
4205 f->flush(rdata);
4206 r = 0;
4207 } else if (prefix == "osd map") {
4208 string poolstr, objstr, namespacestr;
4209 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4210 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4211 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4212
4213 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4214 if (pool < 0) {
4215 ss << "pool " << poolstr << " does not exist";
4216 r = -ENOENT;
4217 goto reply;
4218 }
4219 object_locator_t oloc(pool, namespacestr);
4220 object_t oid(objstr);
4221 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4222 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4223 vector<int> up, acting;
4224 int up_p, acting_p;
4225 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4226
4227 string fullobjname;
4228 if (!namespacestr.empty())
4229 fullobjname = namespacestr + string("/") + oid.name;
4230 else
4231 fullobjname = oid.name;
4232 if (f) {
4233 f->open_object_section("osd_map");
4234 f->dump_unsigned("epoch", osdmap.get_epoch());
4235 f->dump_string("pool", poolstr);
4236 f->dump_int("pool_id", pool);
4237 f->dump_stream("objname") << fullobjname;
4238 f->dump_stream("raw_pgid") << pgid;
4239 f->dump_stream("pgid") << mpgid;
4240 f->open_array_section("up");
4241 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4242 f->dump_int("osd", *p);
4243 f->close_section();
4244 f->dump_int("up_primary", up_p);
4245 f->open_array_section("acting");
4246 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4247 f->dump_int("osd", *p);
4248 f->close_section();
4249 f->dump_int("acting_primary", acting_p);
4250 f->close_section(); // osd_map
4251 f->flush(rdata);
4252 } else {
4253 ds << "osdmap e" << osdmap.get_epoch()
4254 << " pool '" << poolstr << "' (" << pool << ")"
4255 << " object '" << fullobjname << "' ->"
4256 << " pg " << pgid << " (" << mpgid << ")"
4257 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4258 << pg_vector_string(acting) << ", p" << acting_p << ")";
4259 rdata.append(ds);
4260 }
4261
4262 } else if (prefix == "pg map") {
4263 pg_t pgid;
4264 string pgidstr;
4265 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4266 if (!pgid.parse(pgidstr.c_str())) {
4267 ss << "invalid pgid '" << pgidstr << "'";
4268 r = -EINVAL;
4269 goto reply;
4270 }
4271 vector<int> up, acting;
4272 if (!osdmap.have_pg_pool(pgid.pool())) {
4273 ss << "pg '" << pgidstr << "' does not exist";
4274 r = -ENOENT;
4275 goto reply;
4276 }
4277 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4278 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4279 if (f) {
4280 f->open_object_section("pg_map");
4281 f->dump_unsigned("epoch", osdmap.get_epoch());
4282 f->dump_stream("raw_pgid") << pgid;
4283 f->dump_stream("pgid") << mpgid;
4284 f->open_array_section("up");
4285 for (auto osd : up) {
4286 f->dump_int("up_osd", osd);
4287 }
4288 f->close_section();
4289 f->open_array_section("acting");
4290 for (auto osd : acting) {
4291 f->dump_int("acting_osd", osd);
4292 }
4293 f->close_section();
4294 f->close_section();
4295 f->flush(rdata);
4296 } else {
4297 ds << "osdmap e" << osdmap.get_epoch()
4298 << " pg " << pgid << " (" << mpgid << ")"
4299 << " -> up " << up << " acting " << acting;
4300 rdata.append(ds);
4301 }
4302 goto reply;
4303
4304 } else if (prefix == "osd scrub" ||
4305 prefix == "osd deep-scrub" ||
4306 prefix == "osd repair") {
4307 string whostr;
4308 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4309 vector<string> pvec;
4310 get_str_vec(prefix, pvec);
4311
4312 if (whostr == "*" || whostr == "all" || whostr == "any") {
4313 ss << "osds ";
4314 int c = 0;
4315 for (int i = 0; i < osdmap.get_max_osd(); i++)
4316 if (osdmap.is_up(i)) {
4317 ss << (c++ ? "," : "") << i;
4318 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4319 pvec.back() == "repair",
4320 pvec.back() == "deep-scrub"),
4321 osdmap.get_inst(i));
4322 }
4323 r = 0;
4324 ss << " instructed to " << pvec.back();
4325 } else {
4326 long osd = parse_osd_id(whostr.c_str(), &ss);
4327 if (osd < 0) {
4328 r = -EINVAL;
4329 } else if (osdmap.is_up(osd)) {
4330 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4331 pvec.back() == "repair",
4332 pvec.back() == "deep-scrub"),
4333 osdmap.get_inst(osd));
4334 ss << "osd." << osd << " instructed to " << pvec.back();
4335 } else {
4336 ss << "osd." << osd << " is not up";
4337 r = -EAGAIN;
4338 }
4339 }
4340 } else if (prefix == "osd lspools") {
4341 int64_t auid;
4342 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4343 if (f)
4344 f->open_array_section("pools");
4345 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4346 p != osdmap.pools.end();
4347 ++p) {
4348 if (!auid || p->second.auid == (uint64_t)auid) {
4349 if (f) {
4350 f->open_object_section("pool");
4351 f->dump_int("poolnum", p->first);
4352 f->dump_string("poolname", osdmap.pool_name[p->first]);
4353 f->close_section();
4354 } else {
4355 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4356 }
4357 }
4358 }
4359 if (f) {
4360 f->close_section();
4361 f->flush(ds);
4362 }
4363 rdata.append(ds);
4364 } else if (prefix == "osd blacklist ls") {
4365 if (f)
4366 f->open_array_section("blacklist");
4367
4368 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4369 p != osdmap.blacklist.end();
4370 ++p) {
4371 if (f) {
4372 f->open_object_section("entry");
4373 f->dump_stream("addr") << p->first;
4374 f->dump_stream("until") << p->second;
4375 f->close_section();
4376 } else {
4377 stringstream ss;
4378 string s;
4379 ss << p->first << " " << p->second;
4380 getline(ss, s);
4381 s += "\n";
4382 rdata.append(s);
4383 }
4384 }
4385 if (f) {
4386 f->close_section();
4387 f->flush(rdata);
4388 }
4389 ss << "listed " << osdmap.blacklist.size() << " entries";
4390
4391 } else if (prefix == "osd pool ls") {
4392 string detail;
4393 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4394 if (!f && detail == "detail") {
4395 ostringstream ss;
4396 osdmap.print_pools(ss);
4397 rdata.append(ss.str());
4398 } else {
4399 if (f)
4400 f->open_array_section("pools");
4401 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4402 it != osdmap.get_pools().end();
4403 ++it) {
4404 if (f) {
4405 if (detail == "detail") {
4406 f->open_object_section("pool");
4407 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4408 it->second.dump(f.get());
4409 f->close_section();
4410 } else {
4411 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4412 }
4413 } else {
4414 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4415 }
4416 }
4417 if (f) {
4418 f->close_section();
4419 f->flush(rdata);
4420 }
4421 }
4422
4423 } else if (prefix == "osd crush get-tunable") {
4424 string tunable;
4425 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4426 ostringstream rss;
4427 if (f)
4428 f->open_object_section("tunable");
4429 if (tunable == "straw_calc_version") {
4430 if (f)
4431 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4432 else
4433 rss << osdmap.crush->get_straw_calc_version() << "\n";
4434 } else {
4435 r = -EINVAL;
4436 goto reply;
4437 }
4438 if (f) {
4439 f->close_section();
4440 f->flush(rdata);
4441 } else {
4442 rdata.append(rss.str());
4443 }
4444 r = 0;
4445
4446 } else if (prefix == "osd pool get") {
4447 string poolstr;
4448 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4449 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4450 if (pool < 0) {
4451 ss << "unrecognized pool '" << poolstr << "'";
4452 r = -ENOENT;
4453 goto reply;
4454 }
4455
4456 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4457 string var;
4458 cmd_getval(g_ceph_context, cmdmap, "var", var);
4459
4460 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4461 const choices_map_t ALL_CHOICES = {
4462 {"size", SIZE},
4463 {"min_size", MIN_SIZE},
4464 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4465 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
4466 {"crush_rule", CRUSH_RULE},
4467 {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
4468 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4469 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4470 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4471 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4472 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4473 {"use_gmt_hitset", USE_GMT_HITSET},
4474 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4475 {"target_max_bytes", TARGET_MAX_BYTES},
4476 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4477 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4478 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4479 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4480 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4481 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4482 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4483 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4484 {"fast_read", FAST_READ},
4485 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4486 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4487 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4488 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4489 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4490 {"recovery_priority", RECOVERY_PRIORITY},
4491 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4492 {"scrub_priority", SCRUB_PRIORITY},
4493 {"compression_mode", COMPRESSION_MODE},
4494 {"compression_algorithm", COMPRESSION_ALGORITHM},
4495 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4496 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4497 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4498 {"csum_type", CSUM_TYPE},
4499 {"csum_max_block", CSUM_MAX_BLOCK},
4500 {"csum_min_block", CSUM_MIN_BLOCK},
4501 };
4502
4503 typedef std::set<osd_pool_get_choices> choices_set_t;
4504
4505 const choices_set_t ONLY_TIER_CHOICES = {
4506 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4507 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4508 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4509 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4510 MIN_READ_RECENCY_FOR_PROMOTE,
4511 MIN_WRITE_RECENCY_FOR_PROMOTE,
4512 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4513 };
4514 const choices_set_t ONLY_ERASURE_CHOICES = {
4515 ERASURE_CODE_PROFILE
4516 };
4517
4518 choices_set_t selected_choices;
4519 if (var == "all") {
4520 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4521 it != ALL_CHOICES.end(); ++it) {
4522 selected_choices.insert(it->second);
4523 }
4524
4525 if(!p->is_tier()) {
4526 selected_choices = subtract_second_from_first(selected_choices,
4527 ONLY_TIER_CHOICES);
4528 }
4529
4530 if(!p->is_erasure()) {
4531 selected_choices = subtract_second_from_first(selected_choices,
4532 ONLY_ERASURE_CHOICES);
4533 }
4534 } else /* var != "all" */ {
4535 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4536 osd_pool_get_choices selected = found->second;
4537
4538 if (!p->is_tier() &&
4539 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4540 ss << "pool '" << poolstr
4541 << "' is not a tier pool: variable not applicable";
4542 r = -EACCES;
4543 goto reply;
4544 }
4545
4546 if (!p->is_erasure() &&
4547 ONLY_ERASURE_CHOICES.find(selected)
4548 != ONLY_ERASURE_CHOICES.end()) {
4549 ss << "pool '" << poolstr
4550 << "' is not a erasure pool: variable not applicable";
4551 r = -EACCES;
4552 goto reply;
4553 }
4554
4555 selected_choices.insert(selected);
4556 }
4557
4558 if (f) {
4559 for(choices_set_t::const_iterator it = selected_choices.begin();
4560 it != selected_choices.end(); ++it) {
4561 choices_map_t::const_iterator i;
4562 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4563 if (i->second == *it) {
4564 break;
4565 }
4566 }
4567 assert(i != ALL_CHOICES.end());
4568 bool pool_opt = pool_opts_t::is_opt_name(i->first);
4569 if (!pool_opt) {
4570 f->open_object_section("pool");
4571 f->dump_string("pool", poolstr);
4572 f->dump_int("pool_id", pool);
4573 }
4574 switch(*it) {
4575 case PG_NUM:
4576 f->dump_int("pg_num", p->get_pg_num());
4577 break;
4578 case PGP_NUM:
4579 f->dump_int("pgp_num", p->get_pgp_num());
4580 break;
4581 case AUID:
4582 f->dump_int("auid", p->get_auid());
4583 break;
4584 case SIZE:
4585 f->dump_int("size", p->get_size());
4586 break;
4587 case MIN_SIZE:
4588 f->dump_int("min_size", p->get_min_size());
4589 break;
4590 case CRASH_REPLAY_INTERVAL:
4591 f->dump_int("crash_replay_interval",
4592 p->get_crash_replay_interval());
4593 break;
4594 case CRUSH_RULE:
4595 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
4596 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
4597 p->get_crush_rule()));
4598 } else {
4599 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
4600 }
4601 break;
4602 case HASHPSPOOL:
4603 case NODELETE:
4604 case NOPGCHANGE:
4605 case NOSIZECHANGE:
4606 case WRITE_FADVISE_DONTNEED:
4607 case NOSCRUB:
4608 case NODEEP_SCRUB:
4609 f->dump_string(i->first.c_str(),
4610 p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4611 "true" : "false");
4612 break;
4613 case HIT_SET_PERIOD:
4614 f->dump_int("hit_set_period", p->hit_set_period);
4615 break;
4616 case HIT_SET_COUNT:
4617 f->dump_int("hit_set_count", p->hit_set_count);
4618 break;
4619 case HIT_SET_TYPE:
4620 f->dump_string("hit_set_type",
4621 HitSet::get_type_name(p->hit_set_params.get_type()));
4622 break;
4623 case HIT_SET_FPP:
4624 {
4625 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4626 BloomHitSet::Params *bloomp =
4627 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4628 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4629 } else if(var != "all") {
4630 f->close_section();
4631 ss << "hit set is not of type Bloom; " <<
4632 "invalid to get a false positive rate!";
4633 r = -EINVAL;
4634 goto reply;
4635 }
4636 }
4637 break;
4638 case USE_GMT_HITSET:
4639 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4640 break;
4641 case TARGET_MAX_OBJECTS:
4642 f->dump_unsigned("target_max_objects", p->target_max_objects);
4643 break;
4644 case TARGET_MAX_BYTES:
4645 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4646 break;
4647 case CACHE_TARGET_DIRTY_RATIO:
4648 f->dump_unsigned("cache_target_dirty_ratio_micro",
4649 p->cache_target_dirty_ratio_micro);
4650 f->dump_float("cache_target_dirty_ratio",
4651 ((float)p->cache_target_dirty_ratio_micro/1000000));
4652 break;
4653 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4654 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
4655 p->cache_target_dirty_high_ratio_micro);
4656 f->dump_float("cache_target_dirty_high_ratio",
4657 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
4658 break;
4659 case CACHE_TARGET_FULL_RATIO:
4660 f->dump_unsigned("cache_target_full_ratio_micro",
4661 p->cache_target_full_ratio_micro);
4662 f->dump_float("cache_target_full_ratio",
4663 ((float)p->cache_target_full_ratio_micro/1000000));
4664 break;
4665 case CACHE_MIN_FLUSH_AGE:
4666 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
4667 break;
4668 case CACHE_MIN_EVICT_AGE:
4669 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
4670 break;
4671 case ERASURE_CODE_PROFILE:
4672 f->dump_string("erasure_code_profile", p->erasure_code_profile);
4673 break;
4674 case MIN_READ_RECENCY_FOR_PROMOTE:
4675 f->dump_int("min_read_recency_for_promote",
4676 p->min_read_recency_for_promote);
4677 break;
4678 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4679 f->dump_int("min_write_recency_for_promote",
4680 p->min_write_recency_for_promote);
4681 break;
4682 case FAST_READ:
4683 f->dump_int("fast_read", p->fast_read);
4684 break;
4685 case HIT_SET_GRADE_DECAY_RATE:
4686 f->dump_int("hit_set_grade_decay_rate",
4687 p->hit_set_grade_decay_rate);
4688 break;
4689 case HIT_SET_SEARCH_LAST_N:
4690 f->dump_int("hit_set_search_last_n",
4691 p->hit_set_search_last_n);
4692 break;
4693 case SCRUB_MIN_INTERVAL:
4694 case SCRUB_MAX_INTERVAL:
4695 case DEEP_SCRUB_INTERVAL:
4696 case RECOVERY_PRIORITY:
4697 case RECOVERY_OP_PRIORITY:
4698 case SCRUB_PRIORITY:
4699 case COMPRESSION_MODE:
4700 case COMPRESSION_ALGORITHM:
4701 case COMPRESSION_REQUIRED_RATIO:
4702 case COMPRESSION_MAX_BLOB_SIZE:
4703 case COMPRESSION_MIN_BLOB_SIZE:
4704 case CSUM_TYPE:
4705 case CSUM_MAX_BLOCK:
4706 case CSUM_MIN_BLOCK:
4707 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4708 if (p->opts.is_set(key)) {
4709 f->open_object_section("pool");
4710 f->dump_string("pool", poolstr);
4711 f->dump_int("pool_id", pool);
4712 if(*it == CSUM_TYPE) {
4713 int val;
4714 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
4715 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
4716 } else {
4717 p->opts.dump(i->first, f.get());
4718 }
4719 f->close_section();
4720 f->flush(rdata);
4721 }
4722 break;
4723 }
4724 if (!pool_opt) {
4725 f->close_section();
4726 f->flush(rdata);
4727 }
4728 }
4729
4730 } else /* !f */ {
4731 for(choices_set_t::const_iterator it = selected_choices.begin();
4732 it != selected_choices.end(); ++it) {
4733 choices_map_t::const_iterator i;
4734 switch(*it) {
4735 case PG_NUM:
4736 ss << "pg_num: " << p->get_pg_num() << "\n";
4737 break;
4738 case PGP_NUM:
4739 ss << "pgp_num: " << p->get_pgp_num() << "\n";
4740 break;
4741 case AUID:
4742 ss << "auid: " << p->get_auid() << "\n";
4743 break;
4744 case SIZE:
4745 ss << "size: " << p->get_size() << "\n";
4746 break;
4747 case MIN_SIZE:
4748 ss << "min_size: " << p->get_min_size() << "\n";
4749 break;
4750 case CRASH_REPLAY_INTERVAL:
4751 ss << "crash_replay_interval: " <<
4752 p->get_crash_replay_interval() << "\n";
4753 break;
4754 case CRUSH_RULE:
4755 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
4756 ss << "crush_rule: " << osdmap.crush->get_rule_name(
4757 p->get_crush_rule()) << "\n";
4758 } else {
4759 ss << "crush_rule: " << p->get_crush_rule() << "\n";
4760 }
4761 break;
4762 case HIT_SET_PERIOD:
4763 ss << "hit_set_period: " << p->hit_set_period << "\n";
4764 break;
4765 case HIT_SET_COUNT:
4766 ss << "hit_set_count: " << p->hit_set_count << "\n";
4767 break;
4768 case HIT_SET_TYPE:
4769 ss << "hit_set_type: " <<
4770 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
4771 break;
4772 case HIT_SET_FPP:
4773 {
4774 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4775 BloomHitSet::Params *bloomp =
4776 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4777 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
4778 } else if(var != "all") {
4779 ss << "hit set is not of type Bloom; " <<
4780 "invalid to get a false positive rate!";
4781 r = -EINVAL;
4782 goto reply;
4783 }
4784 }
4785 break;
4786 case USE_GMT_HITSET:
4787 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
4788 break;
4789 case TARGET_MAX_OBJECTS:
4790 ss << "target_max_objects: " << p->target_max_objects << "\n";
4791 break;
4792 case TARGET_MAX_BYTES:
4793 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
4794 break;
4795 case CACHE_TARGET_DIRTY_RATIO:
4796 ss << "cache_target_dirty_ratio: "
4797 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
4798 break;
4799 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4800 ss << "cache_target_dirty_high_ratio: "
4801 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
4802 break;
4803 case CACHE_TARGET_FULL_RATIO:
4804 ss << "cache_target_full_ratio: "
4805 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
4806 break;
4807 case CACHE_MIN_FLUSH_AGE:
4808 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
4809 break;
4810 case CACHE_MIN_EVICT_AGE:
4811 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
4812 break;
4813 case ERASURE_CODE_PROFILE:
4814 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
4815 break;
4816 case MIN_READ_RECENCY_FOR_PROMOTE:
4817 ss << "min_read_recency_for_promote: " <<
4818 p->min_read_recency_for_promote << "\n";
4819 break;
4820 case HIT_SET_GRADE_DECAY_RATE:
4821 ss << "hit_set_grade_decay_rate: " <<
4822 p->hit_set_grade_decay_rate << "\n";
4823 break;
4824 case HIT_SET_SEARCH_LAST_N:
4825 ss << "hit_set_search_last_n: " <<
4826 p->hit_set_search_last_n << "\n";
4827 break;
4828 case HASHPSPOOL:
4829 case NODELETE:
4830 case NOPGCHANGE:
4831 case NOSIZECHANGE:
4832 case WRITE_FADVISE_DONTNEED:
4833 case NOSCRUB:
4834 case NODEEP_SCRUB:
4835 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4836 if (i->second == *it)
4837 break;
4838 }
4839 assert(i != ALL_CHOICES.end());
4840 ss << i->first << ": " <<
4841 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4842 "true" : "false") << "\n";
4843 break;
4844 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4845 ss << "min_write_recency_for_promote: " <<
4846 p->min_write_recency_for_promote << "\n";
4847 break;
4848 case FAST_READ:
4849 ss << "fast_read: " << p->fast_read << "\n";
4850 break;
4851 case SCRUB_MIN_INTERVAL:
4852 case SCRUB_MAX_INTERVAL:
4853 case DEEP_SCRUB_INTERVAL:
4854 case RECOVERY_PRIORITY:
4855 case RECOVERY_OP_PRIORITY:
4856 case SCRUB_PRIORITY:
4857 case COMPRESSION_MODE:
4858 case COMPRESSION_ALGORITHM:
4859 case COMPRESSION_REQUIRED_RATIO:
4860 case COMPRESSION_MAX_BLOB_SIZE:
4861 case COMPRESSION_MIN_BLOB_SIZE:
4862 case CSUM_TYPE:
4863 case CSUM_MAX_BLOCK:
4864 case CSUM_MIN_BLOCK:
4865 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4866 if (i->second == *it)
4867 break;
4868 }
4869 assert(i != ALL_CHOICES.end());
4870 {
4871 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4872 if (p->opts.is_set(key)) {
4873 if(key == pool_opts_t::CSUM_TYPE) {
4874 int val;
4875 p->opts.get(key, &val);
4876 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
4877 } else {
4878 ss << i->first << ": " << p->opts.get(key) << "\n";
4879 }
4880 }
4881 }
4882 break;
4883 }
4884 rdata.append(ss.str());
4885 ss.str("");
4886 }
4887 }
4888 r = 0;
4889 } else if (prefix == "osd pool stats") {
4890 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4891 osdmap, f.get(), &ss, &rdata);
4892 } else if (prefix == "osd pool get-quota") {
4893 string pool_name;
4894 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
4895
4896 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
4897 if (poolid < 0) {
4898 assert(poolid == -ENOENT);
4899 ss << "unrecognized pool '" << pool_name << "'";
4900 r = -ENOENT;
4901 goto reply;
4902 }
4903 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
4904
4905 if (f) {
4906 f->open_object_section("pool_quotas");
4907 f->dump_string("pool_name", pool_name);
4908 f->dump_unsigned("pool_id", poolid);
4909 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
4910 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
4911 f->close_section();
4912 f->flush(rdata);
4913 } else {
4914 stringstream rs;
4915 rs << "quotas for pool '" << pool_name << "':\n"
4916 << " max objects: ";
4917 if (p->quota_max_objects == 0)
4918 rs << "N/A";
4919 else
4920 rs << si_t(p->quota_max_objects) << " objects";
4921 rs << "\n"
4922 << " max bytes : ";
4923 if (p->quota_max_bytes == 0)
4924 rs << "N/A";
4925 else
4926 rs << si_t(p->quota_max_bytes) << "B";
4927 rdata.append(rs.str());
4928 }
4929 rdata.append("\n");
4930 r = 0;
4931 } else if (prefix == "osd crush rule list" ||
4932 prefix == "osd crush rule ls") {
4933 if (f) {
4934 f->open_array_section("rules");
4935 osdmap.crush->list_rules(f.get());
4936 f->close_section();
4937 f->flush(rdata);
4938 } else {
4939 ostringstream ss;
4940 osdmap.crush->list_rules(&ss);
4941 rdata.append(ss.str());
4942 }
4943 } else if (prefix == "osd crush rule ls-by-class") {
4944 string class_name;
4945 cmd_getval(g_ceph_context, cmdmap, "class", class_name);
4946 if (class_name.empty()) {
4947 ss << "no class specified";
4948 r = -EINVAL;
4949 goto reply;
4950 }
4951 set<int> rules;
4952 r = osdmap.crush->get_rules_by_class(class_name, &rules);
4953 if (r < 0) {
4954 ss << "failed to get rules by class '" << class_name << "'";
4955 goto reply;
4956 }
4957 if (f) {
4958 f->open_array_section("rules");
4959 for (auto &rule: rules) {
4960 f->dump_string("name", osdmap.crush->get_rule_name(rule));
4961 }
4962 f->close_section();
4963 f->flush(rdata);
4964 } else {
4965 ostringstream rs;
4966 for (auto &rule: rules) {
4967 rs << osdmap.crush->get_rule_name(rule) << "\n";
4968 }
4969 rdata.append(rs.str());
4970 }
4971 } else if (prefix == "osd crush rule dump") {
4972 string name;
4973 cmd_getval(g_ceph_context, cmdmap, "name", name);
4974 string format;
4975 cmd_getval(g_ceph_context, cmdmap, "format", format);
4976 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4977 if (name == "") {
4978 f->open_array_section("rules");
4979 osdmap.crush->dump_rules(f.get());
4980 f->close_section();
4981 } else {
4982 int ruleno = osdmap.crush->get_rule_id(name);
4983 if (ruleno < 0) {
4984 ss << "unknown crush rule '" << name << "'";
4985 r = ruleno;
4986 goto reply;
4987 }
4988 osdmap.crush->dump_rule(ruleno, f.get());
4989 }
4990 ostringstream rs;
4991 f->flush(rs);
4992 rs << "\n";
4993 rdata.append(rs.str());
4994 } else if (prefix == "osd crush dump") {
4995 string format;
4996 cmd_getval(g_ceph_context, cmdmap, "format", format);
4997 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4998 f->open_object_section("crush_map");
4999 osdmap.crush->dump(f.get());
5000 f->close_section();
5001 ostringstream rs;
5002 f->flush(rs);
5003 rs << "\n";
5004 rdata.append(rs.str());
5005 } else if (prefix == "osd crush show-tunables") {
5006 string format;
5007 cmd_getval(g_ceph_context, cmdmap, "format", format);
5008 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5009 f->open_object_section("crush_map_tunables");
5010 osdmap.crush->dump_tunables(f.get());
5011 f->close_section();
5012 ostringstream rs;
5013 f->flush(rs);
5014 rs << "\n";
5015 rdata.append(rs.str());
5016 } else if (prefix == "osd crush tree") {
5017 string shadow;
5018 cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
5019 bool show_shadow = shadow == "--show-shadow";
5020 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5021 if (f) {
5022 osdmap.crush->dump_tree(nullptr,
5023 f.get(),
5024 osdmap.get_pool_names(),
5025 show_shadow);
5026 f->flush(rdata);
5027 } else {
5028 ostringstream ss;
5029 osdmap.crush->dump_tree(&ss,
5030 nullptr,
5031 osdmap.get_pool_names(),
5032 show_shadow);
5033 rdata.append(ss.str());
5034 }
5035 } else if (prefix == "osd crush ls") {
5036 string name;
5037 if (!cmd_getval(g_ceph_context, cmdmap, "node", name)) {
5038 ss << "no node specified";
5039 r = -EINVAL;
5040 goto reply;
5041 }
5042 if (!osdmap.crush->name_exists(name)) {
5043 ss << "node '" << name << "' does not exist";
5044 r = -ENOENT;
5045 goto reply;
5046 }
5047 int id = osdmap.crush->get_item_id(name);
5048 list<int> result;
5049 if (id >= 0) {
5050 result.push_back(id);
5051 } else {
5052 int num = osdmap.crush->get_bucket_size(id);
5053 for (int i = 0; i < num; ++i) {
5054 result.push_back(osdmap.crush->get_bucket_item(id, i));
5055 }
5056 }
5057 if (f) {
5058 f->open_array_section("items");
5059 for (auto i : result) {
5060 f->dump_string("item", osdmap.crush->get_item_name(i));
5061 }
5062 f->close_section();
5063 f->flush(rdata);
5064 } else {
5065 ostringstream ss;
5066 for (auto i : result) {
5067 ss << osdmap.crush->get_item_name(i) << "\n";
5068 }
5069 rdata.append(ss.str());
5070 }
5071 r = 0;
5072 } else if (prefix == "osd crush class ls") {
5073 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5074 f->open_array_section("crush_classes");
5075 for (auto i : osdmap.crush->class_name)
5076 f->dump_string("class", i.second);
5077 f->close_section();
5078 f->flush(rdata);
5079 } else if (prefix == "osd crush class ls-osd") {
5080 string name;
5081 cmd_getval(g_ceph_context, cmdmap, "class", name);
5082 set<int> osds;
5083 osdmap.crush->get_devices_by_class(name, &osds);
5084 if (f) {
5085 f->open_array_section("osds");
5086 for (auto &osd: osds)
5087 f->dump_int("osd", osd);
5088 f->close_section();
5089 f->flush(rdata);
5090 } else {
5091 bool first = true;
5092 for (auto &osd : osds) {
5093 if (!first)
5094 ds << "\n";
5095 first = false;
5096 ds << osd;
5097 }
5098 rdata.append(ds);
5099 }
5100 } else if (prefix == "osd erasure-code-profile ls") {
5101 const auto &profiles = osdmap.get_erasure_code_profiles();
5102 if (f)
5103 f->open_array_section("erasure-code-profiles");
5104 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5105 if (f)
5106 f->dump_string("profile", i->first.c_str());
5107 else
5108 rdata.append(i->first + "\n");
5109 }
5110 if (f) {
5111 f->close_section();
5112 ostringstream rs;
5113 f->flush(rs);
5114 rs << "\n";
5115 rdata.append(rs.str());
5116 }
5117 } else if (prefix == "osd crush weight-set ls") {
5118 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5119 if (f) {
5120 f->open_array_section("weight_sets");
5121 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5122 f->dump_string("pool", "(compat)");
5123 }
5124 for (auto& i : osdmap.crush->choose_args) {
5125 if (i.first >= 0) {
5126 f->dump_string("pool", osdmap.get_pool_name(i.first));
5127 }
5128 }
5129 f->close_section();
5130 f->flush(rdata);
5131 } else {
5132 ostringstream rs;
5133 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5134 rs << "(compat)\n";
5135 }
5136 for (auto& i : osdmap.crush->choose_args) {
5137 if (i.first >= 0) {
5138 rs << osdmap.get_pool_name(i.first) << "\n";
5139 }
5140 }
5141 rdata.append(rs.str());
5142 }
5143 } else if (prefix == "osd crush weight-set dump") {
5144 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5145 "json-pretty"));
5146 osdmap.crush->dump_choose_args(f.get());
5147 f->flush(rdata);
5148 } else if (prefix == "osd erasure-code-profile get") {
5149 string name;
5150 cmd_getval(g_ceph_context, cmdmap, "name", name);
5151 if (!osdmap.has_erasure_code_profile(name)) {
5152 ss << "unknown erasure code profile '" << name << "'";
5153 r = -ENOENT;
5154 goto reply;
5155 }
5156 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5157 if (f)
5158 f->open_object_section("profile");
5159 for (map<string,string>::const_iterator i = profile.begin();
5160 i != profile.end();
5161 ++i) {
5162 if (f)
5163 f->dump_string(i->first.c_str(), i->second.c_str());
5164 else
5165 rdata.append(i->first + "=" + i->second + "\n");
5166 }
5167 if (f) {
5168 f->close_section();
5169 ostringstream rs;
5170 f->flush(rs);
5171 rs << "\n";
5172 rdata.append(rs.str());
5173 }
5174 } else if (prefix == "osd pool application get") {
5175 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5176 "json-pretty"));
5177 string pool_name;
5178 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
5179 string app;
5180 cmd_getval(g_ceph_context, cmdmap, "app", app);
5181 string key;
5182 cmd_getval(g_ceph_context, cmdmap, "key", key);
5183
5184 if (pool_name.empty()) {
5185 // all
5186 f->open_object_section("pools");
5187 for (const auto &pool : osdmap.pools) {
5188 std::string name("<unknown>");
5189 const auto &pni = osdmap.pool_name.find(pool.first);
5190 if (pni != osdmap.pool_name.end())
5191 name = pni->second;
5192 f->open_object_section(name.c_str());
5193 for (auto &app_pair : pool.second.application_metadata) {
5194 f->open_object_section(app_pair.first.c_str());
5195 for (auto &kv_pair : app_pair.second) {
5196 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5197 }
5198 f->close_section();
5199 }
5200 f->close_section(); // name
5201 }
5202 f->close_section(); // pools
5203 f->flush(rdata);
5204 } else {
5205 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
5206 if (pool < 0) {
5207 ss << "unrecognized pool '" << pool_name << "'";
5208 r = -ENOENT;
5209 goto reply;
5210 }
5211 auto p = osdmap.get_pg_pool(pool);
5212 // filter by pool
5213 if (app.empty()) {
5214 f->open_object_section(pool_name.c_str());
5215 for (auto &app_pair : p->application_metadata) {
5216 f->open_object_section(app_pair.first.c_str());
5217 for (auto &kv_pair : app_pair.second) {
5218 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5219 }
5220 f->close_section(); // application
5221 }
5222 f->close_section(); // pool_name
5223 f->flush(rdata);
5224 goto reply;
5225 }
5226
5227 auto app_it = p->application_metadata.find(app);
5228 if (app_it == p->application_metadata.end()) {
5229 ss << "pool '" << pool_name << "' has no application '" << app << "'";
5230 r = -ENOENT;
5231 goto reply;
5232 }
5233 // filter by pool + app
5234 if (key.empty()) {
5235 f->open_object_section(app_it->first.c_str());
5236 for (auto &kv_pair : app_it->second) {
5237 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5238 }
5239 f->close_section(); // application
5240 f->flush(rdata);
5241 goto reply;
5242 }
5243 // filter by pool + app + key
5244 auto key_it = app_it->second.find(key);
5245 if (key_it == app_it->second.end()) {
5246 ss << "application '" << app << "' on pool '" << pool_name
5247 << "' does not have key '" << key << "'";
5248 r = -ENOENT;
5249 goto reply;
5250 }
5251 ss << key_it->second << "\n";
5252 rdata.append(ss.str());
5253 ss.str("");
5254 }
5255 } else {
5256 // try prepare update
5257 return false;
5258 }
5259
5260 reply:
5261 string rs;
5262 getline(ss, rs);
5263 mon->reply_command(op, r, rs, rdata, get_last_committed());
5264 return true;
5265 }
5266
5267 void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
5268 {
5269 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
5270 pending_inc.get_new_pool(pool_id, pool)->flags = flags;
5271 }
5272
5273 bool OSDMonitor::update_pools_status()
5274 {
5275 if (!mon->pgservice->is_readable())
5276 return false;
5277
5278 bool ret = false;
5279
5280 auto& pools = osdmap.get_pools();
5281 for (auto it = pools.begin(); it != pools.end(); ++it) {
5282 const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
5283 if (!pstat)
5284 continue;
5285 const object_stat_sum_t& sum = pstat->stats.sum;
5286 const pg_pool_t &pool = it->second;
5287 const string& pool_name = osdmap.get_pool_name(it->first);
5288
5289 bool pool_is_full =
5290 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
5291 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
5292
5293 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
5294 if (pool_is_full)
5295 continue;
5296
5297 mon->clog->info() << "pool '" << pool_name
5298 << "' no longer full; removing FULL flag";
5299
5300 update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
5301 ret = true;
5302 } else {
5303 if (!pool_is_full)
5304 continue;
5305
5306 if (pool.quota_max_bytes > 0 &&
5307 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
5308 mon->clog->warn() << "pool '" << pool_name << "' is full"
5309 << " (reached quota's max_bytes: "
5310 << si_t(pool.quota_max_bytes) << ")";
5311 }
5312 if (pool.quota_max_objects > 0 &&
5313 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
5314 mon->clog->warn() << "pool '" << pool_name << "' is full"
5315 << " (reached quota's max_objects: "
5316 << pool.quota_max_objects << ")";
5317 }
5318 update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
5319 ret = true;
5320 }
5321 }
5322 return ret;
5323 }
5324
5325 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5326 {
5327 op->mark_osdmon_event(__func__);
5328 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5329 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5330 MonSession *session = m->get_session();
5331 if (!session)
5332 return -EPERM;
5333 string erasure_code_profile;
5334 stringstream ss;
5335 string rule_name;
5336 if (m->auid)
5337 return prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
5338 0, 0,
5339 erasure_code_profile,
5340 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5341 else
5342 return prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
5343 0, 0,
5344 erasure_code_profile,
5345 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5346 }
5347
5348 int OSDMonitor::crush_rename_bucket(const string& srcname,
5349 const string& dstname,
5350 ostream *ss)
5351 {
5352 int ret;
5353 //
5354 // Avoid creating a pending crush if it does not already exists and
5355 // the rename would fail.
5356 //
5357 if (!_have_pending_crush()) {
5358 ret = _get_stable_crush().can_rename_bucket(srcname,
5359 dstname,
5360 ss);
5361 if (ret)
5362 return ret;
5363 }
5364
5365 CrushWrapper newcrush;
5366 _get_pending_crush(newcrush);
5367
5368 ret = newcrush.rename_bucket(srcname,
5369 dstname,
5370 ss);
5371 if (ret)
5372 return ret;
5373
5374 pending_inc.crush.clear();
5375 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5376 *ss << "renamed bucket " << srcname << " into " << dstname;
5377 return 0;
5378 }
5379
5380 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5381 {
5382 string replacement = "";
5383
5384 if (plugin == "jerasure_generic" ||
5385 plugin == "jerasure_sse3" ||
5386 plugin == "jerasure_sse4" ||
5387 plugin == "jerasure_neon") {
5388 replacement = "jerasure";
5389 } else if (plugin == "shec_generic" ||
5390 plugin == "shec_sse3" ||
5391 plugin == "shec_sse4" ||
5392 plugin == "shec_neon") {
5393 replacement = "shec";
5394 }
5395
5396 if (replacement != "") {
5397 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5398 << plugin << " that has been deprecated. Please use "
5399 << replacement << " instead." << dendl;
5400 }
5401 }
5402
5403 int OSDMonitor::normalize_profile(const string& profilename,
5404 ErasureCodeProfile &profile,
5405 bool force,
5406 ostream *ss)
5407 {
5408 ErasureCodeInterfaceRef erasure_code;
5409 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5410 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5411 check_legacy_ec_plugin(plugin->second, profilename);
5412 int err = instance.factory(plugin->second,
5413 g_conf->get_val<std::string>("erasure_code_dir"),
5414 profile, &erasure_code, ss);
5415 if (err) {
5416 return err;
5417 }
5418
5419 err = erasure_code->init(profile, ss);
5420 if (err) {
5421 return err;
5422 }
5423
5424 auto it = profile.find("stripe_unit");
5425 if (it != profile.end()) {
5426 string err_str;
5427 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5428 if (!err_str.empty()) {
5429 *ss << "could not parse stripe_unit '" << it->second
5430 << "': " << err_str << std::endl;
5431 return -EINVAL;
5432 }
5433 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5434 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5435 if (chunk_size != stripe_unit) {
5436 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5437 << "alignment. Would be padded to " << chunk_size
5438 << std::endl;
5439 return -EINVAL;
5440 }
5441 if ((stripe_unit % 4096) != 0 && !force) {
5442 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5443 << "use --force to override this check" << std::endl;
5444 return -EINVAL;
5445 }
5446 }
5447 return 0;
5448 }
5449
5450 int OSDMonitor::crush_rule_create_erasure(const string &name,
5451 const string &profile,
5452 int *rule,
5453 ostream *ss)
5454 {
5455 int ruleid = osdmap.crush->get_rule_id(name);
5456 if (ruleid != -ENOENT) {
5457 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
5458 return -EEXIST;
5459 }
5460
5461 CrushWrapper newcrush;
5462 _get_pending_crush(newcrush);
5463
5464 ruleid = newcrush.get_rule_id(name);
5465 if (ruleid != -ENOENT) {
5466 *rule = newcrush.get_rule_mask_ruleset(ruleid);
5467 return -EALREADY;
5468 } else {
5469 ErasureCodeInterfaceRef erasure_code;
5470 int err = get_erasure_code(profile, &erasure_code, ss);
5471 if (err) {
5472 *ss << "failed to load plugin using profile " << profile << std::endl;
5473 return err;
5474 }
5475
5476 err = erasure_code->create_rule(name, newcrush, ss);
5477 erasure_code.reset();
5478 if (err < 0)
5479 return err;
5480 *rule = err;
5481 pending_inc.crush.clear();
5482 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5483 return 0;
5484 }
5485 }
5486
5487 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5488 ErasureCodeInterfaceRef *erasure_code,
5489 ostream *ss) const
5490 {
5491 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5492 return -EAGAIN;
5493 ErasureCodeProfile profile =
5494 osdmap.get_erasure_code_profile(erasure_code_profile);
5495 ErasureCodeProfile::const_iterator plugin =
5496 profile.find("plugin");
5497 if (plugin == profile.end()) {
5498 *ss << "cannot determine the erasure code plugin"
5499 << " because there is no 'plugin' entry in the erasure_code_profile "
5500 << profile << std::endl;
5501 return -EINVAL;
5502 }
5503 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5504 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5505 return instance.factory(plugin->second,
5506 g_conf->get_val<std::string>("erasure_code_dir"),
5507 profile, erasure_code, ss);
5508 }
5509
5510 int OSDMonitor::check_cluster_features(uint64_t features,
5511 stringstream &ss)
5512 {
5513 stringstream unsupported_ss;
5514 int unsupported_count = 0;
5515 if ((mon->get_quorum_con_features() & features) != features) {
5516 unsupported_ss << "the monitor cluster";
5517 ++unsupported_count;
5518 }
5519
5520 set<int32_t> up_osds;
5521 osdmap.get_up_osds(up_osds);
5522 for (set<int32_t>::iterator it = up_osds.begin();
5523 it != up_osds.end(); ++it) {
5524 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5525 if ((xi.features & features) != features) {
5526 if (unsupported_count > 0)
5527 unsupported_ss << ", ";
5528 unsupported_ss << "osd." << *it;
5529 unsupported_count ++;
5530 }
5531 }
5532
5533 if (unsupported_count > 0) {
5534 ss << "features " << features << " unsupported by: "
5535 << unsupported_ss.str();
5536 return -ENOTSUP;
5537 }
5538
5539 // check pending osd state, too!
5540 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5541 pending_inc.new_xinfo.begin();
5542 p != pending_inc.new_xinfo.end(); ++p) {
5543 const osd_xinfo_t &xi = p->second;
5544 if ((xi.features & features) != features) {
5545 dout(10) << __func__ << " pending osd." << p->first
5546 << " features are insufficient; retry" << dendl;
5547 return -EAGAIN;
5548 }
5549 }
5550
5551 return 0;
5552 }
5553
5554 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
5555 stringstream& ss)
5556 {
5557 OSDMap::Incremental new_pending = pending_inc;
5558 ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
5559 OSDMap newmap;
5560 newmap.deepish_copy_from(osdmap);
5561 newmap.apply_incremental(new_pending);
5562
5563 // client compat
5564 if (newmap.require_min_compat_client > 0) {
5565 auto mv = newmap.get_min_compat_client();
5566 if (mv > newmap.require_min_compat_client) {
5567 ss << "new crush map requires client version " << ceph_release_name(mv)
5568 << " but require_min_compat_client is "
5569 << ceph_release_name(newmap.require_min_compat_client);
5570 return false;
5571 }
5572 }
5573
5574 // osd compat
5575 uint64_t features =
5576 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
5577 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
5578 stringstream features_ss;
5579 int r = check_cluster_features(features, features_ss);
5580 if (r) {
5581 ss << "Could not change CRUSH: " << features_ss.str();
5582 return false;
5583 }
5584
5585 return true;
5586 }
5587
5588 bool OSDMonitor::erasure_code_profile_in_use(
5589 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5590 const string &profile,
5591 ostream *ss)
5592 {
5593 bool found = false;
5594 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5595 p != pools.end();
5596 ++p) {
5597 if (p->second.erasure_code_profile == profile) {
5598 *ss << osdmap.pool_name[p->first] << " ";
5599 found = true;
5600 }
5601 }
5602 if (found) {
5603 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5604 }
5605 return found;
5606 }
5607
5608 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
5609 map<string,string> *erasure_code_profile_map,
5610 ostream *ss)
5611 {
5612 int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
5613 *ss,
5614 erasure_code_profile_map);
5615 if (r)
5616 return r;
5617 assert((*erasure_code_profile_map).count("plugin"));
5618 string default_plugin = (*erasure_code_profile_map)["plugin"];
5619 map<string,string> user_map;
5620 for (vector<string>::const_iterator i = erasure_code_profile.begin();
5621 i != erasure_code_profile.end();
5622 ++i) {
5623 size_t equal = i->find('=');
5624 if (equal == string::npos) {
5625 user_map[*i] = string();
5626 (*erasure_code_profile_map)[*i] = string();
5627 } else {
5628 const string key = i->substr(0, equal);
5629 equal++;
5630 const string value = i->substr(equal);
5631 user_map[key] = value;
5632 (*erasure_code_profile_map)[key] = value;
5633 }
5634 }
5635
5636 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
5637 (*erasure_code_profile_map) = user_map;
5638
5639 return 0;
5640 }
5641
5642 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
5643 const string &erasure_code_profile,
5644 unsigned *size, unsigned *min_size,
5645 ostream *ss)
5646 {
5647 int err = 0;
5648 switch (pool_type) {
5649 case pg_pool_t::TYPE_REPLICATED:
5650 *size = g_conf->osd_pool_default_size;
5651 *min_size = g_conf->get_osd_pool_default_min_size();
5652 break;
5653 case pg_pool_t::TYPE_ERASURE:
5654 {
5655 ErasureCodeInterfaceRef erasure_code;
5656 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5657 if (err == 0) {
5658 *size = erasure_code->get_chunk_count();
5659 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
5660 }
5661 }
5662 break;
5663 default:
5664 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
5665 err = -EINVAL;
5666 break;
5667 }
5668 return err;
5669 }
5670
5671 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
5672 const string &erasure_code_profile,
5673 uint32_t *stripe_width,
5674 ostream *ss)
5675 {
5676 int err = 0;
5677 switch (pool_type) {
5678 case pg_pool_t::TYPE_REPLICATED:
5679 // ignored
5680 break;
5681 case pg_pool_t::TYPE_ERASURE:
5682 {
5683 ErasureCodeProfile profile =
5684 osdmap.get_erasure_code_profile(erasure_code_profile);
5685 ErasureCodeInterfaceRef erasure_code;
5686 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5687 if (err)
5688 break;
5689 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5690 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
5691 auto it = profile.find("stripe_unit");
5692 if (it != profile.end()) {
5693 string err_str;
5694 stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5695 assert(err_str.empty());
5696 }
5697 *stripe_width = data_chunks *
5698 erasure_code->get_chunk_size(stripe_unit * data_chunks);
5699 }
5700 break;
5701 default:
5702 *ss << "prepare_pool_stripe_width: "
5703 << pool_type << " is not a known pool type";
5704 err = -EINVAL;
5705 break;
5706 }
5707 return err;
5708 }
5709
/**
 * Resolve the crush rule to use for a new pool.
 *
 * If *crush_rule is already >= 0, only verify that the ruleset exists.
 * Otherwise pick/create one based on the pool type:
 *  - replicated: use the configured default rule, or look up rule_name;
 *  - erasure: create (or find) a rule for the erasure code profile.
 *
 * @return 0 on success; -EAGAIN when a newly staged rule must be
 *         proposed before the caller can proceed; negative errno on
 *         failure (message in *ss).
 */
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// translate crush_rule_create_erasure()'s result: both a newly
	// staged rule (0) and an already-pending one (-EALREADY) require
	// the caller to wait and retry; an already-committed rule
	// (-EEXIST) is immediately usable.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule; just verify it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
5771
5772 int OSDMonitor::get_crush_rule(const string &rule_name,
5773 int *crush_rule,
5774 ostream *ss)
5775 {
5776 int ret;
5777 ret = osdmap.crush->get_rule_id(rule_name);
5778 if (ret != -ENOENT) {
5779 // found it, use it
5780 *crush_rule = ret;
5781 } else {
5782 CrushWrapper newcrush;
5783 _get_pending_crush(newcrush);
5784
5785 ret = newcrush.get_rule_id(rule_name);
5786 if (ret != -ENOENT) {
5787 // found it, wait for it to be proposed
5788 dout(20) << __func__ << ": rule " << rule_name
5789 << " try again" << dendl;
5790 return -EAGAIN;
5791 } else {
5792 // Cannot find it , return error
5793 *ss << "specified rule " << rule_name << " doesn't exist";
5794 return ret;
5795 }
5796 }
5797 return 0;
5798 }
5799
5800 /**
5801 * @param name The name of the new pool
5802 * @param auid The auid of the pool owner. Can be -1
5803 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
5805 * @param pg_num The pg_num to use. If set to 0, will use the system default
5806 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
5807 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
5808 * @param pool_type TYPE_ERASURE, or TYPE_REP
5809 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 ostream *ss)
{
  if (name.length() == 0)
    return -EINVAL;
  // fall back to configured defaults when the caller did not supply counts
  if (pg_num == 0)
    pg_num = g_conf->osd_pool_default_pg_num;
  if (pgp_num == 0)
    pgp_num = g_conf->osd_pool_default_pgp_num;
  if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf->mon_max_pool_pg_num
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read is an erasure-code-only feature; reject it for replicated pools
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // resolve (or create) the crush rule for this pool type/profile
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << " prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  if (g_conf->mon_osd_crush_smoke_test) {
    // sanity-check the chosen rule against the pending crush map before
    // committing to it; run in a forked child bounded by the mon lease
    // interval so a bad map cannot wedge the monitor.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << " tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  // derive size/min_size from the pool type (and EC profile, if any)
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
  if (r) {
    dout(10) << " prepare_pool_size returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // resolve the effective fast_read flag; only meaningful for EC pools
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf->mon_osd_pool_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // idempotency: if a pool with this name is already staged in the pending
  // incremental, treat the request as already satisfied
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  // allocate the next pool id and populate the staged pg_pool_t from the
  // configured per-pool defaults
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf->osd_pool_default_flags;
  if (g_conf->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // GMT hitsets require every up OSD to advertise the feature bit
  if (g_conf->osd_pool_use_gmt_hitset &&
      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  pi->set_pg_num(pg_num);
  pi->set_pgp_num(pgp_num);
  pi->last_change = pending_inc.epoch;
  pi->auid = auid;
  pi->erasure_code_profile = erasure_code_profile;
  pi->stripe_width = stripe_width;
  // cache-tier defaults; ratios are stored in micro units (ppm)
  pi->cache_target_dirty_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
5960
5961 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
5962 {
5963 op->mark_osdmon_event(__func__);
5964 ostringstream ss;
5965 if (pending_inc.new_flags < 0)
5966 pending_inc.new_flags = osdmap.get_flags();
5967 pending_inc.new_flags |= flag;
5968 ss << OSDMap::get_flag_string(flag) << " is set";
5969 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5970 get_last_committed() + 1));
5971 return true;
5972 }
5973
5974 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
5975 {
5976 op->mark_osdmon_event(__func__);
5977 ostringstream ss;
5978 if (pending_inc.new_flags < 0)
5979 pending_inc.new_flags = osdmap.get_flags();
5980 pending_inc.new_flags &= ~flag;
5981 ss << OSDMap::get_flag_string(flag) << " is unset";
5982 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5983 get_last_committed() + 1));
5984 return true;
5985 }
5986
// Handle `ceph osd pool set <pool> <var> <val>`: validate the requested
// variable/value against the pool (using any already-pending version of it)
// and stage the change in pending_inc.  Returns 0 on success with a
// confirmation message in ss, or a negative errno with a human-readable
// error in ss.
int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(g_ceph_context, cmdmap, "var", var);

  // work on the pending copy of the pool if one is already staged, so
  // multiple changes in one proposal compose
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
    // wasn't a string; maybe an older mon forwarded json with an int?
    if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
      return -EINVAL; // no value!
  } else {
    // we got a string. see if it contains an int.
    n = strict_strtoll(val.c_str(), 10, &interr);
    // or a float
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // these variables only make sense on a cache tier pool; refuse them on
  // a non-tier pool
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    p.size = n;
    // shrinking size below min_size drags min_size down with it
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // for EC pools min_size may not drop below k (the number of data
      // chunks), which we obtain from the pool's erasure code profile
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.rdbuf();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "auid") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.auid = n;
  } else if (var == "crash_replay_interval") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.crash_replay_interval = n;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    // pg_num can only grow: equal is a no-op (idempotent success), smaller
    // is an error
    if (n <= (int)p.get_pg_num()) {
      ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
      if (n < (int)p.get_pg_num())
	return -EEXIST;
      return 0;
    }
    if (n > (unsigned)g_conf->mon_max_pool_pg_num) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf->mon_max_pool_pg_num
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    string force;
    cmd_getval(g_ceph_context,cmdmap, "force", force);
    if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
	force != "--yes-i-really-mean-it") {
      ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
      return -EPERM;
    }
    // cap the number of new PGs created per OSD in one step
    int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
    int64_t new_pgs = n - p.get_pg_num();
    if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
      ss << "specified pg_num " << n << " is too large (creating "
	 << new_pgs << " new PGs on ~" << expected_osds
	 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
	 << ')';
      return -E2BIG;
    }
    p.set_pg_num(n);
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    // pgp_num can never exceed pg_num
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // changing hashpspool remaps every PG in the pool, so require explicit
    // confirmation
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    string force;
    cmd_getval(g_ceph_context, cmdmap, "force", force);
    if (force != "--yes-i-really-mean-it") {
      ss << "are you SURE? this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    }
    // false-positive probability only applies to bloom-filter hit sets
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // one-way switch: can only be enabled, and only when every up OSD
    // advertises the GMT hitset feature
    if (val == "true" || (interr.empty() && n == 1)) {
      if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
	ss << "not all OSDs support GMT hit set.";
	return -EINVAL;
      }
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // another one-way switch: overwrites can be enabled but never disabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // ratios are stored in micro units (uf = f * 1e6, computed above)
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // generic pool options (pool_opts_t): validate the well-known ones,
    // then store/unset by declared option type below
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      // preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    }

    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      // 0 means "unset" for integer-typed options
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      // 0 means "unset" for double-typed options
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending incremental
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
6474
6475 int OSDMonitor::prepare_command_pool_application(const string &prefix,
6476 map<string,cmd_vartype> &cmdmap,
6477 stringstream& ss)
6478 {
6479 string pool_name;
6480 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
6481 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6482 if (pool < 0) {
6483 ss << "unrecognized pool '" << pool_name << "'";
6484 return -ENOENT;
6485 }
6486
6487 pg_pool_t p = *osdmap.get_pg_pool(pool);
6488 if (pending_inc.new_pools.count(pool)) {
6489 p = pending_inc.new_pools[pool];
6490 }
6491
6492 string app;
6493 cmd_getval(g_ceph_context, cmdmap, "app", app);
6494 bool app_exists = (p.application_metadata.count(app) > 0);
6495
6496 if (boost::algorithm::ends_with(prefix, "enable")) {
6497 if (app.empty()) {
6498 ss << "application name must be provided";
6499 return -EINVAL;
6500 }
6501
6502 if (p.is_tier()) {
6503 ss << "application must be enabled on base tier";
6504 return -EINVAL;
6505 }
6506
6507 string force;
6508 cmd_getval(g_ceph_context, cmdmap, "force", force);
6509
6510 if (!app_exists && !p.application_metadata.empty() &&
6511 force != "--yes-i-really-mean-it") {
6512 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
6513 << "application; pass --yes-i-really-mean-it to proceed anyway";
6514 return -EPERM;
6515 }
6516
6517 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
6518 ss << "too many enabled applications on pool '" << pool_name << "'; "
6519 << "max " << MAX_POOL_APPLICATIONS;
6520 return -EINVAL;
6521 }
6522
6523 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
6524 ss << "application name '" << app << "' too long; max length "
6525 << MAX_POOL_APPLICATION_LENGTH;
6526 return -EINVAL;
6527 }
6528
6529 if (!app_exists) {
6530 p.application_metadata[app] = {};
6531 }
6532 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
6533
6534 } else if (boost::algorithm::ends_with(prefix, "disable")) {
6535 string force;
6536 cmd_getval(g_ceph_context, cmdmap, "force", force);
6537
6538 if (force != "--yes-i-really-mean-it") {
6539 ss << "Are you SURE? Disabling an application within a pool might result "
6540 << "in loss of application functionality; pass "
6541 << "--yes-i-really-mean-it to proceed anyway";
6542 return -EPERM;
6543 }
6544
6545 if (!app_exists) {
6546 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6547 << "'";
6548 return 0; // idempotent
6549 }
6550
6551 p.application_metadata.erase(app);
6552 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
6553
6554 } else if (boost::algorithm::ends_with(prefix, "set")) {
6555 if (p.is_tier()) {
6556 ss << "application metadata must be set on base tier";
6557 return -EINVAL;
6558 }
6559
6560 if (!app_exists) {
6561 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6562 << "'";
6563 return -ENOENT;
6564 }
6565
6566 string key;
6567 cmd_getval(g_ceph_context, cmdmap, "key", key);
6568
6569 if (key.empty()) {
6570 ss << "key must be provided";
6571 return -EINVAL;
6572 }
6573
6574 auto &app_keys = p.application_metadata[app];
6575 if (app_keys.count(key) == 0 &&
6576 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
6577 ss << "too many keys set for application '" << app << "' on pool '"
6578 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
6579 return -EINVAL;
6580 }
6581
6582 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
6583 ss << "key '" << app << "' too long; max length "
6584 << MAX_POOL_APPLICATION_LENGTH;
6585 return -EINVAL;
6586 }
6587
6588 string value;
6589 cmd_getval(g_ceph_context, cmdmap, "value", value);
6590 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
6591 ss << "value '" << value << "' too long; max length "
6592 << MAX_POOL_APPLICATION_LENGTH;
6593 return -EINVAL;
6594 }
6595
6596 p.application_metadata[app][key] = value;
6597 ss << "set application '" << app << "' key '" << key << "' to '"
6598 << value << "' on pool '" << pool_name << "'";
6599 } else if (boost::algorithm::ends_with(prefix, "rm")) {
6600 if (!app_exists) {
6601 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6602 << "'";
6603 return -ENOENT;
6604 }
6605
6606 string key;
6607 cmd_getval(g_ceph_context, cmdmap, "key", key);
6608 auto it = p.application_metadata[app].find(key);
6609 if (it == p.application_metadata[app].end()) {
6610 ss << "application '" << app << "' on pool '" << pool_name
6611 << "' does not have key '" << key << "'";
6612 return 0; // idempotent
6613 }
6614
6615 p.application_metadata[app].erase(it);
6616 ss << "removed application '" << app << "' key '" << key << "' on pool '"
6617 << pool_name << "'";
6618 } else {
6619 assert(false);
6620 }
6621
6622 p.last_change = pending_inc.epoch;
6623 pending_inc.new_pools[pool] = p;
6624 return 0;
6625 }
6626
6627 int OSDMonitor::_prepare_command_osd_crush_remove(
6628 CrushWrapper &newcrush,
6629 int32_t id,
6630 int32_t ancestor,
6631 bool has_ancestor,
6632 bool unlink_only)
6633 {
6634 int err = 0;
6635
6636 if (has_ancestor) {
6637 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
6638 unlink_only);
6639 } else {
6640 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
6641 }
6642 return err;
6643 }
6644
// Commit the (already modified) crush map into the pending incremental,
// encoding it with the feature bits common to the current monitor quorum.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
6650
6651 int OSDMonitor::prepare_command_osd_crush_remove(
6652 CrushWrapper &newcrush,
6653 int32_t id,
6654 int32_t ancestor,
6655 bool has_ancestor,
6656 bool unlink_only)
6657 {
6658 int err = _prepare_command_osd_crush_remove(
6659 newcrush, id, ancestor,
6660 has_ancestor, unlink_only);
6661
6662 if (err < 0)
6663 return err;
6664
6665 assert(err == 0);
6666 do_osd_crush_remove(newcrush);
6667
6668 return 0;
6669 }
6670
6671 int OSDMonitor::prepare_command_osd_remove(int32_t id)
6672 {
6673 if (osdmap.is_up(id)) {
6674 return -EBUSY;
6675 }
6676
6677 pending_inc.new_state[id] = osdmap.get_state(id);
6678 pending_inc.new_uuid[id] = uuid_d();
6679 pending_metadata_rm.insert(id);
6680 pending_metadata.erase(id);
6681
6682 return 0;
6683 }
6684
6685 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
6686 {
6687 assert(existing_id);
6688 *existing_id = -1;
6689
6690 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
6691 if (!osdmap.exists(i) &&
6692 pending_inc.new_up_client.count(i) == 0 &&
6693 (pending_inc.new_state.count(i) == 0 ||
6694 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
6695 *existing_id = i;
6696 return -1;
6697 }
6698 }
6699
6700 if (pending_inc.new_max_osd < 0) {
6701 return osdmap.get_max_osd();
6702 }
6703 return pending_inc.new_max_osd;
6704 }
6705
// Assign an id for a new/recreated OSD and stage its creation in the
// pending incremental.  `id` may be -1 (allocate one) and `uuid` may be
// zero (legacy `osd create`); the chosen id is returned via *new_id.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd: reuse that id (validation guaranteed
      // it matches `id` when one was supplied)
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycling a hole in the id space; exactly one of existing/allocated
    // is valid here
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;

  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

 out:
  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd (committed or pending) covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  // stage the id as existing-and-new, and record its uuid when provided
  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
6767
// Validate an `osd create` / `osd new` request against the committed map
// and the pending incremental.
//
// @param id               requested osd id, or -1 for "any".
// @param uuid             requested uuid; may be zero for legacy `osd create`.
// @param check_osd_exists whether an already-existing id is an error
//                         (false when recreating a destroyed osd).
// @param existing_id      out: id already bound to `uuid`, when found.
// @param ss               human readable error message, if any.
//
// @return 0 when creation may proceed, a negative errno on a blocking
//         error, or *positive* EEXIST when the osd already exists exactly
//         as requested (idempotent success; see comment below).
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
6838
6839 int OSDMonitor::prepare_command_osd_create(
6840 const int32_t id,
6841 const uuid_d& uuid,
6842 int32_t* existing_id,
6843 stringstream& ss)
6844 {
6845 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
6846 assert(existing_id);
6847 if (osdmap.is_destroyed(id)) {
6848 ss << "ceph osd create has been deprecated. Please use ceph osd new "
6849 "instead.";
6850 return -EINVAL;
6851 }
6852
6853 if (uuid.is_zero()) {
6854 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
6855 }
6856
6857 return validate_osd_create(id, uuid, true, existing_id, ss);
6858 }
6859
/**
 * Handle `ceph osd new`: create a brand-new osd, or recreate one previously
 * marked destroyed, optionally provisioning its cephx/lockbox/dm-crypt
 * secrets in the same logical operation.
 *
 * @param op       the monitor op being serviced (used for logging here)
 * @param cmdmap   parsed command arguments; `uuid` is required, `id` optional
 * @param secrets  optional map with keys "cephx_secret",
 *                 "cephx_lockbox_secret", "dmcrypt_key"
 * @param ss       human-readable status/error output
 * @param f        optional formatter for structured output
 * @return 0 on success, EEXIST (positive) when the request is idempotent
 *         (osd already exists with matching uuid/secrets), negative errno
 *         on failure.
 *
 * Requires paxos to be plugged by the caller: this function may touch the
 * authmon and config-key service as well as pending_inc, and those updates
 * must be proposed together.
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const map<string,cmd_vartype>& cmdmap,
    const map<string,string>& secrets,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // when recreating a destroyed osd we deliberately skip the "id already
  // exists" check (check_osd_exists=false) — the id is expected to exist.
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  // NOTE: a *positive* EEXIST from validate_osd_create() is not an error;
  // it signals "osd already exists with this uuid" (see that function's
  // comment) and makes the request potentially idempotent.
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    //       `osd create`, and we must honor it. So this means checking if
    //       the `id` is destroyed, and if so assume the destroy; otherwise,
    //       check if it `exists` - in which case we complain about not being
    //       `destroyed`. In the end, if nothing fails, we must allow the
    //       creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`: prefer a newly allocated id, else fall back to one
      // returned by the allocator via `existing_id`.
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    assert(id >= 0);
    assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  // fast path: osd already exists and no secrets were supplied to compare
  // against — report the id and return EEXIST (positive) for idempotency.
  if (may_be_idempotent && secrets.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no secrets -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = (!secrets.empty());

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (secrets.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = secrets.at("cephx_secret");

    bool has_lockbox_secret = (secrets.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (secrets.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = secrets.at("cephx_lockbox_secret");
      dmcrypt_key = secrets.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the two was supplied — they must come as a pair.
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    // positive EEXIST from validate_osd_new means "entity exists and the
    // supplied secrets match"; anything else while may_be_idempotent means
    // the osd exists but with different secrets — a hard conflict.
    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  assert(!has_secrets || !cephx_secret.empty());
  assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    assert(!cephx_secret.empty());
    assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    assert(0 == err);

    if (has_lockbox) {
      assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    assert(id >= 0);
    assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): pending_inc.new_state is applied as an XOR mask when
    // the incremental is committed, so OR-ing a bit here *toggles* it in
    // the committed map — confirm against OSDMap::Incremental::apply.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, &new_id);
    assert(new_id >= 0);
    assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
7113
7114 bool OSDMonitor::prepare_command(MonOpRequestRef op)
7115 {
7116 op->mark_osdmon_event(__func__);
7117 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7118 stringstream ss;
7119 map<string, cmd_vartype> cmdmap;
7120 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
7121 string rs = ss.str();
7122 mon->reply_command(op, -EINVAL, rs, get_last_committed());
7123 return true;
7124 }
7125
7126 MonSession *session = m->get_session();
7127 if (!session) {
7128 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
7129 return true;
7130 }
7131
7132 return prepare_command_impl(op, cmdmap);
7133 }
7134
7135 static int parse_reweights(CephContext *cct,
7136 const map<string,cmd_vartype> &cmdmap,
7137 const OSDMap& osdmap,
7138 map<int32_t, uint32_t>* weights)
7139 {
7140 string weights_str;
7141 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
7142 return -EINVAL;
7143 }
7144 std::replace(begin(weights_str), end(weights_str), '\'', '"');
7145 json_spirit::mValue json_value;
7146 if (!json_spirit::read(weights_str, json_value)) {
7147 return -EINVAL;
7148 }
7149 if (json_value.type() != json_spirit::obj_type) {
7150 return -EINVAL;
7151 }
7152 const auto obj = json_value.get_obj();
7153 try {
7154 for (auto& osd_weight : obj) {
7155 auto osd_id = std::stoi(osd_weight.first);
7156 if (!osdmap.exists(osd_id)) {
7157 return -ENOENT;
7158 }
7159 if (osd_weight.second.type() != json_spirit::str_type) {
7160 return -EINVAL;
7161 }
7162 auto weight = std::stoul(osd_weight.second.get_str());
7163 weights->insert({osd_id, weight});
7164 }
7165 } catch (const std::logic_error& e) {
7166 return -EINVAL;
7167 }
7168 return 0;
7169 }
7170
/**
 * Mark an osd destroyed: revoke its auth entities and config-key data and
 * stage CEPH_OSD_DESTROYED in pending_inc (the id and crush entry remain,
 * so the id can be reused by `osd new`).
 *
 * @param id  osd id to destroy
 * @param ss  human-readable status/error output
 * @return 0 on success (including the already-destroyed idempotent case),
 *         -ENOENT if the osd does not exist, other negative errno on
 *         auth validation failure.
 *
 * The caller must have plugged paxos and is responsible for proposing the
 * pending changes (see the trailing comment). Side effects on the authmon
 * and config-key service happen here, before the osdmap change commits.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  // track which subsystems already have no state for this osd, so we only
  // mutate the ones that still do.
  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already gone; nothing to remove there.
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // -ENOENT is the only error validate_osd_destroy is expected to return
    // here; config keys already gone.
    assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: replace (not OR) the pending state with
  // DESTROYED and clear the uuid so it can be reused.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  assert(err == 0);
  return 0;
}
7242
/**
 * Completely remove an osd: take it out of crush, destroy its secrets, and
 * free its id.
 *
 * @param id  osd id to purge; must not be up (asserted)
 * @param ss  human-readable status/error output
 * @return 0 on success, -ENOENT if the osd was already fully purged
 *         (idempotent case), other negative errno on failure.
 *
 * The caller must have plugged paxos and is responsible for proposing the
 * pending changes. The ordering below is deliberate — see the numbered
 * comment: fallible steps run first, and the crush removal is staged early
 * but applied last so nothing is mutated before all checks pass.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, note that this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: stage the crush removal in `newcrush` without committing it.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // already gone from crush; possibly a replayed purge.
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: destroy (auth + config-key side effects happen here).
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // we actually destroyed something, so this run is not a pure replay.
      may_be_idempotent = false;
    }
  }
  assert(0 == err);

  // fully idempotent replay: nothing in crush, nothing in the map.
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: remove the osd id from the map.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  assert(0 == err);

  // step 4: apply the crush update staged in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
7311
7312 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
7313 map<string,cmd_vartype> &cmdmap)
7314 {
7315 op->mark_osdmon_event(__func__);
7316 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7317 bool ret = false;
7318 stringstream ss;
7319 string rs;
7320 bufferlist rdata;
7321 int err = 0;
7322
7323 string format;
7324 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
7325 boost::scoped_ptr<Formatter> f(Formatter::create(format));
7326
7327 string prefix;
7328 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
7329
7330 int64_t osdid;
7331 string name;
7332 bool osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
7333 if (osdid_present) {
7334 ostringstream oss;
7335 oss << "osd." << osdid;
7336 name = oss.str();
7337 }
7338
7339 // Even if there's a pending state with changes that could affect
7340 // a command, considering that said state isn't yet committed, we
7341 // just don't care about those changes if the command currently being
7342 // handled acts as a no-op against the current committed state.
7343 // In a nutshell, we assume this command happens *before*.
7344 //
7345 // Let me make this clearer:
7346 //
7347 // - If we have only one client, and that client issues some
7348 // operation that would conflict with this operation but is
7349 // still on the pending state, then we would be sure that said
7350 // operation wouldn't have returned yet, so the client wouldn't
7351 // issue this operation (unless the client didn't wait for the
7352 // operation to finish, and that would be the client's own fault).
7353 //
7354 // - If we have more than one client, each client will observe
7355 // whatever is the state at the moment of the commit. So, if we
7356 // have two clients, one issuing an unlink and another issuing a
7357 // link, and if the link happens while the unlink is still on the
7358 // pending state, from the link's point-of-view this is a no-op.
7359 // If different clients are issuing conflicting operations and
7360 // they care about that, then the clients should make sure they
7361 // enforce some kind of concurrency mechanism -- from our
7362 // perspective that's what Douglas Adams would call an SEP.
7363 //
7364 // This should be used as a general guideline for most commands handled
7365 // in this function. Adapt as you see fit, but please bear in mind that
7366 // this is the expected behavior.
7367
7368
7369 if (prefix == "osd setcrushmap" ||
7370 (prefix == "osd crush set" && !osdid_present)) {
7371 if (pending_inc.crush.length()) {
7372 dout(10) << __func__ << " waiting for pending crush update " << dendl;
7373 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
7374 return true;
7375 }
7376 dout(10) << "prepare_command setting new crush map" << dendl;
7377 bufferlist data(m->get_data());
7378 CrushWrapper crush;
7379 try {
7380 bufferlist::iterator bl(data.begin());
7381 crush.decode(bl);
7382 }
7383 catch (const std::exception &e) {
7384 err = -EINVAL;
7385 ss << "Failed to parse crushmap: " << e.what();
7386 goto reply;
7387 }
7388
7389 int64_t prior_version = 0;
7390 if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
7391 if (prior_version == osdmap.get_crush_version() - 1) {
7392 // see if we are a resend of the last update. this is imperfect
7393 // (multiple racing updaters may not both get reliable success)
7394 // but we expect crush updaters (via this interface) to be rare-ish.
7395 bufferlist current, proposed;
7396 osdmap.crush->encode(current, mon->get_quorum_con_features());
7397 crush.encode(proposed, mon->get_quorum_con_features());
7398 if (current.contents_equal(proposed)) {
7399 dout(10) << __func__
7400 << " proposed matches current and version equals previous"
7401 << dendl;
7402 err = 0;
7403 ss << osdmap.get_crush_version();
7404 goto reply;
7405 }
7406 }
7407 if (prior_version != osdmap.get_crush_version()) {
7408 err = -EPERM;
7409 ss << "prior_version " << prior_version << " != crush version "
7410 << osdmap.get_crush_version();
7411 goto reply;
7412 }
7413 }
7414
7415 if (crush.has_legacy_rulesets()) {
7416 err = -EINVAL;
7417 ss << "crush maps with ruleset != ruleid are no longer allowed";
7418 goto reply;
7419 }
7420 if (!validate_crush_against_features(&crush, ss)) {
7421 err = -EINVAL;
7422 goto reply;
7423 }
7424
7425 const auto& osdmap_pools = osdmap.get_pools();
7426 for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
7427 const int64_t pool_id = pit->first;
7428 const pg_pool_t &pool = pit->second;
7429 int ruleno = pool.get_crush_rule();
7430 if (!crush.rule_exists(ruleno)) {
7431 ss << " the crush rule no "<< ruleno << " for pool id " << pool_id << " is in use";
7432 err = -EINVAL;
7433 goto reply;
7434 }
7435 }
7436
7437 if (g_conf->mon_osd_crush_smoke_test) {
7438 // sanity check: test some inputs to make sure this map isn't
7439 // totally broken
7440 dout(10) << " testing map" << dendl;
7441 stringstream ess;
7442 CrushTester tester(crush, ess);
7443 tester.set_min_x(0);
7444 tester.set_max_x(50);
7445 auto start = ceph::coarse_mono_clock::now();
7446 int r = tester.test_with_fork(g_conf->mon_lease);
7447 auto duration = ceph::coarse_mono_clock::now() - start;
7448 if (r < 0) {
7449 dout(10) << " tester.test_with_fork returns " << r
7450 << ": " << ess.str() << dendl;
7451 ss << "crush smoke test failed with " << r << ": " << ess.str();
7452 err = r;
7453 goto reply;
7454 }
7455 dout(10) << __func__ << " crush somke test duration: "
7456 << duration << ", result: " << ess.str() << dendl;
7457 }
7458
7459 pending_inc.crush = data;
7460 ss << osdmap.get_crush_version() + 1;
7461 goto update;
7462
7463 } else if (prefix == "osd crush set-device-class") {
7464 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7465 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7466 << "luminous' before using crush device classes";
7467 err = -EPERM;
7468 goto reply;
7469 }
7470
7471 string device_class;
7472 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7473 err = -EINVAL; // no value!
7474 goto reply;
7475 }
7476
7477 bool stop = false;
7478 vector<string> idvec;
7479 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7480 CrushWrapper newcrush;
7481 _get_pending_crush(newcrush);
7482 set<int> updated;
7483 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7484 set<int> osds;
7485 // wildcard?
7486 if (j == 0 &&
7487 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7488 osdmap.get_all_osds(osds);
7489 stop = true;
7490 } else {
7491 // try traditional single osd way
7492 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7493 if (osd < 0) {
7494 // ss has reason for failure
7495 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7496 err = -EINVAL;
7497 continue;
7498 }
7499 osds.insert(osd);
7500 }
7501
7502 for (auto &osd : osds) {
7503 if (!osdmap.exists(osd)) {
7504 ss << "osd." << osd << " does not exist. ";
7505 continue;
7506 }
7507
7508 ostringstream oss;
7509 oss << "osd." << osd;
7510 string name = oss.str();
7511
7512 string action;
7513 if (newcrush.item_exists(osd)) {
7514 action = "updating";
7515 } else {
7516 action = "creating";
7517 newcrush.set_item_name(osd, name);
7518 }
7519
7520 dout(5) << action << " crush item id " << osd << " name '" << name
7521 << "' device_class '" << device_class << "'"
7522 << dendl;
7523 err = newcrush.update_device_class(osd, device_class, name, &ss);
7524 if (err < 0) {
7525 goto reply;
7526 }
7527 if (err == 0 && !_have_pending_crush()) {
7528 if (!stop) {
7529 // for single osd only, wildcard makes too much noise
7530 ss << "set-device-class item id " << osd << " name '" << name
7531 << "' device_class '" << device_class << "': no change";
7532 }
7533 } else {
7534 updated.insert(osd);
7535 }
7536 }
7537 }
7538
7539 if (!updated.empty()) {
7540 pending_inc.crush.clear();
7541 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7542 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
7543 getline(ss, rs);
7544 wait_for_finished_proposal(op,
7545 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7546 return true;
7547 }
7548
7549 } else if (prefix == "osd crush rm-device-class") {
7550 bool stop = false;
7551 vector<string> idvec;
7552 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7553 CrushWrapper newcrush;
7554 _get_pending_crush(newcrush);
7555 set<int> updated;
7556
7557 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7558 set<int> osds;
7559
7560 // wildcard?
7561 if (j == 0 &&
7562 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7563 osdmap.get_all_osds(osds);
7564 stop = true;
7565 } else {
7566 // try traditional single osd way
7567 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7568 if (osd < 0) {
7569 // ss has reason for failure
7570 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7571 err = -EINVAL;
7572 goto reply;
7573 }
7574 osds.insert(osd);
7575 }
7576
7577 for (auto &osd : osds) {
7578 if (!osdmap.exists(osd)) {
7579 ss << "osd." << osd << " does not exist. ";
7580 continue;
7581 }
7582
7583 auto class_name = newcrush.get_item_class(osd);
7584 if (!class_name) {
7585 ss << "osd." << osd << " belongs to no class, ";
7586 continue;
7587 }
7588 // note that we do not verify if class_is_in_use here
7589 // in case the device is misclassified and user wants
7590 // to overridely reset...
7591
7592 err = newcrush.remove_device_class(g_ceph_context, osd, &ss);
7593 if (err < 0) {
7594 // ss has reason for failure
7595 goto reply;
7596 }
7597 updated.insert(osd);
7598 }
7599 }
7600
7601 if (!updated.empty()) {
7602 pending_inc.crush.clear();
7603 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7604 ss << "done removing class of osd(s): " << updated;
7605 getline(ss, rs);
7606 wait_for_finished_proposal(op,
7607 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7608 return true;
7609 }
7610 } else if (prefix == "osd crush class rename") {
7611 string srcname, dstname;
7612 if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
7613 err = -EINVAL;
7614 goto reply;
7615 }
7616 if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
7617 err = -EINVAL;
7618 goto reply;
7619 }
7620
7621 CrushWrapper newcrush;
7622 _get_pending_crush(newcrush);
7623 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
7624 // suppose this is a replay and return success
7625 // so command is idempotent
7626 ss << "already renamed to '" << dstname << "'";
7627 err = 0;
7628 goto reply;
7629 }
7630
7631 err = newcrush.rename_class(srcname, dstname);
7632 if (err < 0) {
7633 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
7634 << cpp_strerror(err);
7635 goto reply;
7636 }
7637
7638 pending_inc.crush.clear();
7639 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7640 ss << "rename class '" << srcname << "' to '" << dstname << "'";
7641 goto update;
7642 } else if (prefix == "osd crush add-bucket") {
7643 // os crush add-bucket <name> <type>
7644 string name, typestr;
7645 cmd_getval(g_ceph_context, cmdmap, "name", name);
7646 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
7647
7648 if (!_have_pending_crush() &&
7649 _get_stable_crush().name_exists(name)) {
7650 ss << "bucket '" << name << "' already exists";
7651 goto reply;
7652 }
7653
7654 CrushWrapper newcrush;
7655 _get_pending_crush(newcrush);
7656
7657 if (newcrush.name_exists(name)) {
7658 ss << "bucket '" << name << "' already exists";
7659 goto update;
7660 }
7661 int type = newcrush.get_type_id(typestr);
7662 if (type < 0) {
7663 ss << "type '" << typestr << "' does not exist";
7664 err = -EINVAL;
7665 goto reply;
7666 }
7667 if (type == 0) {
7668 ss << "type '" << typestr << "' is for devices, not buckets";
7669 err = -EINVAL;
7670 goto reply;
7671 }
7672 int bucketno;
7673 err = newcrush.add_bucket(0, 0,
7674 CRUSH_HASH_DEFAULT, type, 0, NULL,
7675 NULL, &bucketno);
7676 if (err < 0) {
7677 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
7678 goto reply;
7679 }
7680 err = newcrush.set_item_name(bucketno, name);
7681 if (err < 0) {
7682 ss << "error setting bucket name to '" << name << "'";
7683 goto reply;
7684 }
7685
7686 pending_inc.crush.clear();
7687 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7688 ss << "added bucket " << name << " type " << typestr
7689 << " to crush map";
7690 goto update;
7691 } else if (prefix == "osd crush rename-bucket") {
7692 string srcname, dstname;
7693 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
7694 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
7695
7696 err = crush_rename_bucket(srcname, dstname, &ss);
7697 if (err == -EALREADY) // equivalent to success for idempotency
7698 err = 0;
7699 if (err)
7700 goto reply;
7701 else
7702 goto update;
7703 } else if (prefix == "osd crush weight-set create" ||
7704 prefix == "osd crush weight-set create-compat") {
7705 CrushWrapper newcrush;
7706 _get_pending_crush(newcrush);
7707 int64_t pool;
7708 int positions;
7709 if (newcrush.has_non_straw2_buckets()) {
7710 ss << "crush map contains one or more bucket(s) that are not straw2";
7711 err = -EPERM;
7712 goto reply;
7713 }
7714 if (prefix == "osd crush weight-set create") {
7715 if (osdmap.require_min_compat_client > 0 &&
7716 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
7717 ss << "require_min_compat_client "
7718 << ceph_release_name(osdmap.require_min_compat_client)
7719 << " < luminous, which is required for per-pool weight-sets. "
7720 << "Try 'ceph osd set-require-min-compat-client luminous' "
7721 << "before using the new interface";
7722 err = -EPERM;
7723 goto reply;
7724 }
7725 string poolname, mode;
7726 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
7727 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
7728 if (pool < 0) {
7729 ss << "pool '" << poolname << "' not found";
7730 err = -ENOENT;
7731 goto reply;
7732 }
7733 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
7734 if (mode != "flat" && mode != "positional") {
7735 ss << "unrecognized weight-set mode '" << mode << "'";
7736 err = -EINVAL;
7737 goto reply;
7738 }
7739 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
7740 } else {
7741 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
7742 positions = 1;
7743 }
7744 newcrush.create_choose_args(pool, positions);
7745 pending_inc.crush.clear();
7746 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7747 goto update;
7748
7749 } else if (prefix == "osd crush weight-set rm" ||
7750 prefix == "osd crush weight-set rm-compat") {
7751 CrushWrapper newcrush;
7752 _get_pending_crush(newcrush);
7753 int64_t pool;
7754 if (prefix == "osd crush weight-set rm") {
7755 string poolname;
7756 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
7757 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
7758 if (pool < 0) {
7759 ss << "pool '" << poolname << "' not found";
7760 err = -ENOENT;
7761 goto reply;
7762 }
7763 } else {
7764 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
7765 }
7766 newcrush.rm_choose_args(pool);
7767 pending_inc.crush.clear();
7768 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7769 goto update;
7770
7771 } else if (prefix == "osd crush weight-set reweight" ||
7772 prefix == "osd crush weight-set reweight-compat") {
7773 string poolname, item;
7774 vector<double> weight;
7775 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
7776 cmd_getval(g_ceph_context, cmdmap, "item", item);
7777 cmd_getval(g_ceph_context, cmdmap, "weight", weight);
7778 CrushWrapper newcrush;
7779 _get_pending_crush(newcrush);
7780 int64_t pool;
7781 if (prefix == "osd crush weight-set reweight") {
7782 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
7783 if (pool < 0) {
7784 ss << "pool '" << poolname << "' not found";
7785 err = -ENOENT;
7786 goto reply;
7787 }
7788 if (!newcrush.have_choose_args(pool)) {
7789 ss << "no weight-set for pool '" << poolname << "'";
7790 err = -ENOENT;
7791 goto reply;
7792 }
7793 auto arg_map = newcrush.choose_args_get(pool);
7794 int positions = newcrush.get_choose_args_positions(arg_map);
7795 if (weight.size() != (size_t)positions) {
7796 ss << "must specify exact " << positions << " weight values";
7797 err = -EINVAL;
7798 goto reply;
7799 }
7800 } else {
7801 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
7802 if (!newcrush.have_choose_args(pool)) {
7803 ss << "no backward-compatible weight-set";
7804 err = -ENOENT;
7805 goto reply;
7806 }
7807 }
7808 if (!newcrush.name_exists(item)) {
7809 ss << "item '" << item << "' does not exist";
7810 err = -ENOENT;
7811 goto reply;
7812 }
7813 err = newcrush.choose_args_adjust_item_weightf(
7814 g_ceph_context,
7815 newcrush.choose_args_get(pool),
7816 newcrush.get_item_id(item),
7817 weight,
7818 &ss);
7819 if (err < 0) {
7820 goto reply;
7821 }
7822 err = 0;
7823 pending_inc.crush.clear();
7824 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7825 goto update;
7826 } else if (osdid_present &&
7827 (prefix == "osd crush set" || prefix == "osd crush add")) {
7828 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
7829 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
7830 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
7831
7832 if (!osdmap.exists(osdid)) {
7833 err = -ENOENT;
7834 ss << name << " does not exist. Create it before updating the crush map";
7835 goto reply;
7836 }
7837
7838 double weight;
7839 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7840 ss << "unable to parse weight value '"
7841 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7842 err = -EINVAL;
7843 goto reply;
7844 }
7845
7846 string args;
7847 vector<string> argvec;
7848 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7849 map<string,string> loc;
7850 CrushWrapper::parse_loc_map(argvec, &loc);
7851
7852 if (prefix == "osd crush set"
7853 && !_get_stable_crush().item_exists(osdid)) {
7854 err = -ENOENT;
7855 ss << "unable to set item id " << osdid << " name '" << name
7856 << "' weight " << weight << " at location " << loc
7857 << ": does not exist";
7858 goto reply;
7859 }
7860
7861 dout(5) << "adding/updating crush item id " << osdid << " name '"
7862 << name << "' weight " << weight << " at location "
7863 << loc << dendl;
7864 CrushWrapper newcrush;
7865 _get_pending_crush(newcrush);
7866
7867 string action;
7868 if (prefix == "osd crush set" ||
7869 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
7870 action = "set";
7871 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
7872 } else {
7873 action = "add";
7874 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
7875 if (err == 0)
7876 err = 1;
7877 }
7878
7879 if (err < 0)
7880 goto reply;
7881
7882 if (err == 0 && !_have_pending_crush()) {
7883 ss << action << " item id " << osdid << " name '" << name << "' weight "
7884 << weight << " at location " << loc << ": no change";
7885 goto reply;
7886 }
7887
7888 pending_inc.crush.clear();
7889 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7890 ss << action << " item id " << osdid << " name '" << name << "' weight "
7891 << weight << " at location " << loc << " to crush map";
7892 getline(ss, rs);
7893 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7894 get_last_committed() + 1));
7895 return true;
7896
7897 } else if (prefix == "osd crush create-or-move") {
7898 do {
7899 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
7900 if (!osdmap.exists(osdid)) {
7901 err = -ENOENT;
7902 ss << name << " does not exist. create it before updating the crush map";
7903 goto reply;
7904 }
7905
7906 double weight;
7907 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7908 ss << "unable to parse weight value '"
7909 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7910 err = -EINVAL;
7911 goto reply;
7912 }
7913
7914 string args;
7915 vector<string> argvec;
7916 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7917 map<string,string> loc;
7918 CrushWrapper::parse_loc_map(argvec, &loc);
7919
7920 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
7921 << " at location " << loc << dendl;
7922
7923 CrushWrapper newcrush;
7924 _get_pending_crush(newcrush);
7925
7926 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
7927 if (err == 0) {
7928 ss << "create-or-move updated item name '" << name << "' weight " << weight
7929 << " at location " << loc << " to crush map";
7930 break;
7931 }
7932 if (err > 0) {
7933 pending_inc.crush.clear();
7934 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7935 ss << "create-or-move updating item name '" << name << "' weight " << weight
7936 << " at location " << loc << " to crush map";
7937 getline(ss, rs);
7938 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7939 get_last_committed() + 1));
7940 return true;
7941 }
7942 } while (false);
7943
7944 } else if (prefix == "osd crush move") {
7945 do {
7946 // osd crush move <name> <loc1> [<loc2> ...]
7947
7948 string args;
7949 vector<string> argvec;
7950 cmd_getval(g_ceph_context, cmdmap, "name", name);
7951 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7952 map<string,string> loc;
7953 CrushWrapper::parse_loc_map(argvec, &loc);
7954
7955 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
7956 CrushWrapper newcrush;
7957 _get_pending_crush(newcrush);
7958
7959 if (!newcrush.name_exists(name)) {
7960 err = -ENOENT;
7961 ss << "item " << name << " does not exist";
7962 break;
7963 }
7964 int id = newcrush.get_item_id(name);
7965
7966 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
7967 if (id >= 0) {
7968 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
7969 } else {
7970 err = newcrush.move_bucket(g_ceph_context, id, loc);
7971 }
7972 if (err >= 0) {
7973 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7974 pending_inc.crush.clear();
7975 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7976 getline(ss, rs);
7977 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7978 get_last_committed() + 1));
7979 return true;
7980 }
7981 } else {
7982 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7983 err = 0;
7984 }
7985 } while (false);
7986 } else if (prefix == "osd crush swap-bucket") {
7987 string source, dest, force;
7988 cmd_getval(g_ceph_context, cmdmap, "source", source);
7989 cmd_getval(g_ceph_context, cmdmap, "dest", dest);
7990 cmd_getval(g_ceph_context, cmdmap, "force", force);
7991 CrushWrapper newcrush;
7992 _get_pending_crush(newcrush);
7993 if (!newcrush.name_exists(source)) {
7994 ss << "source item " << source << " does not exist";
7995 err = -ENOENT;
7996 goto reply;
7997 }
7998 if (!newcrush.name_exists(dest)) {
7999 ss << "dest item " << dest << " does not exist";
8000 err = -ENOENT;
8001 goto reply;
8002 }
8003 int sid = newcrush.get_item_id(source);
8004 int did = newcrush.get_item_id(dest);
8005 int sparent;
8006 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
8007 force != "--yes-i-really-mean-it") {
8008 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8009 err = -EPERM;
8010 goto reply;
8011 }
8012 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
8013 force != "--yes-i-really-mean-it") {
8014 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
8015 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
8016 << "; pass --yes-i-really-mean-it to proceed anyway";
8017 err = -EPERM;
8018 goto reply;
8019 }
8020 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
8021 if (r < 0) {
8022 ss << "failed to swap bucket contents: " << cpp_strerror(r);
8023 err = r;
8024 goto reply;
8025 }
8026 ss << "swapped bucket of " << source << " to " << dest;
8027 pending_inc.crush.clear();
8028 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8029 wait_for_finished_proposal(op,
8030 new Monitor::C_Command(mon, op, err, ss.str(),
8031 get_last_committed() + 1));
8032 return true;
8033 } else if (prefix == "osd crush link") {
8034 // osd crush link <name> <loc1> [<loc2> ...]
8035 string name;
8036 cmd_getval(g_ceph_context, cmdmap, "name", name);
8037 vector<string> argvec;
8038 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8039 map<string,string> loc;
8040 CrushWrapper::parse_loc_map(argvec, &loc);
8041
8042 // Need an explicit check for name_exists because get_item_id returns
8043 // 0 on unfound.
8044 int id = osdmap.crush->get_item_id(name);
8045 if (!osdmap.crush->name_exists(name)) {
8046 err = -ENOENT;
8047 ss << "item " << name << " does not exist";
8048 goto reply;
8049 } else {
8050 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
8051 }
8052 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
8053 ss << "no need to move item id " << id << " name '" << name
8054 << "' to location " << loc << " in crush map";
8055 err = 0;
8056 goto reply;
8057 }
8058
8059 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
8060 CrushWrapper newcrush;
8061 _get_pending_crush(newcrush);
8062
8063 if (!newcrush.name_exists(name)) {
8064 err = -ENOENT;
8065 ss << "item " << name << " does not exist";
8066 goto reply;
8067 } else {
8068 int id = newcrush.get_item_id(name);
8069 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8070 err = newcrush.link_bucket(g_ceph_context, id, loc);
8071 if (err >= 0) {
8072 ss << "linked item id " << id << " name '" << name
8073 << "' to location " << loc << " in crush map";
8074 pending_inc.crush.clear();
8075 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8076 } else {
8077 ss << "cannot link item id " << id << " name '" << name
8078 << "' to location " << loc;
8079 goto reply;
8080 }
8081 } else {
8082 ss << "no need to move item id " << id << " name '" << name
8083 << "' to location " << loc << " in crush map";
8084 err = 0;
8085 }
8086 }
8087 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
8088 get_last_committed() + 1));
8089 return true;
8090 } else if (prefix == "osd crush rm" ||
8091 prefix == "osd crush remove" ||
8092 prefix == "osd crush unlink") {
8093 do {
8094 // osd crush rm <id> [ancestor]
8095 CrushWrapper newcrush;
8096 _get_pending_crush(newcrush);
8097
8098 string name;
8099 cmd_getval(g_ceph_context, cmdmap, "name", name);
8100
8101 if (!osdmap.crush->name_exists(name)) {
8102 err = 0;
8103 ss << "device '" << name << "' does not appear in the crush map";
8104 break;
8105 }
8106 if (!newcrush.name_exists(name)) {
8107 err = 0;
8108 ss << "device '" << name << "' does not appear in the crush map";
8109 getline(ss, rs);
8110 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8111 get_last_committed() + 1));
8112 return true;
8113 }
8114 int id = newcrush.get_item_id(name);
8115 int ancestor = 0;
8116
8117 bool unlink_only = prefix == "osd crush unlink";
8118 string ancestor_str;
8119 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
8120 if (!newcrush.name_exists(ancestor_str)) {
8121 err = -ENOENT;
8122 ss << "ancestor item '" << ancestor_str
8123 << "' does not appear in the crush map";
8124 break;
8125 }
8126 ancestor = newcrush.get_item_id(ancestor_str);
8127 }
8128
8129 err = prepare_command_osd_crush_remove(
8130 newcrush,
8131 id, ancestor,
8132 (ancestor < 0), unlink_only);
8133
8134 if (err == -ENOENT) {
8135 ss << "item " << id << " does not appear in that position";
8136 err = 0;
8137 break;
8138 }
8139 if (err == 0) {
8140 ss << "removed item id " << id << " name '" << name << "' from crush map";
8141 getline(ss, rs);
8142 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8143 get_last_committed() + 1));
8144 return true;
8145 }
8146 } while (false);
8147
8148 } else if (prefix == "osd crush reweight-all") {
8149 CrushWrapper newcrush;
8150 _get_pending_crush(newcrush);
8151
8152 newcrush.reweight(g_ceph_context);
8153 pending_inc.crush.clear();
8154 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8155 ss << "reweighted crush hierarchy";
8156 getline(ss, rs);
8157 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8158 get_last_committed() + 1));
8159 return true;
8160 } else if (prefix == "osd crush reweight") {
8161 // osd crush reweight <name> <weight>
8162 CrushWrapper newcrush;
8163 _get_pending_crush(newcrush);
8164
8165 string name;
8166 cmd_getval(g_ceph_context, cmdmap, "name", name);
8167 if (!newcrush.name_exists(name)) {
8168 err = -ENOENT;
8169 ss << "device '" << name << "' does not appear in the crush map";
8170 goto reply;
8171 }
8172
8173 int id = newcrush.get_item_id(name);
8174 if (id < 0) {
8175 ss << "device '" << name << "' is not a leaf in the crush map";
8176 err = -EINVAL;
8177 goto reply;
8178 }
8179 double w;
8180 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8181 ss << "unable to parse weight value '"
8182 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8183 err = -EINVAL;
8184 goto reply;
8185 }
8186
8187 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
8188 if (err < 0)
8189 goto reply;
8190 pending_inc.crush.clear();
8191 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8192 ss << "reweighted item id " << id << " name '" << name << "' to " << w
8193 << " in crush map";
8194 getline(ss, rs);
8195 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8196 get_last_committed() + 1));
8197 return true;
8198 } else if (prefix == "osd crush reweight-subtree") {
8199 // osd crush reweight <name> <weight>
8200 CrushWrapper newcrush;
8201 _get_pending_crush(newcrush);
8202
8203 string name;
8204 cmd_getval(g_ceph_context, cmdmap, "name", name);
8205 if (!newcrush.name_exists(name)) {
8206 err = -ENOENT;
8207 ss << "device '" << name << "' does not appear in the crush map";
8208 goto reply;
8209 }
8210
8211 int id = newcrush.get_item_id(name);
8212 if (id >= 0) {
8213 ss << "device '" << name << "' is not a subtree in the crush map";
8214 err = -EINVAL;
8215 goto reply;
8216 }
8217 double w;
8218 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8219 ss << "unable to parse weight value '"
8220 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8221 err = -EINVAL;
8222 goto reply;
8223 }
8224
8225 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
8226 if (err < 0)
8227 goto reply;
8228 pending_inc.crush.clear();
8229 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8230 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
8231 << " in crush map";
8232 getline(ss, rs);
8233 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8234 get_last_committed() + 1));
8235 return true;
8236 } else if (prefix == "osd crush tunables") {
8237 CrushWrapper newcrush;
8238 _get_pending_crush(newcrush);
8239
8240 err = 0;
8241 string profile;
8242 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8243 if (profile == "legacy" || profile == "argonaut") {
8244 newcrush.set_tunables_legacy();
8245 } else if (profile == "bobtail") {
8246 newcrush.set_tunables_bobtail();
8247 } else if (profile == "firefly") {
8248 newcrush.set_tunables_firefly();
8249 } else if (profile == "hammer") {
8250 newcrush.set_tunables_hammer();
8251 } else if (profile == "jewel") {
8252 newcrush.set_tunables_jewel();
8253 } else if (profile == "optimal") {
8254 newcrush.set_tunables_optimal();
8255 } else if (profile == "default") {
8256 newcrush.set_tunables_default();
8257 } else {
8258 ss << "unrecognized profile '" << profile << "'";
8259 err = -EINVAL;
8260 goto reply;
8261 }
8262
8263 if (!validate_crush_against_features(&newcrush, ss)) {
8264 err = -EINVAL;
8265 goto reply;
8266 }
8267
8268 pending_inc.crush.clear();
8269 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8270 ss << "adjusted tunables profile to " << profile;
8271 getline(ss, rs);
8272 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8273 get_last_committed() + 1));
8274 return true;
8275 } else if (prefix == "osd crush set-tunable") {
8276 CrushWrapper newcrush;
8277 _get_pending_crush(newcrush);
8278
8279 err = 0;
8280 string tunable;
8281 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
8282
8283 int64_t value = -1;
8284 if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
8285 err = -EINVAL;
8286 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
8287 goto reply;
8288 }
8289
8290 if (tunable == "straw_calc_version") {
8291 if (value != 0 && value != 1) {
8292 ss << "value must be 0 or 1; got " << value;
8293 err = -EINVAL;
8294 goto reply;
8295 }
8296 newcrush.set_straw_calc_version(value);
8297 } else {
8298 ss << "unrecognized tunable '" << tunable << "'";
8299 err = -EINVAL;
8300 goto reply;
8301 }
8302
8303 if (!validate_crush_against_features(&newcrush, ss)) {
8304 err = -EINVAL;
8305 goto reply;
8306 }
8307
8308 pending_inc.crush.clear();
8309 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8310 ss << "adjusted tunable " << tunable << " to " << value;
8311 getline(ss, rs);
8312 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8313 get_last_committed() + 1));
8314 return true;
8315
8316 } else if (prefix == "osd crush rule create-simple") {
8317 string name, root, type, mode;
8318 cmd_getval(g_ceph_context, cmdmap, "name", name);
8319 cmd_getval(g_ceph_context, cmdmap, "root", root);
8320 cmd_getval(g_ceph_context, cmdmap, "type", type);
8321 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8322 if (mode == "")
8323 mode = "firstn";
8324
8325 if (osdmap.crush->rule_exists(name)) {
8326 // The name is uniquely associated to a ruleid and the rule it contains
8327 // From the user point of view, the rule is more meaningfull.
8328 ss << "rule " << name << " already exists";
8329 err = 0;
8330 goto reply;
8331 }
8332
8333 CrushWrapper newcrush;
8334 _get_pending_crush(newcrush);
8335
8336 if (newcrush.rule_exists(name)) {
8337 // The name is uniquely associated to a ruleid and the rule it contains
8338 // From the user point of view, the rule is more meaningfull.
8339 ss << "rule " << name << " already exists";
8340 err = 0;
8341 } else {
8342 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
8343 pg_pool_t::TYPE_REPLICATED, &ss);
8344 if (ruleno < 0) {
8345 err = ruleno;
8346 goto reply;
8347 }
8348
8349 pending_inc.crush.clear();
8350 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8351 }
8352 getline(ss, rs);
8353 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8354 get_last_committed() + 1));
8355 return true;
8356
8357 } else if (prefix == "osd crush rule create-replicated") {
8358 string name, root, type, device_class;
8359 cmd_getval(g_ceph_context, cmdmap, "name", name);
8360 cmd_getval(g_ceph_context, cmdmap, "root", root);
8361 cmd_getval(g_ceph_context, cmdmap, "type", type);
8362 cmd_getval(g_ceph_context, cmdmap, "class", device_class);
8363
8364 if (!device_class.empty()) {
8365 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8366 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8367 << "luminous' before using crush device classes";
8368 err = -EPERM;
8369 goto reply;
8370 }
8371 }
8372
8373 if (osdmap.crush->rule_exists(name)) {
8374 // The name is uniquely associated to a ruleid and the rule it contains
8375 // From the user point of view, the rule is more meaningfull.
8376 ss << "rule " << name << " already exists";
8377 err = 0;
8378 goto reply;
8379 }
8380
8381 CrushWrapper newcrush;
8382 _get_pending_crush(newcrush);
8383
8384 if (newcrush.rule_exists(name)) {
8385 // The name is uniquely associated to a ruleid and the rule it contains
8386 // From the user point of view, the rule is more meaningfull.
8387 ss << "rule " << name << " already exists";
8388 err = 0;
8389 } else {
8390 int ruleno = newcrush.add_simple_rule(
8391 name, root, type, device_class,
8392 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
8393 if (ruleno < 0) {
8394 err = ruleno;
8395 goto reply;
8396 }
8397
8398 pending_inc.crush.clear();
8399 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8400 }
8401 getline(ss, rs);
8402 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8403 get_last_committed() + 1));
8404 return true;
8405
8406 } else if (prefix == "osd erasure-code-profile rm") {
8407 string name;
8408 cmd_getval(g_ceph_context, cmdmap, "name", name);
8409
8410 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
8411 goto wait;
8412
8413 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
8414 err = -EBUSY;
8415 goto reply;
8416 }
8417
8418 if (osdmap.has_erasure_code_profile(name) ||
8419 pending_inc.new_erasure_code_profiles.count(name)) {
8420 if (osdmap.has_erasure_code_profile(name)) {
8421 pending_inc.old_erasure_code_profiles.push_back(name);
8422 } else {
8423 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
8424 pending_inc.new_erasure_code_profiles.erase(name);
8425 }
8426
8427 getline(ss, rs);
8428 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8429 get_last_committed() + 1));
8430 return true;
8431 } else {
8432 ss << "erasure-code-profile " << name << " does not exist";
8433 err = 0;
8434 goto reply;
8435 }
8436
8437 } else if (prefix == "osd erasure-code-profile set") {
8438 string name;
8439 cmd_getval(g_ceph_context, cmdmap, "name", name);
8440 vector<string> profile;
8441 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8442 bool force;
8443 if (profile.size() > 0 && profile.back() == "--force") {
8444 profile.pop_back();
8445 force = true;
8446 } else {
8447 force = false;
8448 }
8449 map<string,string> profile_map;
8450 err = parse_erasure_code_profile(profile, &profile_map, &ss);
8451 if (err)
8452 goto reply;
8453 if (profile_map.find("plugin") == profile_map.end()) {
8454 ss << "erasure-code-profile " << profile_map
8455 << " must contain a plugin entry" << std::endl;
8456 err = -EINVAL;
8457 goto reply;
8458 }
8459 string plugin = profile_map["plugin"];
8460
8461 if (pending_inc.has_erasure_code_profile(name)) {
8462 dout(20) << "erasure code profile " << name << " try again" << dendl;
8463 goto wait;
8464 } else {
8465 if (plugin == "isa" || plugin == "lrc") {
8466 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
8467 if (err == -EAGAIN)
8468 goto wait;
8469 if (err)
8470 goto reply;
8471 } else if (plugin == "shec") {
8472 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
8473 if (err == -EAGAIN)
8474 goto wait;
8475 if (err)
8476 goto reply;
8477 }
8478 err = normalize_profile(name, profile_map, force, &ss);
8479 if (err)
8480 goto reply;
8481
8482 if (osdmap.has_erasure_code_profile(name)) {
8483 ErasureCodeProfile existing_profile_map =
8484 osdmap.get_erasure_code_profile(name);
8485 err = normalize_profile(name, existing_profile_map, force, &ss);
8486 if (err)
8487 goto reply;
8488
8489 if (existing_profile_map == profile_map) {
8490 err = 0;
8491 goto reply;
8492 }
8493 if (!force) {
8494 err = -EPERM;
8495 ss << "will not override erasure code profile " << name
8496 << " because the existing profile "
8497 << existing_profile_map
8498 << " is different from the proposed profile "
8499 << profile_map;
8500 goto reply;
8501 }
8502 }
8503
8504 dout(20) << "erasure code profile set " << name << "="
8505 << profile_map << dendl;
8506 pending_inc.set_erasure_code_profile(name, profile_map);
8507 }
8508
8509 getline(ss, rs);
8510 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8511 get_last_committed() + 1));
8512 return true;
8513
8514 } else if (prefix == "osd crush rule create-erasure") {
8515 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
8516 if (err == -EAGAIN)
8517 goto wait;
8518 if (err)
8519 goto reply;
8520 string name, poolstr;
8521 cmd_getval(g_ceph_context, cmdmap, "name", name);
8522 string profile;
8523 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8524 if (profile == "")
8525 profile = "default";
8526 if (profile == "default") {
8527 if (!osdmap.has_erasure_code_profile(profile)) {
8528 if (pending_inc.has_erasure_code_profile(profile)) {
8529 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
8530 goto wait;
8531 }
8532
8533 map<string,string> profile_map;
8534 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
8535 profile_map,
8536 &ss);
8537 if (err)
8538 goto reply;
8539 err = normalize_profile(name, profile_map, true, &ss);
8540 if (err)
8541 goto reply;
8542 dout(20) << "erasure code profile set " << profile << "="
8543 << profile_map << dendl;
8544 pending_inc.set_erasure_code_profile(profile, profile_map);
8545 goto wait;
8546 }
8547 }
8548
8549 int rule;
8550 err = crush_rule_create_erasure(name, profile, &rule, &ss);
8551 if (err < 0) {
8552 switch(err) {
8553 case -EEXIST: // return immediately
8554 ss << "rule " << name << " already exists";
8555 err = 0;
8556 goto reply;
8557 break;
8558 case -EALREADY: // wait for pending to be proposed
8559 ss << "rule " << name << " already exists";
8560 err = 0;
8561 break;
8562 default: // non recoverable error
8563 goto reply;
8564 break;
8565 }
8566 } else {
8567 ss << "created rule " << name << " at " << rule;
8568 }
8569
8570 getline(ss, rs);
8571 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8572 get_last_committed() + 1));
8573 return true;
8574
8575 } else if (prefix == "osd crush rule rm") {
8576 string name;
8577 cmd_getval(g_ceph_context, cmdmap, "name", name);
8578
8579 if (!osdmap.crush->rule_exists(name)) {
8580 ss << "rule " << name << " does not exist";
8581 err = 0;
8582 goto reply;
8583 }
8584
8585 CrushWrapper newcrush;
8586 _get_pending_crush(newcrush);
8587
8588 if (!newcrush.rule_exists(name)) {
8589 ss << "rule " << name << " does not exist";
8590 err = 0;
8591 } else {
8592 int ruleno = newcrush.get_rule_id(name);
8593 assert(ruleno >= 0);
8594
8595 // make sure it is not in use.
8596 // FIXME: this is ok in some situations, but let's not bother with that
8597 // complexity now.
8598 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
8599 if (osdmap.crush_ruleset_in_use(ruleset)) {
8600 ss << "crush ruleset " << name << " " << ruleset << " is in use";
8601 err = -EBUSY;
8602 goto reply;
8603 }
8604
8605 err = newcrush.remove_rule(ruleno);
8606 if (err < 0) {
8607 goto reply;
8608 }
8609
8610 pending_inc.crush.clear();
8611 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8612 }
8613 getline(ss, rs);
8614 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8615 get_last_committed() + 1));
8616 return true;
8617
8618 } else if (prefix == "osd crush rule rename") {
8619 string srcname;
8620 string dstname;
8621 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
8622 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
8623 if (srcname.empty() || dstname.empty()) {
8624 ss << "must specify both source rule name and destination rule name";
8625 err = -EINVAL;
8626 goto reply;
8627 }
8628 if (srcname == dstname) {
8629 ss << "destination rule name is equal to source rule name";
8630 err = 0;
8631 goto reply;
8632 }
8633
8634 CrushWrapper newcrush;
8635 _get_pending_crush(newcrush);
8636 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
8637 // srcname does not exist and dstname already exists
8638 // suppose this is a replay and return success
8639 // (so this command is idempotent)
8640 ss << "already renamed to '" << dstname << "'";
8641 err = 0;
8642 goto reply;
8643 }
8644
8645 err = newcrush.rename_rule(srcname, dstname, &ss);
8646 if (err < 0) {
8647 // ss has reason for failure
8648 goto reply;
8649 }
8650 pending_inc.crush.clear();
8651 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8652 getline(ss, rs);
8653 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8654 get_last_committed() + 1));
8655 return true;
8656
8657 } else if (prefix == "osd setmaxosd") {
8658 int64_t newmax;
8659 if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
8660 ss << "unable to parse 'newmax' value '"
8661 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
8662 err = -EINVAL;
8663 goto reply;
8664 }
8665
8666 if (newmax > g_conf->mon_max_osd) {
8667 err = -ERANGE;
8668 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
8669 << g_conf->mon_max_osd << ")";
8670 goto reply;
8671 }
8672
8673 // Don't allow shrinking OSD number as this will cause data loss
8674 // and may cause kernel crashes.
8675 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
8676 if (newmax < osdmap.get_max_osd()) {
8677 // Check if the OSDs exist between current max and new value.
8678 // If there are any OSDs exist, then don't allow shrinking number
8679 // of OSDs.
8680 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
8681 if (osdmap.exists(i)) {
8682 err = -EBUSY;
8683 ss << "cannot shrink max_osd to " << newmax
8684 << " because osd." << i << " (and possibly others) still in use";
8685 goto reply;
8686 }
8687 }
8688 }
8689
8690 pending_inc.new_max_osd = newmax;
8691 ss << "set new max_osd = " << pending_inc.new_max_osd;
8692 getline(ss, rs);
8693 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8694 get_last_committed() + 1));
8695 return true;
8696
8697 } else if (prefix == "osd set-full-ratio" ||
8698 prefix == "osd set-backfillfull-ratio" ||
8699 prefix == "osd set-nearfull-ratio") {
8700 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8701 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8702 << "luminous' before using the new interface";
8703 err = -EPERM;
8704 goto reply;
8705 }
8706 double n;
8707 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
8708 ss << "unable to parse 'ratio' value '"
8709 << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
8710 err = -EINVAL;
8711 goto reply;
8712 }
8713 if (prefix == "osd set-full-ratio")
8714 pending_inc.new_full_ratio = n;
8715 else if (prefix == "osd set-backfillfull-ratio")
8716 pending_inc.new_backfillfull_ratio = n;
8717 else if (prefix == "osd set-nearfull-ratio")
8718 pending_inc.new_nearfull_ratio = n;
8719 ss << prefix << " " << n;
8720 getline(ss, rs);
8721 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8722 get_last_committed() + 1));
8723 return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // Raise the floor on the client release the cluster will accept.
    // Refuse outright until the cluster itself is luminous
    // (require-osd-release), since this knob is new-interface-only.
    if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
         << "luminous' before using the new interface";
      err = -EPERM;
      goto reply;
    }
    string v;
    cmd_getval(g_ceph_context, cmdmap, "version", v);
    int vno = ceph_release_from_name(v.c_str());
    if (vno <= 0) {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // Apply the uncommitted incremental to a scratch copy of the map so
    // the feature-floor check below also sees in-flight changes.
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = vno;
    auto mvno = newmap.get_min_compat_client();
    if (vno < mvno) {
      // the map already uses features that imply a newer client floor
      ss << "osdmap current utilizes features that require "
         << ceph_release_name(mvno)
         << "; cannot set require_min_compat_client below that to "
         << ceph_release_name(vno);
      err = -EPERM;
      goto reply;
    }
    string sure;
    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
    if (sure != "--yes-i-really-mean-it") {
      // Without the override, refuse if any currently connected client,
      // MDS or mgr session lacks the features implied by the new floor.
      FeatureMap m;
      mon->get_combined_feature_map(&m);
      uint64_t features = ceph_release_features(vno);
      bool first = true;
      bool ok = true;
      for (int type : {
            CEPH_ENTITY_TYPE_CLIENT,
            CEPH_ENTITY_TYPE_MDS,
            CEPH_ENTITY_TYPE_MGR }) {
        auto p = m.m.find(type);
        if (p == m.m.end()) {
          continue;
        }
        for (auto& q : p->second) {
          uint64_t missing = ~q.first & features;
          if (missing) {
            if (first) {
              ss << "cannot set require_min_compat_client to " << v << ": ";
            } else {
              ss << "; ";
            }
            first = false;
            ss << q.second << " connected " << ceph_entity_type_name(type)
               << "(s) look like " << ceph_release_name(
                 ceph_release_from_features(q.first))
               << " (missing 0x" << std::hex << missing << std::dec << ")";
            ok = false;
          }
        }
      }
      if (!ok) {
        ss << "; add --yes-i-really-mean-it to do it anyway";
        err = -EPERM;
        goto reply;
      }
    }
    ss << "set require_min_compat_client to " << ceph_release_name(vno);
    pending_inc.new_require_min_compat_client = vno;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd set") {
    // Set a cluster-wide osdmap flag.  Most flags are a straight
    // prepare_set_flag(); the feature-gated ones (sortbitwise,
    // recovery_deletes, require_*_osds) first verify every up osd
    // advertises the corresponding feature bit.
    string key;
    cmd_getval(g_ceph_context, cmdmap, "key", key);
    if (key == "full")
      return prepare_set_flag(op, CEPH_OSDMAP_FULL);
    else if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "sortbitwise") {
      if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
        return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
      } else {
        ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
        err = -EPERM;
        goto reply;
      }
    } else if (key == "recovery_deletes") {
      if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)) {
        return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
      } else {
        ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
        err = -EPERM;
        goto reply;
      }
    } else if (key == "require_jewel_osds") {
      if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
        ss << "the sortbitwise flag must be set before require_jewel_osds";
        err = -EPERM;
        goto reply;
      } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
        // idempotent: the newer require-osd-release mechanism supersedes this
        ss << "require_osd_release is already >= jewel";
        err = 0;
        goto reply;
      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)) {
        return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
      } else {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
        err = -EPERM;
      }
    } else if (key == "require_kraken_osds") {
      if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
        ss << "the sortbitwise flag must be set before require_kraken_osds";
        err = -EPERM;
        goto reply;
      } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
        ss << "require_osd_release is already >= kraken";
        err = 0;
        goto reply;
      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)) {
        bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
        // ensure JEWEL is also set
        pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
        return r;
      } else {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
        err = -EPERM;
      }
    } else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd unset") {
    // Clear a cluster-wide osdmap flag.  Note the require_* and
    // sortbitwise/recovery_deletes flags handled by 'osd set' have no
    // 'unset' counterpart here: they are one-way upgrade gates.
    string key;
    cmd_getval(g_ceph_context, cmdmap, "key", key);
    if (key == "full")
      return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
    else if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd require-osd-release") {
    // Record the minimum release every osd in the cluster must run.
    // One-way: the release can never be lowered once raised.
    string release;
    cmd_getval(g_ceph_context, cmdmap, "release", release);
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
      ss << "the sortbitwise flag must be set first";
      err = -EPERM;
      goto reply;
    }
    int rel = ceph_release_from_name(release.c_str());
    if (rel <= 0) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply;
    }
    if (rel < CEPH_RELEASE_LUMINOUS) {
      // pre-luminous floors are set via the legacy require_*_osds flags
      ss << "use this command only for luminous and later";
      err = -EINVAL;
      goto reply;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply;
    }
    if (rel == CEPH_RELEASE_LUMINOUS) {
      // every up osd must advertise the luminous feature bit
      if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "not supported for this release yet";
      err = -EPERM;
      goto reply;
    }
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply;
    }
    pending_inc.new_require_osd_release = rel;
    if (rel >= CEPH_RELEASE_LUMINOUS &&
        !osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
      // luminous implies recovery_deletes; set it in the same proposal
      return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
    }
    goto update;
  } else if (prefix == "osd cluster_snap") {
    // Intentionally a no-op: only reports that the feature is disabled.
    // ** DISABLE THIS FOR NOW **
    ss << "cluster snapshot currently disabled (broken implementation)";
    // ** DISABLE THIS FOR NOW **

8967 } else if (prefix == "osd down" ||
8968 prefix == "osd out" ||
8969 prefix == "osd in" ||
8970 prefix == "osd rm") {
8971
8972 bool any = false;
8973 bool stop = false;
8974 bool verbose = true;
8975
8976 vector<string> idvec;
8977 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
8978 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8979 set<int> osds;
8980
8981 // wildcard?
8982 if (j == 0 &&
8983 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8984 if (prefix == "osd in") {
8985 // touch out osds only
8986 osdmap.get_out_osds(osds);
8987 } else {
8988 osdmap.get_all_osds(osds);
8989 }
8990 stop = true;
8991 verbose = false; // so the output is less noisy.
8992 } else {
8993 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8994 if (osd < 0) {
8995 ss << "invalid osd id" << osd;
8996 err = -EINVAL;
8997 continue;
8998 } else if (!osdmap.exists(osd)) {
8999 ss << "osd." << osd << " does not exist. ";
9000 continue;
9001 }
9002
9003 osds.insert(osd);
9004 }
9005
9006 for (auto &osd : osds) {
9007 if (prefix == "osd down") {
9008 if (osdmap.is_down(osd)) {
9009 if (verbose)
9010 ss << "osd." << osd << " is already down. ";
9011 } else {
9012 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
9013 ss << "marked down osd." << osd << ". ";
9014 any = true;
9015 }
9016 } else if (prefix == "osd out") {
9017 if (osdmap.is_out(osd)) {
9018 if (verbose)
9019 ss << "osd." << osd << " is already out. ";
9020 } else {
9021 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
9022 if (osdmap.osd_weight[osd]) {
9023 if (pending_inc.new_xinfo.count(osd) == 0) {
9024 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9025 }
9026 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
9027 }
9028 ss << "marked out osd." << osd << ". ";
9029 std::ostringstream msg;
9030 msg << "Client " << op->get_session()->entity_name
9031 << " marked osd." << osd << " out";
9032 if (osdmap.is_up(osd)) {
9033 msg << ", while it was still marked up";
9034 } else {
9035 msg << ", after it was down for " << int(down_pending_out[osd].sec())
9036 << " seconds";
9037 }
9038
9039 mon->clog->info() << msg.str();
9040 any = true;
9041 }
9042 } else if (prefix == "osd in") {
9043 if (osdmap.is_in(osd)) {
9044 if (verbose)
9045 ss << "osd." << osd << " is already in. ";
9046 } else {
9047 if (osdmap.osd_xinfo[osd].old_weight > 0) {
9048 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
9049 if (pending_inc.new_xinfo.count(osd) == 0) {
9050 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9051 }
9052 pending_inc.new_xinfo[osd].old_weight = 0;
9053 } else {
9054 pending_inc.new_weight[osd] = CEPH_OSD_IN;
9055 }
9056 ss << "marked in osd." << osd << ". ";
9057 any = true;
9058 }
9059 } else if (prefix == "osd rm") {
9060 err = prepare_command_osd_remove(osd);
9061
9062 if (err == -EBUSY) {
9063 if (any)
9064 ss << ", ";
9065 ss << "osd." << osd << " is still up; must be down before removal. ";
9066 } else {
9067 assert(err == 0);
9068 if (any) {
9069 ss << ", osd." << osd;
9070 } else {
9071 ss << "removed osd." << osd;
9072 }
9073 any = true;
9074 }
9075 }
9076 }
9077 }
9078 if (any) {
9079 getline(ss, rs);
9080 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9081 get_last_committed() + 1));
9082 return true;
9083 }
  } else if (prefix == "osd add-noup" ||
             prefix == "osd add-nodown" ||
             prefix == "osd add-noin" ||
             prefix == "osd add-noout") {
    // Per-osd variants of the cluster-wide noup/nodown/noin/noout flags:
    // pin the given state on individual osds, or all of them via the
    // any/all/* wildcard.

    enum {
      OP_NOUP,
      OP_NODOWN,
      OP_NOIN,
      OP_NOOUT,
    } option;

    if (prefix == "osd add-noup") {
      option = OP_NOUP;
    } else if (prefix == "osd add-nodown") {
      option = OP_NODOWN;
    } else if (prefix == "osd add-noin") {
      option = OP_NOIN;
    } else {
      option = OP_NOOUT;
    }

    bool any = false;   // at least one state change queued
    bool stop = false;  // wildcard seen; remaining args ignored

    vector<string> idvec;
    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
    for (unsigned j = 0; j < idvec.size() && !stop; j++) {

      set<int> osds;

      // wildcard?
      if (j == 0 &&
          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
        osdmap.get_all_osds(osds);
        stop = true;
      } else {
        // try traditional single osd way

        long osd = parse_osd_id(idvec[j].c_str(), &ss);
        if (osd < 0) {
          // ss has reason for failure
          ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
          err = -EINVAL;
          continue;
        }

        osds.insert(osd);
      }

      for (auto &osd : osds) {

        if (!osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist. ";
          continue;
        }

        // NOTE(review): incremental osd-state bits toggle when applied,
        // so when the flag is already committed we must *clear* any
        // pending bit (which would otherwise undo the flag), and set the
        // pending bit only when the flag is not yet committed.
        switch (option) {
        case OP_NOUP:
          if (osdmap.is_up(osd)) {
            ss << "osd." << osd << " is already up. ";
            continue;
          }

          if (osdmap.is_noup(osd)) {
            if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
              any = true;
          } else {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
            any = true;
          }

          break;

        case OP_NODOWN:
          if (osdmap.is_down(osd)) {
            ss << "osd." << osd << " is already down. ";
            continue;
          }

          if (osdmap.is_nodown(osd)) {
            if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
              any = true;
          } else {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
            any = true;
          }

          break;

        case OP_NOIN:
          if (osdmap.is_in(osd)) {
            ss << "osd." << osd << " is already in. ";
            continue;
          }

          if (osdmap.is_noin(osd)) {
            if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
              any = true;
          } else {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
            any = true;
          }

          break;

        case OP_NOOUT:
          if (osdmap.is_out(osd)) {
            ss << "osd." << osd << " is already out. ";
            continue;
          }

          if (osdmap.is_noout(osd)) {
            if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
              any = true;
          } else {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
            any = true;
          }

          break;

        default:
          assert(0 == "invalid option");
        }
      }
    }

    if (any) {
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
                                                            get_last_committed() + 1));
      return true;
    }
  } else if (prefix == "osd rm-noup" ||
             prefix == "osd rm-nodown" ||
             prefix == "osd rm-noin" ||
             prefix == "osd rm-noout") {
    // Undo 'osd add-no*': clear the per-osd noup/nodown/noin/noout state
    // on the listed osds, or on every flagged osd via the wildcard.

    enum {
      OP_NOUP,
      OP_NODOWN,
      OP_NOIN,
      OP_NOOUT,
    } option;

    if (prefix == "osd rm-noup") {
      option = OP_NOUP;
    } else if (prefix == "osd rm-nodown") {
      option = OP_NODOWN;
    } else if (prefix == "osd rm-noin") {
      option = OP_NOIN;
    } else {
      option = OP_NOOUT;
    }

    bool any = false;
    bool stop = false;

    vector<string> idvec;
    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);

    for (unsigned j = 0; j < idvec.size() && !stop; j++) {

      vector<int> osds;

      // wildcard?
      if (j == 0 &&
          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {

        // touch previous noup/nodown/noin/noout osds only
        switch (option) {
        case OP_NOUP:
          osdmap.get_noup_osds(&osds);
          break;
        case OP_NODOWN:
          osdmap.get_nodown_osds(&osds);
          break;
        case OP_NOIN:
          osdmap.get_noin_osds(&osds);
          break;
        case OP_NOOUT:
          osdmap.get_noout_osds(&osds);
          break;
        default:
          assert(0 == "invalid option");
        }

        // cancel any pending noup/nodown/noin/noout requests too
        vector<int> pending_state_osds;
        (void) pending_inc.get_pending_state_osds(&pending_state_osds);
        for (auto &p : pending_state_osds) {

          switch (option) {
          case OP_NOUP:
            // not committed yet, so just drop the pending 'add'
            if (!osdmap.is_noup(p) &&
                pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
              any = true;
            }
            break;

          case OP_NODOWN:
            if (!osdmap.is_nodown(p) &&
                pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
              any = true;
            }
            break;

          case OP_NOIN:
            if (!osdmap.is_noin(p) &&
                pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
              any = true;
            }
            break;

          case OP_NOOUT:
            if (!osdmap.is_noout(p) &&
                pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
              any = true;
            }
            break;

          default:
            assert(0 == "invalid option");
          }
        }

        stop = true;
      } else {
        // try traditional single osd way

        long osd = parse_osd_id(idvec[j].c_str(), &ss);
        if (osd < 0) {
          // ss has reason for failure
          ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
          err = -EINVAL;
          continue;
        }

        osds.push_back(osd);
      }

      for (auto &osd : osds) {

        if (!osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist. ";
          continue;
        }

        // NOTE(review): pending state bits toggle on apply -- setting
        // the bit when the flag is committed removes the flag, while
        // clearing a pending bit cancels a not-yet-committed 'add'.
        switch (option) {
        case OP_NOUP:
          if (osdmap.is_noup(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
            any = true;
          } else if (pending_inc.pending_osd_state_clear(
                       osd, CEPH_OSD_NOUP)) {
            any = true;
          }
          break;

        case OP_NODOWN:
          if (osdmap.is_nodown(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
            any = true;
          } else if (pending_inc.pending_osd_state_clear(
                       osd, CEPH_OSD_NODOWN)) {
            any = true;
          }
          break;

        case OP_NOIN:
          if (osdmap.is_noin(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
            any = true;
          } else if (pending_inc.pending_osd_state_clear(
                       osd, CEPH_OSD_NOIN)) {
            any = true;
          }
          break;

        case OP_NOOUT:
          if (osdmap.is_noout(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
            any = true;
          } else if (pending_inc.pending_osd_state_clear(
                       osd, CEPH_OSD_NOOUT)) {
            any = true;
          }
          break;

        default:
          assert(0 == "invalid option");
        }
      }
    }

    if (any) {
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
                                                            get_last_committed() + 1));
      return true;
    }
  } else if (prefix == "osd pg-temp") {
    // Explicitly set the pg_temp (temporary acting set) for one pg.
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (pending_inc.new_pg_temp.count(pgid)) {
      // a pg_temp update for this pg is already queued; retry after commit
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
      ss << "unable to parse 'id' value(s) '"
         << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    // the mapping must satisfy the pool's min_size <= n <= size bounds
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Set a temporary primary for one pg (developer/testing knob,
    // gated behind mon_osd_allow_primary_temp).
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 deliberately bypasses the existence check; presumably it clears
    // the primary-temp mapping -- confirm against OSDMap semantics.
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // clients older than firefly cannot decode primary_temp
    if (osdmap.require_min_compat_client > 0 &&
        osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
      ss << "require_min_compat_client "
         << ceph_release_name(osdmap.require_min_compat_client)
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    } else if (!g_conf->mon_osd_allow_primary_temp) {
      ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "osd pg-upmap" ||
             prefix == "osd rm-pg-upmap" ||
             prefix == "osd pg-upmap-items" ||
             prefix == "osd rm-pg-upmap-items") {
    // pg-upmap family: explicit per-pg placement overrides.  All four
    // commands require a luminous cluster AND a luminous client floor,
    // since pre-luminous clients cannot decode pg_upmap.
    if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
         << "luminous' before using the new interface";
      err = -EPERM;
      goto reply;
    }
    if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
      ss << "min_compat_client "
         << ceph_release_name(osdmap.require_min_compat_client)
         << " < luminous, which is required for pg-upmap. "
         << "Try 'ceph osd set-require-min-compat-client luminous' "
         << "before using the new interface";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    enum {
      OP_PG_UPMAP,
      OP_RM_PG_UPMAP,
      OP_PG_UPMAP_ITEMS,
      OP_RM_PG_UPMAP_ITEMS,
    } option;

    if (prefix == "osd pg-upmap") {
      option = OP_PG_UPMAP;
    } else if (prefix == "osd rm-pg-upmap") {
      option = OP_RM_PG_UPMAP;
    } else if (prefix == "osd pg-upmap-items") {
      option = OP_PG_UPMAP_ITEMS;
    } else {
      option = OP_RM_PG_UPMAP_ITEMS;
    }

    // check pending upmap changes: if this pg already has an uncommitted
    // upmap change queued, retry the command after the proposal commits.
    switch (option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP:
      if (pending_inc.new_pg_upmap.count(pgid) ||
          pending_inc.old_pg_upmap.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS:
      if (pending_inc.new_pg_upmap_items.count(pgid) ||
          pending_inc.old_pg_upmap_items.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    default:
      assert(0 == "invalid option");
    }

    switch (option) {
    case OP_PG_UPMAP:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap["id"]) << "'";
          err = -EINVAL;
          goto reply;
        }

        // like pg-temp: enforce pool min_size <= n <= size
        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
        if ((int)id_vec.size() < pool_min_size) {
          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
             << pool_min_size << ")";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)id_vec.size() > pool_size) {
          ss << "num of osds (" << id_vec.size() <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<int32_t> new_pg_upmap;
        for (auto osd : id_vec) {
          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
            ss << "osd." << osd << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          // drop duplicate osd ids, noting them in the reply
          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
          if (it != new_pg_upmap.end()) {
            ss << "osd." << osd << " already exists, ";
            continue;
          }
          new_pg_upmap.push_back(osd);
        }

        if (new_pg_upmap.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
      }
      break;

    case OP_RM_PG_UPMAP:
      {
        pending_inc.old_pg_upmap.insert(pgid);
        ss << "clear " << pgid << " pg_upmap mapping";
      }
      break;

    case OP_PG_UPMAP_ITEMS:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap["id"]) << "'";
          err = -EINVAL;
          goto reply;
        }

        // ids arrive as a flat list of (from, to) pairs
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        ostringstream items;
        items << "[";
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
          int from = *p++;   // consumes two elements per loop iteration
          int to = *p;
          if (from == to) {
            ss << "from osd." << from << " == to osd." << to << ", ";
            continue;
          }
          if (!osdmap.exists(from)) {
            ss << "osd." << from << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
            ss << "osd." << to << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          pair<int32_t,int32_t> entry = make_pair(from, to);
          auto it = std::find(new_pg_upmap_items.begin(),
            new_pg_upmap_items.end(), entry);
          if (it != new_pg_upmap_items.end()) {
            ss << "osd." << from << " -> osd." << to << " already exists, ";
            continue;
          }
          new_pg_upmap_items.push_back(entry);
          items << from << "->" << to << ",";
        }
        string out(items.str());
        out.resize(out.size() - 1); // drop last ','
        out += "]";
        // NOTE(review): if every pair was skipped above, 'out' is
        // malformed here, but the empty-items check below bails out
        // before 'out' is ever used.

        if (new_pg_upmap_items.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap_items[pgid] =
          mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
      }
      break;

    case OP_RM_PG_UPMAP_ITEMS:
      {
        pending_inc.old_pg_upmap_items.insert(pgid);
        ss << "clear " << pgid << " pg_upmap_items mapping";
      }
      break;

    default:
      assert(0 == "invalid option");
    }

    goto update;
9729 } else if (prefix == "osd primary-affinity") {
9730 int64_t id;
9731 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9732 ss << "invalid osd id value '"
9733 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9734 err = -EINVAL;
9735 goto reply;
9736 }
9737 double w;
9738 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
9739 ss << "unable to parse 'weight' value '"
9740 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
9741 err = -EINVAL;
9742 goto reply;
9743 }
9744 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
9745 if (ww < 0L) {
9746 ss << "weight must be >= 0";
9747 err = -EINVAL;
9748 goto reply;
9749 }
9750 if (osdmap.require_min_compat_client > 0 &&
9751 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
9752 ss << "require_min_compat_client "
9753 << ceph_release_name(osdmap.require_min_compat_client)
9754 << " < firefly, which is required for primary-affinity";
9755 err = -EPERM;
9756 goto reply;
9757 } else if (!g_conf->mon_osd_allow_primary_affinity) {
9758 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
9759 err = -EPERM;
9760 goto reply;
9761 }
9762 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
9763 if (err == -EAGAIN)
9764 goto wait;
9765 if (err < 0)
9766 goto reply;
9767 if (osdmap.exists(id)) {
9768 pending_inc.new_primary_affinity[id] = ww;
9769 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
9770 getline(ss, rs);
9771 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9772 get_last_committed() + 1));
9773 return true;
9774 } else {
9775 ss << "osd." << id << " does not exist";
9776 err = -ENOENT;
9777 goto reply;
9778 }
  } else if (prefix == "osd reweight") {
    // Set an osd's 'in' weight (0.0 .. 1.0, stored scaled to CEPH_OSD_IN).
    int64_t id;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap["weight"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                            get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    }
  } else if (prefix == "osd reweightn") {
    // Batch form of 'osd reweight': apply a map of osd id -> new weight
    // in a single proposal.
    map<int32_t, uint32_t> weights;
    err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
    if (err) {
      ss << "unable to parse 'weights' value '"
         << cmd_vartype_stringify(cmdmap["weights"]) << "'";
      goto reply;
    }
    pending_inc.new_weight.insert(weights.begin(), weights.end());
    wait_for_finished_proposal(
      op,
      new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
    return true;
  } else if (prefix == "osd lost") {
    // Declare a down osd permanently lost so recovery can proceed
    // without it.  Destructive; requires --yes-i-really-mean-it.
    int64_t id;
    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap["id"]) << "'";
      err = -EINVAL;
      goto reply;
    }
    string sure;
    if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
      ss << "are you SURE? this might mean real, permanent data loss. pass "
        "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    } else if (!osdmap.is_down(id)) {
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply;
    } else {
      // record the epoch at which the osd went down as its 'lost' epoch
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                            get_last_committed() + 1));
      return true;
    }

9857 } else if (prefix == "osd destroy" || prefix == "osd purge") {
9858 /* Destroying an OSD means that we don't expect to further make use of
9859 * the OSDs data (which may even become unreadable after this operation),
9860 * and that we are okay with scrubbing all its cephx keys and config-key
9861 * data (which may include lockbox keys, thus rendering the osd's data
9862 * unreadable).
9863 *
9864 * The OSD will not be removed. Instead, we will mark it as destroyed,
9865 * such that a subsequent call to `create` will not reuse the osd id.
9866 * This will play into being able to recreate the OSD, at the same
9867 * crush location, with minimal data movement.
9868 */
9869
9870 // make sure authmon is writeable.
9871 if (!mon->authmon()->is_writeable()) {
9872 dout(10) << __func__ << " waiting for auth mon to be writeable for "
9873 << "osd destroy" << dendl;
9874 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9875 return false;
9876 }
9877
9878 int64_t id;
9879 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9880 ss << "unable to parse osd id value '"
9881 << cmd_vartype_stringify(cmdmap["id"]) << "";
9882 err = -EINVAL;
9883 goto reply;
9884 }
9885
9886 bool is_destroy = (prefix == "osd destroy");
9887 if (!is_destroy) {
9888 assert("osd purge" == prefix);
9889 }
9890
9891 string sure;
9892 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
9893 sure != "--yes-i-really-mean-it") {
9894 ss << "Are you SURE? This will mean real, permanent data loss, as well "
9895 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
9896 << "really do.";
9897 err = -EPERM;
9898 goto reply;
9899 } else if (!osdmap.exists(id)) {
9900 ss << "osd." << id << " does not exist";
9901 err = 0; // idempotent
9902 goto reply;
9903 } else if (osdmap.is_up(id)) {
9904 ss << "osd." << id << " is not `down`.";
9905 err = -EBUSY;
9906 goto reply;
9907 } else if (is_destroy && osdmap.is_destroyed(id)) {
9908 ss << "destroyed osd." << id;
9909 err = 0;
9910 goto reply;
9911 }
9912
9913 bool goto_reply = false;
9914
9915 paxos->plug();
9916 if (is_destroy) {
9917 err = prepare_command_osd_destroy(id, ss);
9918 // we checked above that it should exist.
9919 assert(err != -ENOENT);
9920 } else {
9921 err = prepare_command_osd_purge(id, ss);
9922 if (err == -ENOENT) {
9923 err = 0;
9924 ss << "osd." << id << " does not exist.";
9925 goto_reply = true;
9926 }
9927 }
9928 paxos->unplug();
9929
9930 if (err < 0 || goto_reply) {
9931 goto reply;
9932 }
9933
9934 if (is_destroy) {
9935 ss << "destroyed osd." << id;
9936 } else {
9937 ss << "purged osd." << id;
9938 }
9939
9940 getline(ss, rs);
9941 wait_for_finished_proposal(op,
9942 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
9943 force_immediate_propose();
9944 return true;
9945
9946 } else if (prefix == "osd new") {
9947
9948 // make sure authmon is writeable.
9949 if (!mon->authmon()->is_writeable()) {
9950 dout(10) << __func__ << " waiting for auth mon to be writeable for "
9951 << "osd new" << dendl;
9952 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9953 return false;
9954 }
9955
9956 map<string,string> secrets_map;
9957
9958 bufferlist bl = m->get_data();
9959 string secrets_json = bl.to_str();
9960 dout(20) << __func__ << " osd new json = " << secrets_json << dendl;
9961
9962 err = get_json_str_map(secrets_json, ss, &secrets_map);
9963 if (err < 0)
9964 goto reply;
9965
9966 dout(20) << __func__ << " osd new secrets " << secrets_map << dendl;
9967
9968 paxos->plug();
9969 err = prepare_command_osd_new(op, cmdmap, secrets_map, ss, f.get());
9970 paxos->unplug();
9971
9972 if (err < 0) {
9973 goto reply;
9974 }
9975
9976 if (f) {
9977 f->flush(rdata);
9978 } else {
9979 rdata.append(ss);
9980 }
9981
9982 if (err == EEXIST) {
9983 // idempotent operation
9984 err = 0;
9985 goto reply;
9986 }
9987
9988 wait_for_finished_proposal(op,
9989 new Monitor::C_Command(mon, op, 0, rs, rdata,
9990 get_last_committed() + 1));
9991 force_immediate_propose();
9992 return true;
9993
9994 } else if (prefix == "osd create") {
9995
9996 // optional id provided?
9997 int64_t id = -1, cmd_id = -1;
9998 if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
9999 if (cmd_id < 0) {
10000 ss << "invalid osd id value '" << cmd_id << "'";
10001 err = -EINVAL;
10002 goto reply;
10003 }
10004 dout(10) << " osd create got id " << cmd_id << dendl;
10005 }
10006
10007 uuid_d uuid;
10008 string uuidstr;
10009 if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
10010 if (!uuid.parse(uuidstr.c_str())) {
10011 ss << "invalid uuid value '" << uuidstr << "'";
10012 err = -EINVAL;
10013 goto reply;
10014 }
10015 // we only care about the id if we also have the uuid, to
10016 // ensure the operation's idempotency.
10017 id = cmd_id;
10018 }
10019
10020 int32_t new_id = -1;
10021 err = prepare_command_osd_create(id, uuid, &new_id, ss);
10022 if (err < 0) {
10023 if (err == -EAGAIN) {
10024 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10025 return true;
10026 }
10027 // a check has failed; reply to the user.
10028 goto reply;
10029
10030 } else if (err == EEXIST) {
10031 // this is an idempotent operation; we can go ahead and reply.
10032 if (f) {
10033 f->open_object_section("created_osd");
10034 f->dump_int("osdid", new_id);
10035 f->close_section();
10036 f->flush(rdata);
10037 } else {
10038 ss << new_id;
10039 rdata.append(ss);
10040 }
10041 err = 0;
10042 goto reply;
10043 }
10044
10045 do_osd_create(id, uuid, &new_id);
10046
10047 if (f) {
10048 f->open_object_section("created_osd");
10049 f->dump_int("osdid", new_id);
10050 f->close_section();
10051 f->flush(rdata);
10052 } else {
10053 ss << new_id;
10054 rdata.append(ss);
10055 }
10056 wait_for_finished_proposal(op,
10057 new Monitor::C_Command(mon, op, 0, rs, rdata,
10058 get_last_committed() + 1));
10059 return true;
10060
10061 } else if (prefix == "osd blacklist clear") {
10062 pending_inc.new_blacklist.clear();
10063 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
10064 osdmap.get_blacklist(&blacklist);
10065 for (const auto &entry : blacklist) {
10066 pending_inc.old_blacklist.push_back(entry.first);
10067 }
10068 ss << " removed all blacklist entries";
10069 getline(ss, rs);
10070 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10071 get_last_committed() + 1));
10072 return true;
10073 } else if (prefix == "osd blacklist") {
10074 string addrstr;
10075 cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
10076 entity_addr_t addr;
10077 if (!addr.parse(addrstr.c_str(), 0)) {
10078 ss << "unable to parse address " << addrstr;
10079 err = -EINVAL;
10080 goto reply;
10081 }
10082 else {
10083 string blacklistop;
10084 cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
10085 if (blacklistop == "add") {
10086 utime_t expires = ceph_clock_now();
10087 double d;
10088 // default one hour
10089 cmd_getval(g_ceph_context, cmdmap, "expire", d,
10090 g_conf->mon_osd_blacklist_default_expire);
10091 expires += d;
10092
10093 pending_inc.new_blacklist[addr] = expires;
10094
10095 {
10096 // cancel any pending un-blacklisting request too
10097 auto it = std::find(pending_inc.old_blacklist.begin(),
10098 pending_inc.old_blacklist.end(), addr);
10099 if (it != pending_inc.old_blacklist.end()) {
10100 pending_inc.old_blacklist.erase(it);
10101 }
10102 }
10103
10104 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
10105 getline(ss, rs);
10106 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10107 get_last_committed() + 1));
10108 return true;
10109 } else if (blacklistop == "rm") {
10110 if (osdmap.is_blacklisted(addr) ||
10111 pending_inc.new_blacklist.count(addr)) {
10112 if (osdmap.is_blacklisted(addr))
10113 pending_inc.old_blacklist.push_back(addr);
10114 else
10115 pending_inc.new_blacklist.erase(addr);
10116 ss << "un-blacklisting " << addr;
10117 getline(ss, rs);
10118 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10119 get_last_committed() + 1));
10120 return true;
10121 }
10122 ss << addr << " isn't blacklisted";
10123 err = 0;
10124 goto reply;
10125 }
10126 }
10127 } else if (prefix == "osd pool mksnap") {
10128 string poolstr;
10129 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10130 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10131 if (pool < 0) {
10132 ss << "unrecognized pool '" << poolstr << "'";
10133 err = -ENOENT;
10134 goto reply;
10135 }
10136 string snapname;
10137 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10138 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10139 if (p->is_unmanaged_snaps_mode()) {
10140 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10141 err = -EINVAL;
10142 goto reply;
10143 } else if (p->snap_exists(snapname.c_str())) {
10144 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10145 err = 0;
10146 goto reply;
10147 } else if (p->is_tier()) {
10148 ss << "pool " << poolstr << " is a cache tier";
10149 err = -EINVAL;
10150 goto reply;
10151 }
10152 pg_pool_t *pp = 0;
10153 if (pending_inc.new_pools.count(pool))
10154 pp = &pending_inc.new_pools[pool];
10155 if (!pp) {
10156 pp = &pending_inc.new_pools[pool];
10157 *pp = *p;
10158 }
10159 if (pp->snap_exists(snapname.c_str())) {
10160 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10161 } else {
10162 pp->add_snap(snapname.c_str(), ceph_clock_now());
10163 pp->set_snap_epoch(pending_inc.epoch);
10164 ss << "created pool " << poolstr << " snap " << snapname;
10165 }
10166 getline(ss, rs);
10167 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10168 get_last_committed() + 1));
10169 return true;
10170 } else if (prefix == "osd pool rmsnap") {
10171 string poolstr;
10172 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10173 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10174 if (pool < 0) {
10175 ss << "unrecognized pool '" << poolstr << "'";
10176 err = -ENOENT;
10177 goto reply;
10178 }
10179 string snapname;
10180 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10181 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10182 if (p->is_unmanaged_snaps_mode()) {
10183 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10184 err = -EINVAL;
10185 goto reply;
10186 } else if (!p->snap_exists(snapname.c_str())) {
10187 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
10188 err = 0;
10189 goto reply;
10190 }
10191 pg_pool_t *pp = 0;
10192 if (pending_inc.new_pools.count(pool))
10193 pp = &pending_inc.new_pools[pool];
10194 if (!pp) {
10195 pp = &pending_inc.new_pools[pool];
10196 *pp = *p;
10197 }
10198 snapid_t sn = pp->snap_exists(snapname.c_str());
10199 if (sn) {
10200 pp->remove_snap(sn);
10201 pp->set_snap_epoch(pending_inc.epoch);
10202 ss << "removed pool " << poolstr << " snap " << snapname;
10203 } else {
10204 ss << "already removed pool " << poolstr << " snap " << snapname;
10205 }
10206 getline(ss, rs);
10207 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10208 get_last_committed() + 1));
10209 return true;
10210 } else if (prefix == "osd pool create") {
10211 int64_t pg_num;
10212 int64_t pgp_num;
10213 cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
10214 cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
10215
10216 string pool_type_str;
10217 cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
10218 if (pool_type_str.empty())
10219 pool_type_str = g_conf->osd_pool_default_type;
10220
10221 string poolstr;
10222 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10223 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10224 if (pool_id >= 0) {
10225 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10226 if (pool_type_str != p->get_type_name()) {
10227 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
10228 err = -EINVAL;
10229 } else {
10230 ss << "pool '" << poolstr << "' already exists";
10231 err = 0;
10232 }
10233 goto reply;
10234 }
10235
10236 int pool_type;
10237 if (pool_type_str == "replicated") {
10238 pool_type = pg_pool_t::TYPE_REPLICATED;
10239 } else if (pool_type_str == "erasure") {
10240 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
10241 CEPH_FEATURE_OSD_ERASURE_CODES,
10242 ss);
10243 if (err == -EAGAIN)
10244 goto wait;
10245 if (err)
10246 goto reply;
10247 pool_type = pg_pool_t::TYPE_ERASURE;
10248 } else {
10249 ss << "unknown pool type '" << pool_type_str << "'";
10250 err = -EINVAL;
10251 goto reply;
10252 }
10253
10254 bool implicit_rule_creation = false;
10255 string rule_name;
10256 cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
10257 string erasure_code_profile;
10258 cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
10259
10260 if (pool_type == pg_pool_t::TYPE_ERASURE) {
10261 if (erasure_code_profile == "")
10262 erasure_code_profile = "default";
10263 //handle the erasure code profile
10264 if (erasure_code_profile == "default") {
10265 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
10266 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
10267 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
10268 goto wait;
10269 }
10270
10271 map<string,string> profile_map;
10272 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
10273 profile_map,
10274 &ss);
10275 if (err)
10276 goto reply;
10277 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
10278 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
10279 goto wait;
10280 }
10281 }
10282 if (rule_name == "") {
10283 implicit_rule_creation = true;
10284 if (erasure_code_profile == "default") {
10285 rule_name = "erasure-code";
10286 } else {
10287 dout(1) << "implicitly use rule named after the pool: "
10288 << poolstr << dendl;
10289 rule_name = poolstr;
10290 }
10291 }
10292 } else {
10293 // NOTE: for a replicated pool, cmdmap puts the rule name into the erasure_code_profile field
10294 rule_name = erasure_code_profile;
10295 }
10296
10297 if (!implicit_rule_creation && rule_name != "") {
10298 int rule;
10299 err = get_crush_rule(rule_name, &rule, &ss);
10300 if (err == -EAGAIN) {
10301 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10302 return true;
10303 }
10304 if (err)
10305 goto reply;
10306 }
10307
10308 int64_t expected_num_objects;
10309 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
10310 if (expected_num_objects < 0) {
10311 ss << "'expected_num_objects' must be non-negative";
10312 err = -EINVAL;
10313 goto reply;
10314 }
10315
10316 int64_t fast_read_param;
10317 cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
10318 FastReadType fast_read = FAST_READ_DEFAULT;
10319 if (fast_read_param == 0)
10320 fast_read = FAST_READ_OFF;
10321 else if (fast_read_param > 0)
10322 fast_read = FAST_READ_ON;
10323
10324 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
10325 -1, // default crush rule
10326 rule_name,
10327 pg_num, pgp_num,
10328 erasure_code_profile, pool_type,
10329 (uint64_t)expected_num_objects,
10330 fast_read,
10331 &ss);
10332 if (err < 0) {
10333 switch(err) {
10334 case -EEXIST:
10335 ss << "pool '" << poolstr << "' already exists";
10336 break;
10337 case -EAGAIN:
10338 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10339 return true;
10340 case -ERANGE:
10341 goto reply;
10342 default:
10343 goto reply;
10344 break;
10345 }
10346 } else {
10347 ss << "pool '" << poolstr << "' created";
10348 }
10349 getline(ss, rs);
10350 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10351 get_last_committed() + 1));
10352 return true;
10353
10354 } else if (prefix == "osd pool delete" ||
10355 prefix == "osd pool rm") {
10356 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10357 string poolstr, poolstr2, sure;
10358 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10359 cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
10360 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10361 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10362 if (pool < 0) {
10363 ss << "pool '" << poolstr << "' does not exist";
10364 err = 0;
10365 goto reply;
10366 }
10367
10368 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
10369 if (poolstr2 != poolstr ||
10370 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
10371 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10372 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10373 << "followed by --yes-i-really-really-mean-it.";
10374 err = -EPERM;
10375 goto reply;
10376 }
10377 err = _prepare_remove_pool(pool, &ss, force_no_fake);
10378 if (err == -EAGAIN) {
10379 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10380 return true;
10381 }
10382 if (err < 0)
10383 goto reply;
10384 goto update;
10385 } else if (prefix == "osd pool rename") {
10386 string srcpoolstr, destpoolstr;
10387 cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
10388 cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
10389 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
10390 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
10391
10392 if (pool_src < 0) {
10393 if (pool_dst >= 0) {
10394 // src pool doesn't exist, dst pool does exist: to ensure idempotency
10395 // of operations, assume this rename succeeded, as it is not changing
10396 // the current state. Make sure we output something understandable
10397 // for whoever is issuing the command, if they are paying attention,
10398 // in case it was not intentional; or to avoid a "wtf?" and a bug
10399 // report in case it was intentional, while expecting a failure.
10400 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
10401 << destpoolstr << "' does -- assuming successful rename";
10402 err = 0;
10403 } else {
10404 ss << "unrecognized pool '" << srcpoolstr << "'";
10405 err = -ENOENT;
10406 }
10407 goto reply;
10408 } else if (pool_dst >= 0) {
10409 // source pool exists and so does the destination pool
10410 ss << "pool '" << destpoolstr << "' already exists";
10411 err = -EEXIST;
10412 goto reply;
10413 }
10414
10415 int ret = _prepare_rename_pool(pool_src, destpoolstr);
10416 if (ret == 0) {
10417 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
10418 } else {
10419 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
10420 << cpp_strerror(ret);
10421 }
10422 getline(ss, rs);
10423 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
10424 get_last_committed() + 1));
10425 return true;
10426
10427 } else if (prefix == "osd pool set") {
10428 err = prepare_command_pool_set(cmdmap, ss);
10429 if (err == -EAGAIN)
10430 goto wait;
10431 if (err < 0)
10432 goto reply;
10433
10434 getline(ss, rs);
10435 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10436 get_last_committed() + 1));
10437 return true;
10438 } else if (prefix == "osd tier add") {
10439 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10440 if (err == -EAGAIN)
10441 goto wait;
10442 if (err)
10443 goto reply;
10444 string poolstr;
10445 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10446 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10447 if (pool_id < 0) {
10448 ss << "unrecognized pool '" << poolstr << "'";
10449 err = -ENOENT;
10450 goto reply;
10451 }
10452 string tierpoolstr;
10453 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10454 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10455 if (tierpool_id < 0) {
10456 ss << "unrecognized pool '" << tierpoolstr << "'";
10457 err = -ENOENT;
10458 goto reply;
10459 }
10460 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10461 assert(p);
10462 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10463 assert(tp);
10464
10465 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10466 goto reply;
10467 }
10468
10469 // make sure new tier is empty
10470 string force_nonempty;
10471 cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
10472 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
10473 if (pstats && pstats->stats.sum.num_objects != 0 &&
10474 force_nonempty != "--force-nonempty") {
10475 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
10476 err = -ENOTEMPTY;
10477 goto reply;
10478 }
10479 if (tp->ec_pool()) {
10480 ss << "tier pool '" << tierpoolstr
10481 << "' is an ec pool, which cannot be a tier";
10482 err = -ENOTSUP;
10483 goto reply;
10484 }
10485 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
10486 ((force_nonempty != "--force-nonempty") ||
10487 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
10488 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
10489 err = -ENOTEMPTY;
10490 goto reply;
10491 }
10492 // go
10493 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10494 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10495 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10496 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10497 return true;
10498 }
10499 np->tiers.insert(tierpool_id);
10500 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10501 ntp->tier_of = pool_id;
10502 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
10503 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10504 get_last_committed() + 1));
10505 return true;
10506 } else if (prefix == "osd tier remove" ||
10507 prefix == "osd tier rm") {
10508 string poolstr;
10509 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10510 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10511 if (pool_id < 0) {
10512 ss << "unrecognized pool '" << poolstr << "'";
10513 err = -ENOENT;
10514 goto reply;
10515 }
10516 string tierpoolstr;
10517 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10518 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10519 if (tierpool_id < 0) {
10520 ss << "unrecognized pool '" << tierpoolstr << "'";
10521 err = -ENOENT;
10522 goto reply;
10523 }
10524 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10525 assert(p);
10526 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10527 assert(tp);
10528
10529 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
10530 goto reply;
10531 }
10532
10533 if (p->tiers.count(tierpool_id) == 0) {
10534 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10535 err = 0;
10536 goto reply;
10537 }
10538 if (tp->tier_of != pool_id) {
10539 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
10540 << osdmap.get_pool_name(tp->tier_of) << "': "
10541 // be scary about it; this is an inconsistency and bells must go off
10542 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
10543 err = -EINVAL;
10544 goto reply;
10545 }
10546 if (p->read_tier == tierpool_id) {
10547 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
10548 err = -EBUSY;
10549 goto reply;
10550 }
10551 // go
10552 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10553 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10554 if (np->tiers.count(tierpool_id) == 0 ||
10555 ntp->tier_of != pool_id ||
10556 np->read_tier == tierpool_id) {
10557 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10558 return true;
10559 }
10560 np->tiers.erase(tierpool_id);
10561 ntp->clear_tier();
10562 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10563 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10564 get_last_committed() + 1));
10565 return true;
10566 } else if (prefix == "osd tier set-overlay") {
10567 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10568 if (err == -EAGAIN)
10569 goto wait;
10570 if (err)
10571 goto reply;
10572 string poolstr;
10573 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10574 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10575 if (pool_id < 0) {
10576 ss << "unrecognized pool '" << poolstr << "'";
10577 err = -ENOENT;
10578 goto reply;
10579 }
10580 string overlaypoolstr;
10581 cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
10582 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
10583 if (overlaypool_id < 0) {
10584 ss << "unrecognized pool '" << overlaypoolstr << "'";
10585 err = -ENOENT;
10586 goto reply;
10587 }
10588 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10589 assert(p);
10590 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
10591 assert(overlay_p);
10592 if (p->tiers.count(overlaypool_id) == 0) {
10593 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
10594 err = -EINVAL;
10595 goto reply;
10596 }
10597 if (p->read_tier == overlaypool_id) {
10598 err = 0;
10599 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10600 goto reply;
10601 }
10602 if (p->has_read_tier()) {
10603 ss << "pool '" << poolstr << "' has overlay '"
10604 << osdmap.get_pool_name(p->read_tier)
10605 << "'; please remove-overlay first";
10606 err = -EINVAL;
10607 goto reply;
10608 }
10609
10610 // go
10611 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10612 np->read_tier = overlaypool_id;
10613 np->write_tier = overlaypool_id;
10614 np->set_last_force_op_resend(pending_inc.epoch);
10615 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
10616 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
10617 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10618 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
10619 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
10620 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10621 get_last_committed() + 1));
10622 return true;
10623 } else if (prefix == "osd tier remove-overlay" ||
10624 prefix == "osd tier rm-overlay") {
10625 string poolstr;
10626 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10627 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10628 if (pool_id < 0) {
10629 ss << "unrecognized pool '" << poolstr << "'";
10630 err = -ENOENT;
10631 goto reply;
10632 }
10633 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10634 assert(p);
10635 if (!p->has_read_tier()) {
10636 err = 0;
10637 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
10638 goto reply;
10639 }
10640
10641 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
10642 goto reply;
10643 }
10644
10645 // go
10646 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10647 if (np->has_read_tier()) {
10648 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
10649 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
10650 nop->set_last_force_op_resend(pending_inc.epoch);
10651 }
10652 if (np->has_write_tier()) {
10653 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
10654 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
10655 nop->set_last_force_op_resend(pending_inc.epoch);
10656 }
10657 np->clear_read_tier();
10658 np->clear_write_tier();
10659 np->set_last_force_op_resend(pending_inc.epoch);
10660 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
10661 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10662 get_last_committed() + 1));
10663 return true;
10664 } else if (prefix == "osd tier cache-mode") {
10665 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10666 if (err == -EAGAIN)
10667 goto wait;
10668 if (err)
10669 goto reply;
10670 string poolstr;
10671 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10672 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10673 if (pool_id < 0) {
10674 ss << "unrecognized pool '" << poolstr << "'";
10675 err = -ENOENT;
10676 goto reply;
10677 }
10678 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10679 assert(p);
10680 if (!p->is_tier()) {
10681 ss << "pool '" << poolstr << "' is not a tier";
10682 err = -EINVAL;
10683 goto reply;
10684 }
10685 string modestr;
10686 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
10687 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
10688 if (mode < 0) {
10689 ss << "'" << modestr << "' is not a valid cache mode";
10690 err = -EINVAL;
10691 goto reply;
10692 }
10693
10694 string sure;
10695 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10696 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10697 mode != pg_pool_t::CACHEMODE_NONE &&
10698 mode != pg_pool_t::CACHEMODE_PROXY &&
10699 mode != pg_pool_t::CACHEMODE_READPROXY) &&
10700 sure != "--yes-i-really-mean-it") {
10701 ss << "'" << modestr << "' is not a well-supported cache mode and may "
10702 << "corrupt your data. pass --yes-i-really-mean-it to force.";
10703 err = -EPERM;
10704 goto reply;
10705 }
10706
10707 // pool already has this cache-mode set and there are no pending changes
10708 if (p->cache_mode == mode &&
10709 (pending_inc.new_pools.count(pool_id) == 0 ||
10710 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
10711 ss << "set cache-mode for pool '" << poolstr << "'"
10712 << " to " << pg_pool_t::get_cache_mode_name(mode);
10713 err = 0;
10714 goto reply;
10715 }
10716
10717 /* Mode description:
10718 *
10719 * none: No cache-mode defined
10720 * forward: Forward all reads and writes to base pool
10721 * writeback: Cache writes, promote reads from base pool
10722 * readonly: Forward writes to base pool
10723 * readforward: Writes are in writeback mode, Reads are in forward mode
10724 * proxy: Proxy all reads and writes to base pool
10725 * readproxy: Writes are in writeback mode, Reads are in proxy mode
10726 *
10727 * Hence, these are the allowed transitions:
10728 *
10729 * none -> any
10730 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10731 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10732 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
10733 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
10734 * writeback -> readforward || readproxy || forward || proxy
10735 * readonly -> any
10736 */
10737
10738 // We check if the transition is valid against the current pool mode, as
10739 // it is the only committed state thus far. We will blatantly squash
10740 // whatever mode is on the pending state.
10741
10742 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
10743 (mode != pg_pool_t::CACHEMODE_FORWARD &&
10744 mode != pg_pool_t::CACHEMODE_PROXY &&
10745 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10746 mode != pg_pool_t::CACHEMODE_READPROXY)) {
10747 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
10748 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
10749 << "' pool; only '"
10750 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
10751 << "','"
10752 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
10753 << "','"
10754 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
10755 << "','"
10756 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
10757 << "' allowed.";
10758 err = -EINVAL;
10759 goto reply;
10760 }
10761 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
10762 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10763 mode != pg_pool_t::CACHEMODE_FORWARD &&
10764 mode != pg_pool_t::CACHEMODE_PROXY &&
10765 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10766
10767 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
10768 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10769 mode != pg_pool_t::CACHEMODE_FORWARD &&
10770 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10771 mode != pg_pool_t::CACHEMODE_PROXY)) ||
10772
10773 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
10774 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10775 mode != pg_pool_t::CACHEMODE_FORWARD &&
10776 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10777 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10778
10779 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
10780 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10781 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10782 mode != pg_pool_t::CACHEMODE_PROXY &&
10783 mode != pg_pool_t::CACHEMODE_READPROXY))) {
10784
10785 const pool_stat_t* pstats =
10786 mon->pgservice->get_pool_stat(pool_id);
10787
10788 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
10789 ss << "unable to set cache-mode '"
10790 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
10791 << "': dirty objects found";
10792 err = -EBUSY;
10793 goto reply;
10794 }
10795 }
10796 // go
10797 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10798 np->cache_mode = mode;
10799 // set this both when moving to and from cache_mode NONE. this is to
10800 // capture legacy pools that were set up before this flag existed.
10801 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
10802 ss << "set cache-mode for pool '" << poolstr
10803 << "' to " << pg_pool_t::get_cache_mode_name(mode);
10804 if (mode == pg_pool_t::CACHEMODE_NONE) {
10805 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
10806 assert(base_pool);
10807 if (base_pool->read_tier == pool_id ||
10808 base_pool->write_tier == pool_id)
10809 ss <<" (WARNING: pool is still configured as read or write tier)";
10810 }
10811 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10812 get_last_committed() + 1));
10813 return true;
10814 } else if (prefix == "osd tier add-cache") {
10815 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10816 if (err == -EAGAIN)
10817 goto wait;
10818 if (err)
10819 goto reply;
10820 string poolstr;
10821 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10822 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10823 if (pool_id < 0) {
10824 ss << "unrecognized pool '" << poolstr << "'";
10825 err = -ENOENT;
10826 goto reply;
10827 }
10828 string tierpoolstr;
10829 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10830 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10831 if (tierpool_id < 0) {
10832 ss << "unrecognized pool '" << tierpoolstr << "'";
10833 err = -ENOENT;
10834 goto reply;
10835 }
10836 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10837 assert(p);
10838 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10839 assert(tp);
10840
10841 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10842 goto reply;
10843 }
10844
10845 int64_t size = 0;
10846 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
10847 ss << "unable to parse 'size' value '"
10848 << cmd_vartype_stringify(cmdmap["size"]) << "'";
10849 err = -EINVAL;
10850 goto reply;
10851 }
10852 // make sure new tier is empty
10853 const pool_stat_t *pstats =
10854 mon->pgservice->get_pool_stat(tierpool_id);
10855 if (pstats && pstats->stats.sum.num_objects != 0) {
10856 ss << "tier pool '" << tierpoolstr << "' is not empty";
10857 err = -ENOTEMPTY;
10858 goto reply;
10859 }
10860 string modestr = g_conf->osd_tier_default_cache_mode;
10861 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
10862 if (mode < 0) {
10863 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
10864 err = -EINVAL;
10865 goto reply;
10866 }
10867 HitSet::Params hsp;
10868 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
10869 BloomHitSet::Params *bsp = new BloomHitSet::Params;
10870 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
10871 hsp = HitSet::Params(bsp);
10872 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
10873 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
10874 }
10875 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
10876 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
10877 } else {
10878 ss << "osd tier cache default hit set type '" <<
10879 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
10880 err = -EINVAL;
10881 goto reply;
10882 }
10883 // go
10884 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10885 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10886 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10887 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10888 return true;
10889 }
10890 np->tiers.insert(tierpool_id);
10891 np->read_tier = np->write_tier = tierpool_id;
10892 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10893 np->set_last_force_op_resend(pending_inc.epoch);
10894 ntp->set_last_force_op_resend(pending_inc.epoch);
10895 ntp->tier_of = pool_id;
10896 ntp->cache_mode = mode;
10897 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
10898 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
10899 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
10900 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
10901 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
10902 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
10903 ntp->hit_set_params = hsp;
10904 ntp->target_max_bytes = size;
10905 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
10906 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10907 get_last_committed() + 1));
10908 return true;
10909 } else if (prefix == "osd pool set-quota") {
10910 string poolstr;
10911 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10912 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10913 if (pool_id < 0) {
10914 ss << "unrecognized pool '" << poolstr << "'";
10915 err = -ENOENT;
10916 goto reply;
10917 }
10918
10919 string field;
10920 cmd_getval(g_ceph_context, cmdmap, "field", field);
10921 if (field != "max_objects" && field != "max_bytes") {
10922 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
10923 err = -EINVAL;
10924 goto reply;
10925 }
10926
10927 // val could contain unit designations, so we treat as a string
10928 string val;
10929 cmd_getval(g_ceph_context, cmdmap, "val", val);
10930 stringstream tss;
10931 int64_t value = unit_to_bytesize(val, &tss);
10932 if (value < 0) {
10933 ss << "error parsing value '" << value << "': " << tss.str();
10934 err = value;
10935 goto reply;
10936 }
10937
10938 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
10939 if (field == "max_objects") {
10940 pi->quota_max_objects = value;
10941 } else if (field == "max_bytes") {
10942 pi->quota_max_bytes = value;
10943 } else {
10944 assert(0 == "unrecognized option");
10945 }
10946 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
10947 rs = ss.str();
10948 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10949 get_last_committed() + 1));
10950 return true;
10951 } else if (prefix == "osd pool application enable" ||
10952 prefix == "osd pool application disable" ||
10953 prefix == "osd pool application set" ||
10954 prefix == "osd pool application rm") {
10955 err = prepare_command_pool_application(prefix, cmdmap, ss);
10956 if (err == -EAGAIN)
10957 goto wait;
10958 if (err < 0)
10959 goto reply;
10960
10961 getline(ss, rs);
10962 wait_for_finished_proposal(
10963 op, new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
10964 return true;
10965 } else if (prefix == "osd reweight-by-pg" ||
10966 prefix == "osd reweight-by-utilization" ||
10967 prefix == "osd test-reweight-by-pg" ||
10968 prefix == "osd test-reweight-by-utilization") {
10969 bool by_pg =
10970 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
10971 bool dry_run =
10972 prefix == "osd test-reweight-by-pg" ||
10973 prefix == "osd test-reweight-by-utilization";
10974 int64_t oload;
10975 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
10976 set<int64_t> pools;
10977 vector<string> poolnamevec;
10978 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
10979 for (unsigned j = 0; j < poolnamevec.size(); j++) {
10980 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
10981 if (pool < 0) {
10982 ss << "pool '" << poolnamevec[j] << "' does not exist";
10983 err = -ENOENT;
10984 goto reply;
10985 }
10986 pools.insert(pool);
10987 }
10988 double max_change = g_conf->mon_reweight_max_change;
10989 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
10990 if (max_change <= 0.0) {
10991 ss << "max_change " << max_change << " must be positive";
10992 err = -EINVAL;
10993 goto reply;
10994 }
10995 int64_t max_osds = g_conf->mon_reweight_max_osds;
10996 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
10997 if (max_osds <= 0) {
10998 ss << "max_osds " << max_osds << " must be positive";
10999 err = -EINVAL;
11000 goto reply;
11001 }
11002 string no_increasing;
11003 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
11004 string out_str;
11005 mempool::osdmap::map<int32_t, uint32_t> new_weights;
11006 err = mon->pgservice->reweight_by_utilization(osdmap,
11007 oload,
11008 max_change,
11009 max_osds,
11010 by_pg,
11011 pools.empty() ? NULL : &pools,
11012 no_increasing == "--no-increasing",
11013 &new_weights,
11014 &ss, &out_str, f.get());
11015 if (err >= 0) {
11016 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
11017 }
11018 if (f)
11019 f->flush(rdata);
11020 else
11021 rdata.append(out_str);
11022 if (err < 0) {
11023 ss << "FAILED reweight-by-pg";
11024 } else if (err == 0 || dry_run) {
11025 ss << "no change";
11026 } else {
11027 ss << "SUCCESSFUL reweight-by-pg";
11028 pending_inc.new_weight = std::move(new_weights);
11029 wait_for_finished_proposal(
11030 op,
11031 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11032 return true;
11033 }
11034 } else if (prefix == "osd force-create-pg") {
11035 pg_t pgid;
11036 string pgidstr;
11037 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
11038 if (!pgid.parse(pgidstr.c_str())) {
11039 ss << "invalid pgid '" << pgidstr << "'";
11040 err = -EINVAL;
11041 goto reply;
11042 }
11043 bool creating_now;
11044 {
11045 std::lock_guard<std::mutex> l(creating_pgs_lock);
11046 auto emplaced = creating_pgs.pgs.emplace(pgid,
11047 make_pair(osdmap.get_epoch(),
11048 ceph_clock_now()));
11049 creating_now = emplaced.second;
11050 }
11051 if (creating_now) {
11052 ss << "pg " << pgidstr << " now creating, ok";
11053 err = 0;
11054 goto update;
11055 } else {
11056 ss << "pg " << pgid << " already creating";
11057 err = 0;
11058 goto reply;
11059 }
11060 } else {
11061 err = -EINVAL;
11062 }
11063
11064 reply:
11065 getline(ss, rs);
11066 if (err < 0 && rs.length() == 0)
11067 rs = cpp_strerror(err);
11068 mon->reply_command(op, err, rs, rdata, get_last_committed());
11069 return ret;
11070
11071 update:
11072 getline(ss, rs);
11073 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11074 get_last_committed() + 1));
11075 return true;
11076
11077 wait:
11078 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11079 return true;
11080 }
11081
// Read-side handling of a pool op (snapshot create/delete, pool delete,
// auid change).  Answers the request immediately — and returns true —
// whenever no osdmap update is needed: bad fsid, nonexistent pool, invalid
// mode combinations, or idempotent no-ops.  Returns false to forward the
// op to prepare_pool_op() for a paxos update.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());

  // Drop messages that belong to a different cluster.
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  // Pool creation has its own (capability-checking) preprocess path.
  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  // All remaining ops target an existing pool id; a missing pool is
  // treated as already-done (reply 0), which keeps retries idempotent.
  if (!osdmap.get_pg_pool(m->pool)) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    _pool_op_reply(op, 0, osdmap.get_epoch());
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // Pool snaps are incompatible with unmanaged-snaps mode and with
    // cache tier pools.
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // Already created: idempotent success.
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // Unmanaged snaps are incompatible with pool-snaps mode.
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // Already gone: idempotent success.
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (p->is_removed_snap(m->snapid)) {
      // Snap already removed: idempotent success.
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): replies success while a pool with this *name* still
    // exists; only when the name lookup fails does the op proceed to
    // prepare_pool_op_delete() (which deletes by id, m->pool).  Presumably
    // clients deleting by id send an empty name — confirm against Objecter.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // Always needs a map update.
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
11161
11162 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
11163 {
11164 op->mark_osdmon_event(__func__);
11165 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11166 MonSession *session = m->get_session();
11167 if (!session) {
11168 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11169 return true;
11170 }
11171 if (!session->is_capable("osd", MON_CAP_W)) {
11172 dout(5) << "attempt to create new pool without sufficient auid privileges!"
11173 << "message: " << *m << std::endl
11174 << "caps: " << session->caps << dendl;
11175 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11176 return true;
11177 }
11178
11179 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
11180 if (pool >= 0) {
11181 _pool_op_reply(op, 0, osdmap.get_epoch());
11182 return true;
11183 }
11184
11185 return false;
11186 }
11187
// Write-side handling of a pool op.  Creates/deletes dispatch to their own
// prepare functions; everything else is applied to a *projected* copy of
// the pool (pending changes included) and queued as a map update.
// Returns true when a proposal was queued (reply sent after commit),
// false when the op was answered immediately with no map change.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // Early validation against the *committed* pool state.  Ops that fall
  // out of this switch (e.g. AUID_CHANGE) proceed directly to the
  // projected-state handling below.
  switch (m->op) {
    case POOL_OP_CREATE_SNAP:
      // Pool snaps are not allowed on cache tier pools.
      if (pool->is_tier()) {
        ret = -EINVAL;
        _pool_op_reply(op, ret, osdmap.get_epoch());
        return false;
      }  // else, fall through
    case POOL_OP_DELETE_SNAP:
      if (!pool->is_unmanaged_snaps_mode()) {
        bool snap_exists = pool->snap_exists(m->name.c_str());
        // Creating an existing snap, or deleting a missing one, is an
        // idempotent success (ret stays 0); otherwise break out and do
        // the real work below.
        if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
          || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
          ret = 0;
        } else {
          break;
        }
      } else {
        // Pool snap ops are invalid in unmanaged-snaps mode.
        ret = -EINVAL;
      }
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;

    case POOL_OP_DELETE_UNMANAGED_SNAP:
      // we won't allow removal of an unmanaged snapshot from a pool
      // not in unmanaged snaps mode.
      if (!pool->is_unmanaged_snaps_mode()) {
        _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
        return false;
      }
      /* fall-thru */
    case POOL_OP_CREATE_UNMANAGED_SNAP:
      // but we will allow creating an unmanaged snapshot on any pool
      // as long as it is not in 'pool' snaps mode.
      if (pool->is_pool_snaps_mode()) {
        _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
        return false;
      }
  }

  // projected pool info: start from any already-pending update so we
  // don't clobber other uncommitted changes to this pool.
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked here against the projected state, which may differ from
  // the committed state checked above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Apply the op to the projected pool; 'changed' tracks whether we need
  // to stage pp into pending_inc.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // snapid is populated by add_unmanaged_snap (presumably an output
      // parameter — signature not visible here) and returned to the client.
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // Reply (with ret and any reply_data) once the pending epoch commits.
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
11331
11332 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
11333 {
11334 op->mark_osdmon_event(__func__);
11335 int err = prepare_new_pool(op);
11336 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
11337 return true;
11338 }
11339
11340 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
11341 ostream *ss)
11342 {
11343 const string& poolstr = osdmap.get_pool_name(pool_id);
11344
11345 // If the Pool is in use by CephFS, refuse to delete it
11346 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
11347 if (pending_fsmap.pool_in_use(pool_id)) {
11348 *ss << "pool '" << poolstr << "' is in use by CephFS";
11349 return -EBUSY;
11350 }
11351
11352 if (pool.tier_of >= 0) {
11353 *ss << "pool '" << poolstr << "' is a tier of '"
11354 << osdmap.get_pool_name(pool.tier_of) << "'";
11355 return -EBUSY;
11356 }
11357 if (!pool.tiers.empty()) {
11358 *ss << "pool '" << poolstr << "' has tiers";
11359 for(auto tier : pool.tiers) {
11360 *ss << " " << osdmap.get_pool_name(tier);
11361 }
11362 return -EBUSY;
11363 }
11364
11365 if (!g_conf->mon_allow_pool_delete) {
11366 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
11367 return -EPERM;
11368 }
11369
11370 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
11371 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
11372 return -EPERM;
11373 }
11374
11375 *ss << "pool '" << poolstr << "' removed";
11376 return 0;
11377 }
11378
11379 /**
11380 * Check if it is safe to add a tier to a base pool
11381 *
11382 * @return
11383 * True if the operation should proceed, false if we should abort here
11384 * (abort doesn't necessarily mean error, could be idempotency)
11385 */
11386 bool OSDMonitor::_check_become_tier(
11387 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
11388 const int64_t base_pool_id, const pg_pool_t *base_pool,
11389 int *err,
11390 ostream *ss) const
11391 {
11392 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
11393 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11394
11395 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11396 if (pending_fsmap.pool_in_use(tier_pool_id)) {
11397 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
11398 *err = -EBUSY;
11399 return false;
11400 }
11401
11402 if (base_pool->tiers.count(tier_pool_id)) {
11403 assert(tier_pool->tier_of == base_pool_id);
11404 *err = 0;
11405 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
11406 << base_pool_name << "'";
11407 return false;
11408 }
11409
11410 if (base_pool->is_tier()) {
11411 *ss << "pool '" << base_pool_name << "' is already a tier of '"
11412 << osdmap.get_pool_name(base_pool->tier_of) << "', "
11413 << "multiple tiers are not yet supported.";
11414 *err = -EINVAL;
11415 return false;
11416 }
11417
11418 if (tier_pool->has_tiers()) {
11419 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
11420 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
11421 it != tier_pool->tiers.end(); ++it)
11422 *ss << "'" << osdmap.get_pool_name(*it) << "',";
11423 *ss << " multiple tiers are not yet supported.";
11424 *err = -EINVAL;
11425 return false;
11426 }
11427
11428 if (tier_pool->is_tier()) {
11429 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
11430 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
11431 *err = -EINVAL;
11432 return false;
11433 }
11434
11435 *err = 0;
11436 return true;
11437 }
11438
11439
11440 /**
11441 * Check if it is safe to remove a tier from this base pool
11442 *
11443 * @return
11444 * True if the operation should proceed, false if we should abort here
11445 * (abort doesn't necessarily mean error, could be idempotency)
11446 */
11447 bool OSDMonitor::_check_remove_tier(
11448 const int64_t base_pool_id, const pg_pool_t *base_pool,
11449 const pg_pool_t *tier_pool,
11450 int *err, ostream *ss) const
11451 {
11452 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11453
11454 // Apply CephFS-specific checks
11455 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11456 if (pending_fsmap.pool_in_use(base_pool_id)) {
11457 if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
11458 // If the underlying pool is erasure coded, we can't permit the
11459 // removal of the replicated tier that CephFS relies on to access it
11460 *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
11461 *err = -EBUSY;
11462 return false;
11463 }
11464
11465 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
11466 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
11467 "tier is still in use as a writeback cache. Change the cache "
11468 "mode and flush the cache before removing it";
11469 *err = -EBUSY;
11470 return false;
11471 }
11472 }
11473
11474 *err = 0;
11475 return true;
11476 }
11477
// Stage removal of a pool in pending_inc, after safety checks.
// If mon_fake_pool_delete is set (and no_fake is false) the pool is only
// renamed to "<name>.<id>.DELETED" instead of being removed.  Also scrubs
// all per-pg overrides (pg_temp, primary_temp, pg_upmap, pg_upmap_items)
// and any CRUSH choose_args belonging to the pool.
// Returns 0 on success (or if removal is already pending), a negative
// errno if removal is refused, or -EAGAIN if the pending pool state
// fails the checks and the op should be retried.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // Check against the committed pool state first.
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  // If the pool also has uncommitted changes, re-check against those.
  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // Idempotency: removal already staged in this pending increment.
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
             << dendl;
    return 0;
  }

  // Fake deletion: rename instead of removing, so data is recoverable.
  if (g_conf->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
            << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
               << p->first << dendl;
      // An empty pg_temp entry in the increment clears the mapping.
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete primary_temp" << p->first << dendl;
      // -1 clears the primary_temp override.
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    // Re-encode the whole crush map into the pending increment.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
11565
11566 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
11567 {
11568 dout(10) << "_prepare_rename_pool " << pool << dendl;
11569 if (pending_inc.old_pools.count(pool)) {
11570 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
11571 return -ENOENT;
11572 }
11573 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
11574 p != pending_inc.new_pool_names.end();
11575 ++p) {
11576 if (p->second == newname && p->first != pool) {
11577 return -EEXIST;
11578 }
11579 }
11580
11581 pending_inc.new_pool_names[pool] = newname;
11582 return 0;
11583 }
11584
11585 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
11586 {
11587 op->mark_osdmon_event(__func__);
11588 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11589 ostringstream ss;
11590 int ret = _prepare_remove_pool(m->pool, &ss, false);
11591 if (ret == -EAGAIN) {
11592 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11593 return true;
11594 }
11595 if (ret < 0)
11596 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
11597 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
11598 pending_inc.epoch));
11599 return true;
11600 }
11601
11602 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
11603 int ret, epoch_t epoch, bufferlist *blp)
11604 {
11605 op->mark_osdmon_event(__func__);
11606 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11607 dout(20) << "_pool_op_reply " << ret << dendl;
11608 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
11609 ret, epoch, get_last_committed(), blp);
11610 mon->send_reply(op, reply);
11611 }