// ceph/src/mon/OSDMonitor.cc
// (commit ddfeb2a2933824e94601fc17181314a5edce8a88)
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <algorithm>
#include <sstream>

#include "mon/OSDMonitor.h"
#include "mon/Monitor.h"
#include "mon/MDSMonitor.h"
#include "mon/PGMonitor.h"
#include "mon/MgrStatMonitor.h"
#include "mon/AuthMonitor.h"
#include "mon/ConfigKeyService.h"

#include "mon/MonitorDBStore.h"
#include "mon/Session.h"

#include "crush/CrushWrapper.h"
#include "crush/CrushTester.h"
#include "crush/CrushTreeDumper.h"

#include "messages/MOSDBeacon.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDAlive.h"
#include "messages/MPoolOp.h"
#include "messages/MPoolOpReply.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MMonCommand.h"
#include "messages/MRemoveSnaps.h"
#include "messages/MOSDScrub.h"
#include "messages/MRoute.h"

#include "common/TextTable.h"
#include "common/Timer.h"
#include "common/ceph_argparse.h"
#include "common/perf_counters.h"
#include "common/strtol.h"

#include "common/config.h"
#include "common/errno.h"

#include "erasure-code/ErasureCodePlugin.h"
#include "compressor/Compressor.h"
#include "common/Checksummer.h"

#include "include/compat.h"
#include "include/assert.h"
#include "include/stringify.h"
#include "include/util.h"
#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/str_map.h"

#include "json_spirit/json_spirit_reader.h"

#define dout_subsys ceph_subsys_mon
#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
81 void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
82 {
83 if (epoch_by_pg.size() <= ps) {
84 epoch_by_pg.resize(ps + 1, 0);
85 }
86 const auto old_lec = epoch_by_pg[ps];
87 if (old_lec >= last_epoch_clean) {
88 // stale lec
89 return;
90 }
91 epoch_by_pg[ps] = last_epoch_clean;
92 if (last_epoch_clean < floor) {
93 floor = last_epoch_clean;
94 } else if (last_epoch_clean > floor) {
95 if (old_lec == floor) {
96 // probably should increase floor?
97 auto new_floor = std::min_element(std::begin(epoch_by_pg),
98 std::end(epoch_by_pg));
99 floor = *new_floor;
100 }
101 }
102 if (ps != next_missing) {
103 return;
104 }
105 for (; next_missing < epoch_by_pg.size(); next_missing++) {
106 if (epoch_by_pg[next_missing] == 0) {
107 break;
108 }
109 }
110 }
111
112 void LastEpochClean::remove_pool(uint64_t pool)
113 {
114 report_by_pool.erase(pool);
115 }
116
117 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
118 {
119 auto& lec = report_by_pool[pg.pool()];
120 return lec.report(pg.ps(), last_epoch_clean);
121 }
122
123 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
124 {
125 auto floor = latest.get_epoch();
126 for (auto& pool : latest.get_pools()) {
127 auto reported = report_by_pool.find(pool.first);
128 if (reported == report_by_pool.end()) {
129 return 0;
130 }
131 if (reported->second.next_missing < pool.second.get_pg_num()) {
132 return 0;
133 }
134 if (reported->second.floor < floor) {
135 floor = reported->second.floor;
136 }
137 }
138 return floor;
139 }
140
141
// Completion callback armed by OSDMonitor::start_mapping(): when the
// background OSDMap->PG mapping job for `epoch` finishes successfully,
// fold the results into the creating-pgs state and wake any
// pg-create subscribers.
struct C_UpdateCreatingPGs : public Context {
  OSDMonitor *osdmon;
  utime_t start;   // when the mapping job was kicked off (for the timing log)
  epoch_t epoch;   // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the mapping job was aborted/canceled; do nothing then
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
158
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
161 static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
162 return *_dout << "mon." << mon->name << "@" << mon->rank
163 << "(" << mon->get_state_name()
164 << ").osd e" << osdmap.get_epoch() << " ";
165 }
166
// Construct the OSD paxos service.  Both the incremental and the full
// osdmap LRU caches are sized by mon_osd_cache_size; the parallel
// PG-mapping helper borrows the monitor's shared CPU thread pool.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf->mon_osd_cache_size),
   full_osd_cache(g_conf->mon_osd_cache_size),
   last_attempted_minwait_time(utime_t()),
   mapper(mn->cct, &mn->cpu_tp),
   op_tracker(cct, true, 1)
{}
180
181 bool OSDMonitor::_have_pending_crush()
182 {
183 return pending_inc.crush.length() > 0;
184 }
185
186 CrushWrapper &OSDMonitor::_get_stable_crush()
187 {
188 return *osdmap.crush;
189 }
190
191 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
192 {
193 bufferlist bl;
194 if (pending_inc.crush.length())
195 bl = pending_inc.crush;
196 else
197 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
198
199 bufferlist::iterator p = bl.begin();
200 newcrush.decode(p);
201 }
202
// Build the very first OSDMap (epoch 1) for a brand-new cluster and
// stage it into pending_inc as a full map.  Either adopts an osdmap
// stashed in the store at mkfs time, or builds a simple default map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // a seed osdmap was provided at mkfs time; adopt it but force our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0,
			g_conf->osd_pg_bits, g_conf->osd_pgp_bits);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  // new cluster should require latest by default
  if (g_conf->mon_debug_no_require_luminous) {
    newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
    derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
  } else {
    newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    // config ratios > 1.0 are treated as percentages and scaled down
    newmap.full_ratio = g_conf->mon_osd_full_ratio;
    if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
    newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
    newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
    if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
    // translate the configured min-compat client name to a release id;
    // r <= 0 means the configured name is not a known release
    int r = ceph_release_from_name(
      g_conf->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  newmap.encode(pending_inc.fullmap,
		mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
251
252 void OSDMonitor::get_store_prefixes(std::set<string>& s)
253 {
254 s.insert(service_name);
255 s.insert(OSD_PG_CREATING_PREFIX);
256 }
257
// Bring the in-memory osdmap up to date with the latest paxos-committed
// version: possibly jump to a stashed full map first, then replay the
// remaining incrementals one by one, writing out a full map per epoch
// and keeping derived state (creating_pgs, down_pending_out, pgservice,
// msgr features, subscriptions) in sync.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job was computed against the old epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest so we land on the most recent stored full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  // fast-forward by decoding the newest stored full map, if it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  // load persisted creating-pgs state (luminous+ quorum only)
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    bufferlist bl;
    if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
      auto p = bl.begin();
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      creating_pgs.decode(p);
      dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	      << creating_pgs.last_scan_epoch
	      << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
    } else {
      dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	      << dendl;
    }
  }

  // make sure we're using the right pg service.. remove me post-luminous!
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    dout(10) << __func__ << " pgservice is mgrstat" << dendl;
    mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
  } else {
    dout(10) << __func__ << " pgservice is pg" << dendl;
    mon->pgservice = mon->pgmon()->get_pg_stat_service();
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;
	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);
      }
    } else {
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is no longer needed once epoch 1 is applied
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // make sure we're using the right pg service.. remove me post-luminous!
    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " pgservice is mgrstat" << dendl;
      mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
    } else {
      dout(10) << __func__ << " pgservice is pg" << dendl;
      mon->pgservice = mon->pgmon()->get_pg_stat_service();
    }

    // flush periodically so one giant transaction doesn't accumulate
    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    if (mon->monmap->get_required_features().contains_all(
	  ceph::features::mon::FEATURE_LUMINOUS)) {
      for (const auto &osd_state : inc.new_state) {
	if (osd_state.second & CEPH_OSD_UP) {
	  // could be marked up *or* down, but we're too lazy to check which
	  last_osd_report.erase(osd_state.first);
	}
	if (osd_state.second & CEPH_OSD_EXISTS) {
	  // could be created *or* destroyed, but we can safely drop it
	  osd_epochs.erase(osd_state.first);
	}
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile down_pending_out with the new map's up/down/out states
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
489
// Kick off (or restart) the asynchronous OSDMap->PG mapping job for the
// current osdmap epoch.  C_UpdateCreatingPGs consumes the result when
// the job completes.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    // an earlier job may still be running against a stale epoch
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
	     << dendl;
    mapping_job->abort();
  }
  auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
  mapping_job = mapping.start_update(osdmap, mapper,
				     g_conf->mon_osd_mapping_pgs_per_chunk);
  dout(10) << __func__ << " started mapping job " << mapping_job.get()
	   << " at " << fin->start << dendl;
  // fin fires on completion (r >= 0) or abort (r < 0)
  mapping_job->set_finish_event(fin);
}
505
506 void OSDMonitor::update_msgr_features()
507 {
508 set<int> types;
509 types.insert((int)entity_name_t::TYPE_OSD);
510 types.insert((int)entity_name_t::TYPE_CLIENT);
511 types.insert((int)entity_name_t::TYPE_MDS);
512 types.insert((int)entity_name_t::TYPE_MON);
513 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
514 uint64_t mask;
515 uint64_t features = osdmap.get_features(*q, &mask);
516 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
517 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
518 Messenger::Policy p = mon->messenger->get_policy(*q);
519 p.features_required = (p.features_required & ~mask) | features;
520 mon->messenger->set_policy(*q, p);
521 }
522 }
523 }
524
525 void OSDMonitor::on_active()
526 {
527 update_logger();
528
529 if (mon->is_leader()) {
530 mon->clog->info() << "osdmap " << osdmap;
531 } else {
532 list<MonOpRequestRef> ls;
533 take_all_failures(ls);
534 while (!ls.empty()) {
535 MonOpRequestRef op = ls.front();
536 op->mark_osdmon_event(__func__);
537 dispatch(op);
538 ls.pop_front();
539 }
540 }
541 start_mapping();
542 }
543
// Called when the monitor restarts (e.g. after an election): clear the
// stale per-OSD report timestamps and, on the leader only, repair
// legacy crush maps whose rule ids do not match their ruleset ids.
void OSDMonitor::on_restart()
{
  last_osd_report.clear();

  if (mon->is_leader()) {
    // fix ruleset != ruleid
    if (osdmap.crush->has_legacy_rulesets() &&
	!osdmap.crush->has_multirule_rulesets()) {
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);
      int r = newcrush.renumber_rules_by_ruleset();
      if (r >= 0) {
	// stage the renumbered crush map into the pending incremental
	dout(1) << __func__ << " crush map has ruleset != rule id; fixing" << dendl;
	pending_inc.crush.clear();
	newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
      } else {
	// renumbering can fail (r < 0); leave the map untouched
	dout(10) << __func__ << " unable to renumber rules by ruleset" << dendl;
      }
    }
  }
}
565
// Shutdown hook: cancel any in-flight mapping job and drop any queued
// failure reports (their waiters are simply discarded).
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
	     << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
580
// Refresh the cluster-level perf counters that mirror osdmap state
// (osd counts and current epoch).
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
590
// Start a fresh pending incremental for the next epoch: clean up stale
// pg_temp/primary_temp entries and make sure the full/backfillfull/
// nearfull ratios are populated (migrating them from the PGMap on
// pre-luminous upgrades).
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // clean up pg_temp, primary_temp
  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
  dout(10) << "create_pending did clean_temps" << dendl;

  // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
  // instead of osd_backfill_full_ratio config
  if (osdmap.backfillfull_ratio <= 0) {
    // config values > 1.0 are treated as percentages and scaled down
    pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (pending_inc.new_backfillfull_ratio > 1.0)
      pending_inc.new_backfillfull_ratio /= 100;
    dout(1) << __func__ << " setting backfillfull_ratio = "
	    << pending_inc.new_backfillfull_ratio << dendl;
  }
  if (osdmap.get_epoch() > 0 &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // transition full ratios from PGMap to OSDMap (on upgrade)
    float full_ratio = mon->pgservice->get_full_ratio();
    float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
    if (osdmap.full_ratio != full_ratio) {
      dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
	       << " -> " << full_ratio << " (from pgmap)" << dendl;
      pending_inc.new_full_ratio = full_ratio;
    }
    if (osdmap.nearfull_ratio != nearfull_ratio) {
      dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
	       << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
      pending_inc.new_nearfull_ratio = nearfull_ratio;
    }
  } else {
    // safety check (this shouldn't really happen)
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
	pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
	      << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
	pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
	      << pending_inc.new_nearfull_ratio << dendl;
    }
  }
}
644
// Advance the creating-pgs state for the incremental `inc` that is
// about to be committed: fold in new/deleted pools, drop pgs already
// reported created, and move queued pgs into the active creating set
// (bounded by mon_osd_max_creating_pgs).  Returns the updated state
// for the caller to persist alongside the map.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // snapshot the committed state under the lock, then work on the copy
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      // pre-luminous: the pgmon-side pg service may still know of
      // creating pgs we have not picked up yet
      auto added =
	mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
					       osdmap.get_pools(),
					       &pending_creatings);
      dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
    }
    // queue pgs from both existing pools and pools created in this inc
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // process queue
  unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's range as the cap allows
    int n = MIN(max - pending_creatings.pgs.size(),
		p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
						    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;
  dout(10) << __func__ << " " << pending_creatings.pgs.size() - total
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
737
// Before committing pending_inc, pre-populate pg_temp entries for PGs
// whose acting set is about to change, so clients keep a working
// mapping through the transition.  Depending on how widespread the
// change is, either remap everything ("all") or only the PGs touching
// a small set of "interesting" OSDs, with a time budget either way.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    // a new crush map can move anything; consider every pg
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds going down while currently up are "interesting"
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs from anywhere; do everything
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs-per-osd (sampled from the first osd)
    // times number of interesting osds; if that approaches the cost of
    // a full pass, just do the full pass
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // the map we are about to commit, for computing the new mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (all) {
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      // ran out of time budget; give up on the remainder
      dout(10) << __func__ << " did not finish in "
	       << g_conf->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf->mon_osd_prime_pg_temp_max_time;
    // check the clock only every `chunk` pgs to keep overhead low
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  // already handled via another interesting osd
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
838
// For one PG, compare its current acting set with what the next map
// would produce; if the mapping is about to change (and the old set is
// usable), stage the old acting set as a pg_temp entry in pending_inc
// so clients retain a working mapping during the transition.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // skip pgs that are still being created -- they have no useful
  // previous mapping to preserve
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    // TODO: remove this creating_pgs direct access?
    if (creating_pgs.pgs.count(pgid)) {
      return;
    }
  } else {
    if (mon->pgservice->is_creating_pg(pgid)) {
      return;
    }
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the committed map's precomputed mapping)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the map about to be committed
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting)
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    Mutex::Locker l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (map::emplace will not overwrite an existing new_pg_temp entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
886
887 /**
888 * @note receiving a transaction in this function gives a fair amount of
889 * freedom to the service implementation if it does need it. It shouldn't.
890 */
891 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
892 {
893 dout(10) << "encode_pending e " << pending_inc.epoch
894 << dendl;
895
896 // finalize up pending_inc
897 pending_inc.modified = ceph_clock_now();
898
899 int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
900 assert(r == 0);
901
902 if (mapping_job) {
903 if (!mapping_job->is_done()) {
904 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
905 << mapping_job.get() << " did not complete, "
906 << mapping_job->shards << " left" << dendl;
907 mapping_job->abort();
908 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
909 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
910 << mapping_job.get() << " is prior epoch "
911 << mapping.get_epoch() << dendl;
912 } else {
913 if (g_conf->mon_osd_prime_pg_temp) {
914 maybe_prime_pg_temp();
915 }
916 }
917 } else if (g_conf->mon_osd_prime_pg_temp) {
918 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
919 << dendl;
920 }
921 mapping_job.reset();
922
923 bufferlist bl;
924
925 {
926 OSDMap tmp;
927 tmp.deepish_copy_from(osdmap);
928 tmp.apply_incremental(pending_inc);
929
930 if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
931 // set or clear full/nearfull?
932 int full, backfill, nearfull;
933 tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
934 if (full > 0) {
935 if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
936 dout(10) << __func__ << " setting full flag" << dendl;
937 add_flag(CEPH_OSDMAP_FULL);
938 remove_flag(CEPH_OSDMAP_NEARFULL);
939 }
940 } else {
941 if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
942 dout(10) << __func__ << " clearing full flag" << dendl;
943 remove_flag(CEPH_OSDMAP_FULL);
944 }
945 if (nearfull > 0) {
946 if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
947 dout(10) << __func__ << " setting nearfull flag" << dendl;
948 add_flag(CEPH_OSDMAP_NEARFULL);
949 }
950 } else {
951 if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
952 dout(10) << __func__ << " clearing nearfull flag" << dendl;
953 remove_flag(CEPH_OSDMAP_NEARFULL);
954 }
955 }
956 }
957
958 // min_compat_client?
959 if (tmp.require_min_compat_client == 0) {
960 auto mv = tmp.get_min_compat_client();
961 dout(1) << __func__ << " setting require_min_compat_client to currently "
962 << "required " << ceph_release_name(mv) << dendl;
963 mon->clog->info() << "setting require_min_compat_client to currently "
964 << "required " << ceph_release_name(mv);
965 pending_inc.new_require_min_compat_client = mv;
966 }
967 }
968 }
969
970 // tell me about it
971 for (auto i = pending_inc.new_state.begin();
972 i != pending_inc.new_state.end();
973 ++i) {
974 int s = i->second ? i->second : CEPH_OSD_UP;
975 if (s & CEPH_OSD_UP)
976 dout(2) << " osd." << i->first << " DOWN" << dendl;
977 if (s & CEPH_OSD_EXISTS)
978 dout(2) << " osd." << i->first << " DNE" << dendl;
979 }
980 for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
981 i != pending_inc.new_up_client.end();
982 ++i) {
983 //FIXME: insert cluster addresses too
984 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
985 }
986 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
987 i != pending_inc.new_weight.end();
988 ++i) {
989 if (i->second == CEPH_OSD_OUT) {
990 dout(2) << " osd." << i->first << " OUT" << dendl;
991 } else if (i->second == CEPH_OSD_IN) {
992 dout(2) << " osd." << i->first << " IN" << dendl;
993 } else {
994 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
995 }
996 }
997
998 // features for osdmap and its incremental
999 uint64_t features = mon->get_quorum_con_features();
1000
1001 // encode full map and determine its crc
1002 OSDMap tmp;
1003 {
1004 tmp.deepish_copy_from(osdmap);
1005 tmp.apply_incremental(pending_inc);
1006
1007 // determine appropriate features
1008 if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
1009 dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
1010 << dendl;
1011 features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
1012 }
1013 if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
1014 dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
1015 << "MSG_ADDR2" << dendl;
1016 features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1017 CEPH_FEATURE_MSG_ADDR2);
1018 }
1019 if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
1020 dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
1021 features &= ~CEPH_FEATURE_SERVER_JEWEL;
1022 }
1023 dout(10) << __func__ << " encoding full map with " << features << dendl;
1024
1025 bufferlist fullbl;
1026 ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1027 pending_inc.full_crc = tmp.get_crc();
1028
1029 // include full map in the txn. note that old monitors will
1030 // overwrite this. new ones will now skip the local full map
1031 // encode and reload from this.
1032 put_version_full(t, pending_inc.epoch, fullbl);
1033 }
1034
1035 // encode
1036 assert(get_last_committed() + 1 == pending_inc.epoch);
1037 ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1038
1039 dout(20) << " full_crc " << tmp.get_crc()
1040 << " inc_crc " << pending_inc.inc_crc << dendl;
1041
1042 /* put everything in the transaction */
1043 put_version(t, pending_inc.epoch, bl);
1044 put_last_committed(t, pending_inc.epoch);
1045
1046 // metadata, too!
1047 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1048 p != pending_metadata.end();
1049 ++p)
1050 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1051 for (set<int>::iterator p = pending_metadata_rm.begin();
1052 p != pending_metadata_rm.end();
1053 ++p)
1054 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1055 pending_metadata.clear();
1056 pending_metadata_rm.clear();
1057
1058 // and pg creating, also!
1059 if (mon->monmap->get_required_features().contains_all(
1060 ceph::features::mon::FEATURE_LUMINOUS)) {
1061 auto pending_creatings = update_pending_pgs(pending_inc);
1062 if (osdmap.get_epoch() &&
1063 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
1064 dout(7) << __func__ << " in the middle of upgrading, "
1065 << " trimming pending creating_pgs using pgmap" << dendl;
1066 mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
1067 }
1068 bufferlist creatings_bl;
1069 ::encode(pending_creatings, creatings_bl);
1070 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1071 }
1072 }
1073
1074 void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
1075 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
1076 {
1077 auto p = creating_pgs->pgs.begin();
1078 while (p != creating_pgs->pgs.end()) {
1079 auto q = pg_stat.find(p->first);
1080 if (q != pg_stat.end() &&
1081 !(q->second.state & PG_STATE_CREATING)) {
1082 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1083 << dendl;
1084 p = creating_pgs->pgs.erase(p);
1085 } else {
1086 ++p;
1087 }
1088 }
1089 }
1090
1091 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1092 {
1093 bufferlist bl;
1094 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1095 if (r < 0)
1096 return r;
1097 try {
1098 bufferlist::iterator p = bl.begin();
1099 ::decode(m, p);
1100 }
1101 catch (buffer::error& e) {
1102 if (err)
1103 *err << "osd." << osd << " metadata is corrupt";
1104 return -EIO;
1105 }
1106 return 0;
1107 }
1108
1109 void OSDMonitor::count_metadata(const string& field, Formatter *f)
1110 {
1111 map<string,int> by_val;
1112 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1113 if (osdmap.is_up(osd)) {
1114 map<string,string> meta;
1115 load_metadata(osd, meta, nullptr);
1116 auto p = meta.find(field);
1117 if (p == meta.end()) {
1118 by_val["unknown"]++;
1119 } else {
1120 by_val[p->second]++;
1121 }
1122 }
1123 }
1124 f->open_object_section(field.c_str());
1125 for (auto& p : by_val) {
1126 f->dump_int(p.first.c_str(), p.second);
1127 }
1128 f->close_section();
1129 }
1130
1131 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1132 {
1133 map<string, string> metadata;
1134 int r = load_metadata(osd, metadata, nullptr);
1135 if (r < 0)
1136 return r;
1137
1138 auto it = metadata.find("osd_objectstore");
1139 if (it == metadata.end())
1140 return -ENOENT;
1141 *type = it->second;
1142 return 0;
1143 }
1144
1145 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1146 const pg_pool_t &pool,
1147 ostream *err)
1148 {
1149 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1150 // since filestore osds could always join the pool later
1151 set<int> checked_osds;
1152 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1153 vector<int> up, acting;
1154 pg_t pgid(ps, pool_id, -1);
1155 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1156 for (int osd : up) {
1157 if (checked_osds.find(osd) != checked_osds.end())
1158 continue;
1159 string objectstore_type;
1160 int r = get_osd_objectstore_type(osd, &objectstore_type);
1161 // allow with missing metadata, e.g. due to an osd never booting yet
1162 if (r < 0 || objectstore_type == "bluestore") {
1163 checked_osds.insert(osd);
1164 continue;
1165 }
1166 *err << "osd." << osd << " uses " << objectstore_type;
1167 return false;
1168 }
1169 }
1170 return true;
1171 }
1172
1173 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1174 {
1175 map<string,string> m;
1176 if (int r = load_metadata(osd, m, err))
1177 return r;
1178 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1179 f->dump_string(p->first.c_str(), p->second);
1180 return 0;
1181 }
1182
1183 void OSDMonitor::print_nodes(Formatter *f)
1184 {
1185 // group OSDs by their hosts
1186 map<string, list<int> > osds; // hostname => osd
1187 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1188 map<string, string> m;
1189 if (load_metadata(osd, m, NULL)) {
1190 continue;
1191 }
1192 map<string, string>::iterator hostname = m.find("hostname");
1193 if (hostname == m.end()) {
1194 // not likely though
1195 continue;
1196 }
1197 osds[hostname->second].push_back(osd);
1198 }
1199
1200 dump_services(f, osds, "osd");
1201 }
1202
void OSDMonitor::share_map_with_random_osd()
{
  // Send the single newest incremental map to one randomly chosen up
  // osd that has a session with us; no-op if none is available.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
1223
version_t OSDMonitor::get_trim_to()
{
  // Return the oldest osdmap epoch it is currently safe to trim to,
  // or 0 if trimming is not safe right now.
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  epoch_t floor;
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    {
      // TODO: Get this hidden in PGStatService
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      // never trim while pg creations are still pending
      if (!creating_pgs.pgs.empty()) {
	return 0;
      }
    }
    floor = get_min_last_epoch_clean();
  } else {
    // pre-luminous: defer to the pg service's view of creating pgs and
    // the min last-epoch-clean
    if (!mon->pgservice->is_readable())
      return 0;
    if (mon->pgservice->have_creating_pgs()) {
      return 0;
    }
    floor = mon->pgservice->get_min_last_epoch_clean();
  }
  {
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // explicit override via config, capped at what we have committed
    if (g_conf->mon_osd_force_trim_to > 0 &&
	g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs worth of maps
    unsigned min = g_conf->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim target if it would actually trim something
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1269
1270 epoch_t OSDMonitor::get_min_last_epoch_clean() const
1271 {
1272 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1273 // also scan osd epochs
1274 // don't trim past the oldest reported osd epoch
1275 for (auto& osd_epoch : osd_epochs) {
1276 if (osd_epoch.second < floor) {
1277 floor = osd_epoch.second;
1278 }
1279 }
1280 return floor;
1281 }
1282
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
				   version_t first)
{
  // When trimming, also write a full copy of the map at epoch 'first'
  // (the new oldest epoch) into the transaction.
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);
}
1291
1292 // -------------
1293
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // Dispatch an incoming message for the read-only phase.  Each
  // preprocess_* handler returns true if the message was fully handled
  // there, or false if it needs a map update via prepare_update().
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    return preprocess_command(op);
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // unexpected message type routed to this service
    ceph_abort();
    return true;
  }
}
1336
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Dispatch a message that preprocess_query() decided needs a map
  // update.  Each prepare_* handler returns true if pending_inc was
  // changed and a proposal should follow.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    return prepare_command(op);

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // unexpected message type routed to this service
    ceph_abort();
  }

  return false;
}
1378
bool OSDMonitor::should_propose(double& delay)
{
  // Decide whether pending changes should be proposed now (and with
  // what delay), overriding the generic PaxosService policy for a few
  // urgent cases.
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately!  any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  // propose as fast as possible if updating up_thru or pg_temp
  // want to merge OSDMap changes as much as possible
  if ((pending_inc.new_primary_temp.size() == 1
      || pending_inc.new_up_thru.size() == 1)
      && pending_inc.new_state.size() < 2) {
    dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;

    utime_t now = ceph_clock_now();
    // rate-limit the fast path: only take it if we haven't recently,
    // and the last paxos commit is not too fresh
    if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
	&& now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
      delay = g_conf->paxos_min_wait;
      last_attempted_minwait_time = now;
      return true;
    }
  }

  // fall back to the generic policy
  return PaxosService::should_propose(delay);
}
1415
1416
1417
1418 // ---------------------------
1419 // READs
1420
1421 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
1422 {
1423 op->mark_osdmon_event(__func__);
1424 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
1425 dout(10) << __func__ << " " << *m << dendl;
1426 MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
1427 epoch_t first = get_first_committed();
1428 epoch_t last = osdmap.get_epoch();
1429 int max = g_conf->osd_map_message_max;
1430 for (epoch_t e = MAX(first, m->get_full_first());
1431 e <= MIN(last, m->get_full_last()) && max > 0;
1432 ++e, --max) {
1433 int r = get_version_full(e, reply->maps[e]);
1434 assert(r >= 0);
1435 }
1436 for (epoch_t e = MAX(first, m->get_inc_first());
1437 e <= MIN(last, m->get_inc_last()) && max > 0;
1438 ++e, --max) {
1439 int r = get_version(e, reply->incremental_maps[e]);
1440 assert(r >= 0);
1441 }
1442 reply->oldest_map = first;
1443 reply->newest_map = last;
1444 mon->send_reply(op, reply);
1445 return true;
1446 }
1447
1448
1449 // ---------------------------
1450 // UPDATEs
1451
1452 // failure --
1453
1454 bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1455 // check permissions
1456 MonSession *session = m->get_session();
1457 if (!session)
1458 return true;
1459 if (!session->is_capable("osd", MON_CAP_X)) {
1460 dout(0) << "got MOSDFailure from entity with insufficient caps "
1461 << session->caps << dendl;
1462 return true;
1463 }
1464 if (fsid != mon->monmap->fsid) {
1465 dout(0) << "check_source: on fsid " << fsid
1466 << " != " << mon->monmap->fsid << dendl;
1467 return true;
1468 }
1469 return false;
1470 }
1471
1472
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Filter an incoming MOSDFailure report.  Returns true when the
  // report is consumed here (bad source, stale, duplicate, or blocked
  // by nodown); returns false to pass it on to prepare_failure().
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, has a stale address, or is itself down --
      // send it a map update so it can catch up
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_inst(badboy) != m->get_target()) {
    // reported instance doesn't match our map -- the reporter is stale
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
	    << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down() arm here is redundant -- the "weird?"
  // check above already bailed out for down osds.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  return true;
}
1532
// Completion that acknowledges an MOSDMarkMeDown request by sending an
// MOSDMarkMeDown message back to the requester, with request_ack set to
// false so the reply does not itself solicit another ack.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int) override {
    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
    osdmon->mon->send_reply(
      op,
      new MOSDMarkMeDown(
	m->fsid,
	m->get_target(),
	m->get_epoch(),
	false)); // ACK itself does not request an ack
  }
  ~C_AckMarkedDown() override {
  }
};
1554
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  // Vet an osd's request to be marked down.  Returns true if fully
  // handled here (dropped, possibly with an immediate ack); false to
  // continue to prepare_mark_me_down() for the map update.
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int requesting_down = m->get_target().name.num();
  int from = m->get_orig_source().num();

  // check permissions
  if (check_source(m, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must exist, be up, and match the address in our map
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addr(from) != m->get_target().addr) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(requesting_down))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
  return false;

 reply:
  // the request was dropped; if the sender asked for an ack, complete
  // it immediately since no map change will follow
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
1593
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  // Queue the state change marking the requesting osd down; the ack
  // (if requested) is deferred until the proposal commits.
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int target_osd = m->get_target().name.num();

  // preprocess_mark_me_down() already verified these
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  // setting CEPH_OSD_UP in new_state requests that the up bit be
  // cleared (encode_pending logs this state as DOWN)
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
1609
1610 bool OSDMonitor::can_mark_down(int i)
1611 {
1612 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
1613 dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
1614 << " down" << dendl;
1615 return false;
1616 }
1617
1618 if (osdmap.is_nodown(i)) {
1619 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
1620 << "will not mark it down" << dendl;
1621 return false;
1622 }
1623
1624 int num_osds = osdmap.get_num_osds();
1625 if (num_osds == 0) {
1626 dout(5) << __func__ << " no osds" << dendl;
1627 return false;
1628 }
1629 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
1630 float up_ratio = (float)up / (float)num_osds;
1631 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
1632 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
1633 << g_conf->mon_osd_min_up_ratio
1634 << ", will not mark osd." << i << " down" << dendl;
1635 return false;
1636 }
1637 return true;
1638 }
1639
bool OSDMonitor::can_mark_up(int i)
{
  // Cluster-wide NOUP flag blocks all up-marking.
  if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
    dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
	    << " up" << dendl;
    return false;
  }

  // Per-osd noup setting.
  if (osdmap.is_noup(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as noup, "
	    << "will not mark it up" << dendl;
    return false;
  }

  return true;
}
1656
1657 /**
1658 * @note the parameter @p i apparently only exists here so we can output the
1659 * osd's id on messages.
1660 */
1661 bool OSDMonitor::can_mark_out(int i)
1662 {
1663 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
1664 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
1665 return false;
1666 }
1667
1668 if (osdmap.is_noout(i)) {
1669 dout(5) << __func__ << " osd." << i << " is marked as noout, "
1670 << "will not mark it out" << dendl;
1671 return false;
1672 }
1673
1674 int num_osds = osdmap.get_num_osds();
1675 if (num_osds == 0) {
1676 dout(5) << __func__ << " no osds" << dendl;
1677 return false;
1678 }
1679 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
1680 float in_ratio = (float)in / (float)num_osds;
1681 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
1682 if (i >= 0)
1683 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1684 << g_conf->mon_osd_min_in_ratio
1685 << ", will not mark osd." << i << " out" << dendl;
1686 else
1687 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1688 << g_conf->mon_osd_min_in_ratio
1689 << ", will not mark osds out" << dendl;
1690 return false;
1691 }
1692
1693 return true;
1694 }
1695
bool OSDMonitor::can_mark_in(int i)
{
  // Cluster-wide NOIN flag blocks all in-marking.
  if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
    dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
	    << " in" << dendl;
    return false;
  }

  // Per-osd noin setting.
  if (osdmap.is_noin(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as noin, "
	    << "will not mark it in" << dendl;
    return false;
  }

  return true;
}
1712
1713 bool OSDMonitor::check_failures(utime_t now)
1714 {
1715 bool found_failure = false;
1716 for (map<int,failure_info_t>::iterator p = failure_info.begin();
1717 p != failure_info.end();
1718 ++p) {
1719 if (can_mark_down(p->first)) {
1720 found_failure |= check_failure(now, p->first, p->second);
1721 }
1722 }
1723 return found_failure;
1724 }
1725
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // Decide whether the accumulated reports against target_osd are
  // enough (reporters from enough distinct subtrees, for longer than
  // the -- possibly laggy-adjusted -- grace period) to queue it as
  // down in pending_inc.  Returns true if the failure is (or already
  // was) queued.

  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // exponential decay constant from the configured halflife
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
	     << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf->mon_osd_adjust_heartbeat_grace) {
      // accumulate each reporter's laggy contribution (averaged below)
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
	   << dendl;

  if (failed_for >= grace &&
      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
	    << " down" << dendl;
    // CEPH_OSD_UP in new_state queues the osd to be marked down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
		      << osdmap.crush->get_full_location_ordered_string(
			target_osd)
		      << ") ("
		      << (int)reporters_by_subtree.size()
		      << " reporters from different "
		      << reporter_subtree_level << " after "
		      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
1813
1814 void OSDMonitor::force_failure(utime_t now, int target_osd, int by)
1815 {
1816 // already pending failure?
1817 if (pending_inc.new_state.count(target_osd) &&
1818 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
1819 dout(10) << " already pending failure" << dendl;
1820 return;
1821 }
1822
1823 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
1824 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1825
1826 mon->clog->info() << "osd." << target_osd << " failed ("
1827 << osdmap.crush->get_full_location_ordered_string(target_osd)
1828 << ") (connection refused reported by osd." << by << ")";
1829 return;
1830 }
1831
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  // Process an MOSDFailure in the update phase: either record (or
  // immediately force) a failure report against the target, or cancel
  // a previous report from this reporter.  Returns true when a state
  // change was queued in pending_inc.
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure " << m->get_target()
	  << " from " << m->get_orig_source_inst()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already filtered stale/mismatched targets
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon->clog->debug() << m->get_target() << " reported immediately failed by "
            << m->get_orig_source_inst();
      force_failure(now, target_osd, reporter);
      return true;
    }
    mon->clog->debug() << m->get_target() << " reported failed by "
		      << m->get_orig_source_inst();

    failure_info_t& fi = failure_info[target_osd];
    // a previous report from the same reporter is superseded; release it
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << m->get_target() << " failure report canceled by "
		       << m->get_orig_source_inst();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
1894
void OSDMonitor::process_failures()
{
  // For each osd that is no longer up in the committed map, drop its
  // failure_info and answer every queued failure report with the
  // latest map.
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // post-increment keeps the iterator valid across the erase
      failure_info.erase(p++);

      while (!ls.empty()) {
	MonOpRequestRef o = ls.front();
	if (o) {
	  o->mark_event(__func__);
	  MOSDFailure *m = o->get_req<MOSDFailure>();
	  send_latest(o, m->get_epoch());
	}
	ls.pop_front();
      }
    }
  }
}
1919
1920 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
1921 {
1922 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
1923
1924 for (map<int,failure_info_t>::iterator p = failure_info.begin();
1925 p != failure_info.end();
1926 ++p) {
1927 p->second.take_report_messages(ls);
1928 }
1929 failure_info.clear();
1930 }
1931
1932
1933 // boot --
1934
// Fast-path ("preprocess") handling of an OSD boot request.  Returns true
// when the message is fully handled here and no map change is needed:
// insufficient caps, wrong cluster fsid, feature/release gating failures,
// duplicate boots, or a noup condition.  Returns false to pass the request
// on to prepare_boot(), which mutates the pending map.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // check if osd has required features to boot
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
	    << m->get_orig_source_inst()
	    << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
	    << m->get_orig_source_inst()
	    << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
	    << m->get_orig_source_inst()
	    << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // release gating: if the map's required release demands a server feature
  // the osd does not announce, it may not join
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_LUMINOUS"
		      << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
      !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_JEWEL"
		      << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
      !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_KRAKEN"
		      << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
    goto ignore;
  }

  // any pool using GMT hitsets requires OSD_HITSET_GMT support from every osd
  if (any_of(osdmap.get_pools().begin(),
	     osdmap.get_pools().end(),
	     [](const std::pair<int64_t,pg_pool_t>& pool)
	     { return pool.second.use_gmt_hitset; })) {
    assert(osdmap.get_num_up_osds() == 0 ||
	   osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
	      << m->get_orig_source_inst()
	      << " doesn't announce support -- ignore" << dendl;
      goto ignore;
    }
  }

  // make sure upgrades stop at luminous
  if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    mon->clog->info() << "disallowing boot of post-luminous OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < luminous";
    goto ignore;
  }

  // make sure upgrades stop at jewel
  if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
      osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
    mon->clog->info() << "disallowing boot of post-jewel OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < jewel";
    goto ignore;
  }

  // make sure upgrades stop at hammer
  //  * HAMMER_0_94_4 is the required hammer feature
  //  * MON_METADATA is the first post-hammer feature
  if (osdmap.get_num_up_osds() > 0) {
    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
	!(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
      mon->clog->info() << "disallowing boot of post-hammer OSD "
			<< m->get_orig_source_inst()
			<< " because one or more up OSDs is pre-hammer v0.94.4";
      goto ignore;
    }
    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
			<< m->get_orig_source_inst()
			<< " because all up OSDs are post-hammer";
      goto ignore;
    }
  }

  // already booted?  duplicate boot from the exact same instance: just ack.
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst() &&
      osdmap.get_cluster_addr(from) == m->cluster_addr) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
	    << " == " << osdmap.get_inst(from) << dendl;
    _booted(op, false);
    return true;
  }

  // uuid clash: a different osd is reusing this id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // boot message predates our last up_from for this instance: it is stale,
  // just send the osd the maps it is missing
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2124
// Slow-path handling of an OSD boot: record everything needed to mark the
// osd up in the pending incremental (addresses, weight, uuid, metadata,
// clean intervals, laggy stats, features, and possibly its "in" weight),
// then reply once the proposal commits.  preprocess_boot() has already
// validated caps, fsid and feature requirements.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
	  << " cluster_addr " << m->cluster_addr
	  << " hb_back_addr " << m->hb_back_addr
	  << " hb_front_addr " << m->hb_front_addr
	  << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once the pending increment applies (new_state is an
  // xor mask over the committed state)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down "
	    << osdmap.get_inst(from) << dendl;
    // preprocess should have caught these; if not, assert.
    assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
	   osdmap.get_cluster_addr(from) != m->cluster_addr);
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down (xor mask: flipping the UP bit clears it)
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot once the down-marking has committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    down_pending_out.erase(from); // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy estimates: boot_epoch == 0 means the osd had not been
    // up before, so just decay the stats; otherwise fold the observed
    // down interval into the exponentially weighted averages.
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf->mon_osd_laggy_max_interval &&
	    (interval > g_conf->mon_osd_laggy_max_interval)) {
          interval =  g_conf->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
2278
2279 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2280 {
2281 op->mark_osdmon_event(__func__);
2282 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2283 dout(7) << "_booted " << m->get_orig_source_inst()
2284 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2285
2286 if (logit) {
2287 mon->clog->info() << m->get_orig_source_inst() << " boot";
2288 }
2289
2290 send_latest(op, m->sb.current_epoch+1);
2291 }
2292
2293
2294 // -------------
2295 // full
2296
// Fast-path handling of an MOSDFull message, by which an osd asserts its
// own full/backfillfull/nearfull state.  Returns true when no map update
// is needed (bad caps, nonexistent or stale sender, or the requested state
// bits already match); false to hand off to prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
      (osdmap.is_up(from) &&
       osdmap.get_inst(from) != m->get_orig_source_inst())) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to do if the committed state already matches the request;
  // just send the osd the current map
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2346
// Slow-path handling of MOSDFull: fold the requested full/backfillfull/
// nearfull bits into pending_inc.new_state (an xor mask against the
// committed state) and reply with a map once the proposal commits.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective state = committed state xor any already-pending change
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending changes to these bits before recomputing the xor
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2384
2385 // -------------
2386 // alive
2387
// Fast-path handling of MOSDAlive (an osd asking for its up_thru to be
// bumped).  Returns true when no map change is needed (bad caps, stale or
// down sender, or up_thru already high enough); false to let
// prepare_alive() record the new up_thru in the pending map.
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // duplicate request: up_thru is already at or past what the osd wants
  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2424
2425 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2426 {
2427 op->mark_osdmon_event(__func__);
2428 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2429 int from = m->get_orig_source().num();
2430
2431 if (0) { // we probably don't care much about these
2432 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2433 }
2434
2435 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2436 << " from " << m->get_orig_source_inst() << dendl;
2437
2438 update_up_thru(from, m->version); // set to the latest map the OSD has
2439 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2440 return true;
2441 }
2442
2443 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2444 {
2445 op->mark_osdmon_event(__func__);
2446 dout(7) << "_reply_map " << e
2447 << " from " << op->get_req()->get_orig_source_inst()
2448 << dendl;
2449 send_latest(op, e);
2450 }
2451
2452 // pg_created
2453 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
2454 {
2455 op->mark_osdmon_event(__func__);
2456 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2457 dout(10) << __func__ << " " << *m << dendl;
2458 auto session = m->get_session();
2459 if (!session) {
2460 dout(10) << __func__ << ": no monitor session!" << dendl;
2461 return true;
2462 }
2463 if (!session->is_capable("osd", MON_CAP_X)) {
2464 derr << __func__ << " received from entity "
2465 << "with insufficient privileges " << session->caps << dendl;
2466 return true;
2467 }
2468 // always forward the "created!" to the leader
2469 return false;
2470 }
2471
2472 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
2473 {
2474 op->mark_osdmon_event(__func__);
2475 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2476 dout(10) << __func__ << " " << *m << dendl;
2477 auto src = m->get_orig_source();
2478 auto from = src.num();
2479 if (!src.is_osd() ||
2480 !mon->osdmon()->osdmap.is_up(from) ||
2481 m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
2482 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
2483 return false;
2484 }
2485 pending_created_pgs.push_back(m->pgid);
2486 return true;
2487 }
2488
2489 // -------------
2490 // pg_temp changes
2491
// Fast-path handling of an MOSDPGTemp message.  Returns true when the
// message can be answered (or dropped) without changing the map: bad caps,
// a down/stale sender, or pg_temp entries that are all either already in
// place or no longer relevant.  Returns false to let prepare_pgtemp()
// record the new mappings in the pending incremental.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every surviving entry is already in effect: just ack with a map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
2578
2579 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
2580 {
2581 epoch_t old_up_thru = osdmap.get_up_thru(from);
2582 auto ut = pending_inc.new_up_thru.find(from);
2583 if (ut != pending_inc.new_up_thru.end()) {
2584 old_up_thru = ut->second;
2585 }
2586 if (up_thru > old_up_thru) {
2587 // set up_thru too, so the osd doesn't have to ask again
2588 pending_inc.new_up_thru[from] = up_thru;
2589 }
2590 }
2591
// Slow-path handling of MOSDPGTemp: record each requested pg_temp mapping
// in the pending incremental (skipping pools that are gone or pending
// removal), clear any primary_temp for those pgs, bump the sender's
// up_thru, and reply with a map once the proposal commits.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
2627
2628
2629 // ---
2630
// Fast-path handling of MRemoveSnaps.  Returns true when there is nothing
// to do (bad caps, or every listed snap is already recorded as removed in
// its pool); false as soon as one snap still needs to be recorded by
// prepare_remove_snaps().
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	g_ceph_context,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap newer than the pool's snap_seq, or not yet in
      // removed_snaps, still requires a map change
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
2671
// Slow-path handling of MRemoveSnaps: add each not-yet-removed snap to the
// pool's removed_snaps in the pending incremental, advancing the pool's
// snap_seq and snap_epoch as needed.  No reply is sent; osds learn of the
// removal from the resulting map.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // skip snaps already removed in either the committed or the
      // pending version of the pool
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }
  return true;
}
2708
2709 // osd beacon
2710 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
2711 {
2712 op->mark_osdmon_event(__func__);
2713 auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2714 // check caps
2715 auto session = beacon->get_session();
2716 if (!session) {
2717 dout(10) << __func__ << " no monitor session!" << dendl;
2718 return true;
2719 }
2720 if (!session->is_capable("osd", MON_CAP_X)) {
2721 derr << __func__ << " received from entity "
2722 << "with insufficient privileges " << session->caps << dendl;
2723 return true;
2724 }
2725 // Always forward the beacon to the leader, even if they are the same as
2726 // the old one. The leader will mark as down osds that haven't sent
2727 // beacon for a few minutes.
2728 return false;
2729 }
2730
2731 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
2732 {
2733 op->mark_osdmon_event(__func__);
2734 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2735 const auto src = beacon->get_orig_source();
2736 dout(10) << __func__ << " " << *beacon
2737 << " from " << src << dendl;
2738 int from = src.num();
2739
2740 if (!src.is_osd() ||
2741 !osdmap.is_up(from) ||
2742 beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
2743 dout(1) << " ignoring beacon from non-active osd." << dendl;
2744 return false;
2745 }
2746
2747 last_osd_report[from] = ceph_clock_now();
2748 osd_epochs[from] = beacon->version;
2749
2750 for (const auto& pg : beacon->pgs) {
2751 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
2752 }
2753 return false;
2754 }
2755
2756 // ---------------
2757 // map helpers
2758
2759 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
2760 {
2761 op->mark_osdmon_event(__func__);
2762 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
2763 << " start " << start << dendl;
2764 if (start == 0)
2765 send_full(op);
2766 else
2767 send_incremental(op, start);
2768 }
2769
2770
2771 MOSDMap *OSDMonitor::build_latest_full()
2772 {
2773 MOSDMap *r = new MOSDMap(mon->monmap->fsid);
2774 get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
2775 r->oldest_map = get_first_committed();
2776 r->newest_map = osdmap.get_epoch();
2777 return r;
2778 }
2779
2780 MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
2781 {
2782 dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
2783 MOSDMap *m = new MOSDMap(mon->monmap->fsid);
2784 m->oldest_map = get_first_committed();
2785 m->newest_map = osdmap.get_epoch();
2786
2787 for (epoch_t e = to; e >= from && e > 0; e--) {
2788 bufferlist bl;
2789 int err = get_version(e, bl);
2790 if (err == 0) {
2791 assert(bl.length());
2792 // if (get_version(e, bl) > 0) {
2793 dout(20) << "build_incremental inc " << e << " "
2794 << bl.length() << " bytes" << dendl;
2795 m->incremental_maps[e] = bl;
2796 } else {
2797 assert(err == -ENOENT);
2798 assert(!bl.length());
2799 get_version_full(e, bl);
2800 if (bl.length() > 0) {
2801 //else if (get_version("full", e, bl) > 0) {
2802 dout(20) << "build_incremental full " << e << " "
2803 << bl.length() << " bytes" << dendl;
2804 m->maps[e] = bl;
2805 } else {
2806 ceph_abort(); // we should have all maps.
2807 }
2808 }
2809 }
2810 return m;
2811 }
2812
2813 void OSDMonitor::send_full(MonOpRequestRef op)
2814 {
2815 op->mark_osdmon_event(__func__);
2816 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
2817 mon->send_reply(op, build_latest_full());
2818 }
2819
// Reply to an op with incremental maps starting at 'first'.  If the op was
// proxied through another monitor that understands MON_ROUTE_OSDMAP, ask
// that monitor to do the sending (it is closer to the client); otherwise
// send the maps ourselves via the 4-argument overload.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  assert(s);

  if (s->proxy_con &&
      s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
2841
// Send maps (first..latest] to a session, seeding with a full map when
// 'first' has already been trimmed from the store.  With 'req' set, reply
// through the op and send only a single batch (the client re-requests for
// more).  With 'onetime' set, send a single batch over the session
// connection.  Otherwise keep sending batches of at most
// osd_map_message_max epochs until the session is caught up.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->inst << dendl;

  // don't resend epochs the session is known to already have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->inst << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // the requested start has been trimmed; seed the peer with our oldest
    // full map, then continue with incrementals from there
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;

    if (req) {
      // a reply consumes the op, so stop after the full map
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
		       osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // req and onetime both mean: one batch only
    if (onetime || req)
      break;
  }
}
2900
2901 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
2902 {
2903 if (inc_osd_cache.lookup(ver, &bl)) {
2904 return 0;
2905 }
2906 int ret = PaxosService::get_version(ver, bl);
2907 if (!ret) {
2908 inc_osd_cache.add(ver, bl);
2909 }
2910 return ret;
2911 }
2912
2913 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
2914 {
2915 if (full_osd_cache.lookup(ver, &bl)) {
2916 return 0;
2917 }
2918 int ret = PaxosService::get_version_full(ver, bl);
2919 if (!ret) {
2920 full_osd_cache.add(ver, bl);
2921 }
2922 return ret;
2923 }
2924
// Queue a blacklist entry for address `a`, expiring at `until`, in the
// pending incremental.  Returns the epoch of the pending map, i.e. the
// epoch in which the entry takes effect once that map is committed.
epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
{
  dout(10) << "blacklist " << a << " until " << until << dendl;
  pending_inc.new_blacklist[a] = until;
  return pending_inc.epoch;
}
2931
2932
2933 void OSDMonitor::check_osdmap_subs()
2934 {
2935 dout(10) << __func__ << dendl;
2936 if (!osdmap.get_epoch()) {
2937 return;
2938 }
2939 auto osdmap_subs = mon->session_map.subs.find("osdmap");
2940 if (osdmap_subs == mon->session_map.subs.end()) {
2941 return;
2942 }
2943 auto p = osdmap_subs->second->begin();
2944 while (!p.end()) {
2945 auto sub = *p;
2946 ++p;
2947 check_osdmap_sub(sub);
2948 }
2949 }
2950
// Service a single "osdmap" subscription: send the subscriber whatever it
// is missing, then either drop the sub (onetime) or advance its next epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
           << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      // subscriber has some history; send incrementals from sub->next on
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      // next == 0 means "just send me the latest full map"
      sub->session->con->send_message(build_latest_full());
    if (sub->onetime)
      mon->session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
2966
// Notify every OSD subscribed to "osd_pg_creates" of its pending pg
// creations.  Only relevant from luminous on; before that PGMonitor
// drives pg creation.
void OSDMonitor::check_pg_creates_subs()
{
  if (!mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    // PGMonitor takes care of this in pre-luminous era.
    return;
  }
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  // all up OSDs must support stateful subscriptions at this point
  assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
        return;
      }
      for (auto sub : *pg_creates_subs->second) {
        check_pg_creates_sub(sub);
      }
    });
}
2988
// Service one "osd_pg_creates" subscription by sending any queued
// pg-create messages to the subscribing OSD and advancing its cursor.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->inst << dendl;
  assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up. we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->inst.name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
    sub->next = send_pg_creates(sub->session->inst.name.num(),
                                sub->session->con.get(),
                                sub->next);
  }
}
3002
/*
 * Scan `pools` for pools whose pgs still need to be created and queue them
 * on `creating_pgs`.
 *
 * @param pools          pools from the osdmap to scan
 * @param removed_pools  pools queued for removal; these are skipped
 * @param modified       timestamp recorded as the pg creation time
 * @param creating_pgs   [out] pending-creates bookkeeping to update
 * @return the number of pools newly queued for pg creation
 */
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    const pg_pool_t& pool = p.second;
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
                                         pool.get_type(), pool.get_size());
    // skip pools whose crush rule cannot map pgs to osds
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      // pool unchanged since the previous scan; already handled
      dout(10) << __func__ << " no change in pool " << poolid
               << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
               << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
             << " " << pool << dendl;
    if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
                                  created, modified)) {
      queued++;
    }
  }
  return queued;
}
3039
// Recompute creating_pgs_by_osd_epoch from creating_pgs using the current
// pg-to-osd mapping: bucket each pending pg under the acting primary that
// should receive its create message, keyed by the epoch the message should
// carry.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
           << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    auto mapped = pg.second.first;  // epoch the pg's create was first queued
    dout(20) << __func__ << " looking up " << pgid << dendl;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
        if (pgs.second.count(pgid)) {
          if (last_acting_primary == acting_primary) {
            // same target as before: keep the epoch we already advertised
            mapped = pgs.first;
          } else {
            dout(20) << __func__ << " " << pgid << " "
                     << " acting_primary:" << last_acting_primary
                     << " -> " << acting_primary << dendl;
            // note epoch if the target of the create message changed.
            mapped = mapping.get_epoch();
          }
          break;
        } else {
          // newly creating
          mapped = mapping.get_epoch();
        }
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
             << " to create " << pgid << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3081
/*
 * Send pending pg-create messages for osd `osd` over connection `con`,
 * covering epochs from `next` onward.
 *
 * @return one past the last epoch covered (the epoch the caller's
 *         subscription cursor should advance to), or `next` unchanged if
 *         there was nothing to send.
 */
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
           << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *m = nullptr;
  epoch_t last = 0;
  // batch every pending create from epoch `next` onward into one message
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // lazily allocate the message only once we know there is work
      if (!m)
        m = new MOSDPGCreate(creating_pgs_epoch);
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      const auto& creation = creating_pgs.pgs[pg];
      m->mkpg.emplace(pg, pg_create_t{creation.first, pg, 0});
      m->ctimes.emplace(pg, creation.second);
      dout(20) << __func__ << " will create " << pg
               << " at " << creation.first << dendl;
    }
  }
  if (!m) {
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " has nothing to send" << dendl;
    return next;
  }
  con->send_message(m);
  // sub is current through last + 1
  return last + 1;
}
3122
3123 // TICK
3124
3125
/*
 * Periodic housekeeping for the osdmap service (leader only):
 *  - mark unresponsive osds down (beacon timeouts, accumulated failure
 *    reports)
 *  - automatically mark long-down osds out, subject to grace scaling and
 *    the subtree limit
 *  - expire blacklist entries
 *  - maintain the legacy FULL/NEARFULL map flags (pre-luminous only)
 * Proposes a new map epoch if anything changed.
 */
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // beacon-based timeouts are only tracked from luminous on
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // elapsed time since the osd went down
      ++i;

      if (osdmap.is_down(o) &&
          osdmap.is_in(o) &&
          can_mark_out(o)) {
        utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
        utime_t grace = orig_grace;
        double my_grace = 0.0;

        if (g_conf->mon_osd_adjust_down_out_interval) {
          // scale grace period the same way we do the heartbeat grace.
          const osd_xinfo_t& xi = osdmap.get_xinfo(o);
          double halflife = (double)g_conf->mon_osd_laggy_halflife;
          double decay_k = ::log(.5) / halflife;
          double decay = exp((double)down * decay_k);
          dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
                   << " down for " << down << " decay " << decay << dendl;
          my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
          grace += my_grace;
        }

        // is this an entire large subtree down?
        if (g_conf->mon_osd_down_out_subtree_limit.length()) {
          int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
          if (type > 0) {
            if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
              dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
                       << " subtree for osd." << o << " is down; resetting timer" << dendl;
              // reset timer, too.
              down_pending_out[o] = now;
              continue;
            }
          }
        }

        if (g_conf->mon_osd_down_out_interval > 0 &&
            down.sec() >= grace) {
          dout(10) << "tick marking osd." << o << " OUT after " << down
                   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
          pending_inc.new_weight[o] = CEPH_OSD_OUT;

          // set the AUTOOUT bit.
          if (pending_inc.new_state.count(o) == 0)
            pending_inc.new_state[o] = 0;
          pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

          // remember previous weight
          if (pending_inc.new_xinfo.count(o) == 0)
            pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
          pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

          do_propose = true;

          mon->clog->info() << "osd." << o << " out (down for " << down << ")";
        } else
          continue;
      }

      // osd no longer eligible (marked out above, or back up/out already)
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
      mon->pgservice->is_readable()) {
    // for pre-luminous compat only!
    if (mon->pgservice->have_full_osds()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (mon->pgservice->have_nearfull_osds()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    if (pending_inc.new_flags != -1 &&
        (pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
        (pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
        (pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
              << " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3275
/*
 * Mark down any up osd that has not reported (sent a beacon) within
 * mon_osd_report_timeout seconds.
 *
 * @param now              current time
 * @param last_osd_report  per-osd time of last beacon; osds seen here for
 *                         the first time get an entry added (timer start)
 * @return true if at least one osd was marked down (caller should propose)
 */
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
                                     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
        mon->clog->info() << "osd." << i << " marked down after no beacon for "
                          << diff << " seconds";
        derr << "no beacon from osd." << i << " since " << t->second
             << ", " << diff << " seconds ago. marking down" << dendl;
        // NOTE(review): new_state appears to act as an xor mask, so setting
        // the CEPH_OSD_UP bit here toggles the osd from up to down —
        // confirm against OSDMap::Incremental semantics.
        pending_inc.new_state[i] = CEPH_OSD_UP;
        new_down = true;
      }
    }
  }
  return new_down;
}
3310
/*
 * Populate health `summary` (and, when non-null, `detail`) entries for the
 * osd map: down osds and down crush subtrees, full/backfillfull/nearfull
 * conditions, stray crush entries, warning flags, legacy tunables, cache
 * pools without hit_sets, upgrade-flag reminders, and pool health.
 */
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
                            list<pair<health_status_t,string> > *detail,
                            CephContext *cct) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    set<int> osds;  // osds present in crush but not in the osdmap
    set<int> down_in_osds;
    set<int> up_in_osds;
    set<int> subtree_up;
    unordered_map<int, set<int> > subtree_type_down;
    unordered_map<int, int> num_osds_subtree;
    int max_type = osdmap.crush->get_max_type_id();

    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i)) {
        if (osdmap.crush->item_exists(i)) {
          osds.insert(i);
        }
        continue;
      }
      if (osdmap.is_out(i))
        continue;
      ++num_in_osds;
      if (down_in_osds.count(i) || up_in_osds.count(i))
        continue;
      if (!osdmap.is_up(i)) {
        down_in_osds.insert(i);
        int parent_id = 0;
        int current = i;
        // walk up the crush tree, recording each ancestor whose entire
        // subtree is down
        for (int type = 0; type <= max_type; type++) {
          if (!osdmap.crush->get_type_name(type))
            continue;
          int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
          if (r == -ENOENT)
            break;
          // break early if this parent is already marked as up
          if (subtree_up.count(parent_id))
            break;
          type = osdmap.crush->get_bucket_type(parent_id);
          if (!osdmap.subtree_type_is_down(
                g_ceph_context, parent_id, type,
                &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
            break;
          current = parent_id;
        }
      }
    }

    // calculate the number of down osds in each down subtree and
    // store it in num_osds_subtree
    for (int type = 1; type <= max_type; type++) {
      if (!osdmap.crush->get_type_name(type))
        continue;
      for (auto j = subtree_type_down[type].begin();
           j != subtree_type_down[type].end();
           ++j) {
        if (type == 1) {
          // lowest bucket level: children are osds, count them directly
          list<int> children;
          int num = osdmap.crush->get_children(*j, &children);
          num_osds_subtree[*j] = num;
        } else {
          // higher levels: sum the already-computed child subtree counts
          list<int> children;
          int num = 0;
          int num_children = osdmap.crush->get_children(*j, &children);
          if (num_children == 0)
            continue;
          for (auto l = children.begin(); l != children.end(); ++l) {
            if (num_osds_subtree[*l] > 0) {
              num = num + num_osds_subtree[*l];
            }
          }
          num_osds_subtree[*j] = num;
        }
      }
    }
    num_down_in_osds = down_in_osds.size();
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      // summary of down subtree types and osds
      for (int type = max_type; type > 0; type--) {
        if (!osdmap.crush->get_type_name(type))
          continue;
        if (subtree_type_down[type].size() > 0) {
          ostringstream ss;
          ss << subtree_type_down[type].size() << " "
             << osdmap.crush->get_type_name(type);
          if (subtree_type_down[type].size() > 1) {
            ss << "s";
          }
          int sum_down_osds = 0;
          for (auto j = subtree_type_down[type].begin();
               j != subtree_type_down[type].end();
               ++j) {
            sum_down_osds = sum_down_osds + num_osds_subtree[*j];
          }
          ss << " (" << sum_down_osds << " osds) down";
          summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }
      ostringstream ss;
      ss << down_in_osds.size() << " osds down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));

      if (detail) {
        // details of down subtree types
        for (int type = max_type; type > 0; type--) {
          if (!osdmap.crush->get_type_name(type))
            continue;
          for (auto j = subtree_type_down[type].rbegin();
               j != subtree_type_down[type].rend();
               ++j) {
            ostringstream ss;
            ss << osdmap.crush->get_type_name(type);
            ss << " ";
            ss << osdmap.crush->get_item_name(*j);
            // at the top level, do not print location
            if (type != max_type) {
              ss << " (";
              ss << osdmap.crush->get_full_location_ordered_string(*j);
              ss << ")";
            }
            int num = num_osds_subtree[*j];
            ss << " (" << num << " osds)";
            ss << " is down";
            detail->push_back(make_pair(HEALTH_WARN, ss.str()));
          }
        }
        // details of down osds
        for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
          ostringstream ss;
          ss << "osd." << *it << " (";
          ss << osdmap.crush->get_full_location_ordered_string(*it);
          ss << ") is down";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }
    }

    if (!osds.empty()) {
      ostringstream ss;
      ss << osds.size() << " osds exist in the crush map but not in the osdmap";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
        ss << " (osds: " << osds << ")";
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // An osd could configure failsafe ratio, to something different
      // but for now assume it is the same here.
      float fsr = g_conf->osd_failsafe_full_ratio;
      if (fsr > 1.0) fsr /= 100;  // the ratio may be expressed as a percentage
      float fr = osdmap.get_full_ratio();
      float br = osdmap.get_backfillfull_ratio();
      float nr = osdmap.get_nearfull_ratio();

      bool out_of_order = false;
      // These checks correspond to how OSDService::check_full_status() in an OSD
      // handles the improper setting of these values.
      if (br < nr) {
        out_of_order = true;
        if (detail) {
          ostringstream ss;
          ss << "backfillfull_ratio (" << br << ") < nearfull_ratio (" << nr << "), increased";
          detail->push_back(make_pair(HEALTH_ERR, ss.str()));
        }
        br = nr;
      }
      if (fr < br) {
        out_of_order = true;
        if (detail) {
          ostringstream ss;
          ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br << "), increased";
          detail->push_back(make_pair(HEALTH_ERR, ss.str()));
        }
        fr = br;
      }
      if (fsr < fr) {
        out_of_order = true;
        if (detail) {
          ostringstream ss;
          ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr << "), increased";
          detail->push_back(make_pair(HEALTH_ERR, ss.str()));
        }
      }
      if (out_of_order) {
        ostringstream ss;
        ss << "Full ratio(s) out of order";
        summary.push_back(make_pair(HEALTH_ERR, ss.str()));
      }

      set<int> full, backfillfull, nearfull;
      osdmap.get_full_osd_counts(&full, &backfillfull, &nearfull);
      if (full.size()) {
        ostringstream ss;
        ss << full.size() << " full osd(s)";
        summary.push_back(make_pair(HEALTH_ERR, ss.str()));
      }
      if (backfillfull.size()) {
        ostringstream ss;
        ss << backfillfull.size() << " backfillfull osd(s)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
      if (nearfull.size()) {
        ostringstream ss;
        ss << nearfull.size() << " nearfull osd(s)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
      if (detail) {
        for (auto& i: full) {
          ostringstream ss;
          ss << "osd." << i << " is full";
          detail->push_back(make_pair(HEALTH_ERR, ss.str()));
        }
        for (auto& i: backfillfull) {
          ostringstream ss;
          ss << "osd." << i << " is backfill full";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
        for (auto& i: nearfull) {
          ostringstream ss;
          ss << "osd." << i << " is near full";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }

      // warn if there is any noup osds.
      vector<int> noup_osds;
      osdmap.get_noup_osds(&noup_osds);
      if (noup_osds.size()) {
        ostringstream ss;
        ss << noup_osds.size() << " noup osd(s)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ss << ": " << noup_osds;
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }

      // warn if there is any nodown osds.
      vector<int> nodown_osds;
      osdmap.get_nodown_osds(&nodown_osds);
      if (nodown_osds.size()) {
        ostringstream ss;
        ss << nodown_osds.size() << " nodown osd(s)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ss << ": " << nodown_osds;
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }

      // warn if there is any noin osds.
      vector<int> noin_osds;
      osdmap.get_noin_osds(&noin_osds);
      if (noin_osds.size()) {
        ostringstream ss;
        ss << noin_osds.size() << " noin osd(s)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ss << ": " << noin_osds;
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }

      // warn if there is any noout osds.
      vector<int> noout_osds;
      osdmap.get_noout_osds(&noout_osds);
      if (noout_osds.size()) {
        ostringstream ss;
        ss << noout_osds.size() << " noout osd(s)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ss << ": " << noout_osds;
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }
    }
    // note: we leave it to ceph-mgr to generate details health warnings
    // with actual osd utilizations

    // warn about flags
    uint64_t warn_flags =
      CEPH_OSDMAP_FULL |
      CEPH_OSDMAP_PAUSERD |
      CEPH_OSDMAP_PAUSEWR |
      CEPH_OSDMAP_PAUSEREC |
      CEPH_OSDMAP_NOUP |
      CEPH_OSDMAP_NODOWN |
      CEPH_OSDMAP_NOIN |
      CEPH_OSDMAP_NOOUT |
      CEPH_OSDMAP_NOBACKFILL |
      CEPH_OSDMAP_NORECOVER |
      CEPH_OSDMAP_NOSCRUB |
      CEPH_OSDMAP_NODEEP_SCRUB |
      CEPH_OSDMAP_NOTIERAGENT |
      CEPH_OSDMAP_NOREBALANCE;
    if (osdmap.test_flag(warn_flags)) {
      ostringstream ss;
      ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
         << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      string min = osdmap.crush->get_min_required_version();
      if (min < g_conf->mon_crush_min_required_version) {
        ostringstream ss;
        ss << "crush map has legacy tunables (require " << min
           << ", min is " << g_conf->mon_crush_min_required_version << ")";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }
    }
    if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
      if (osdmap.crush->get_straw_calc_version() == 0) {
        ostringstream ss;
        ss << "crush map has straw_calc_version=0";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }
    }

    // hit_set-less cache_mode?
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
           p != osdmap.pools.end();
           ++p) {
        const pg_pool_t& info = p->second;
        if (info.cache_mode_requires_hit_set() &&
            info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
          ++problem_cache_pools;
          if (detail) {
            ostringstream ss;
            ss << "pool '" << osdmap.get_pool_name(p->first)
               << "' with cache_mode " << info.get_cache_mode_name()
               << " needs hit_set_type to be set but it is not";
            detail->push_back(make_pair(HEALTH_WARN, ss.str()));
          }
        }
      }
      if (problem_cache_pools) {
        ostringstream ss;
        ss << problem_cache_pools << " cache pools are missing hit_sets";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    if (osdmap.crush->has_multirule_rulesets()) {
      ostringstream ss;
      ss << "CRUSH map contains multirule rulesets";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
        ss << "; please manually fix the map";
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // Not using 'sortbitwise' and should be?
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
        (osdmap.get_up_osd_features() &
         CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
      ostringstream ss;
      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // Warn if 'mon_osd_down_out_interval' is set to zero.
    // Having this option set to zero on the leader acts much like the
    // 'noout' flag. It's hard to figure out what's going wrong with clusters
    // without the 'noout' flag set but acting like that just the same, so
    // we report a HEALTH_WARN in case this option is set to zero.
    // This is an ugly hack to get the warning out, but until we find a way
    // to spread global options throughout the mon cluster and have all mons
    // using a base set of the same options, we need to work around this sort
    // of things.
    // There's also the obvious drawback that if this is set on a single
    // monitor on a 3-monitor cluster, this warning will only be shown every
    // third monitor connection.
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
        g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
        ss << "; this has the same effect as the 'noout' flag";
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // warn about upgrade flags that can be set but are not.
    if (g_conf->mon_debug_no_require_luminous) {
      // ignore these checks
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
               osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      string msg = "all OSDs are running luminous or later but"
        " require_osd_release < luminous";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
        detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
               osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
      string msg = "all OSDs are running kraken or later but"
        " require_osd_release < kraken";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
        detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
               osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
      string msg = "all OSDs are running jewel or later but"
        " require_osd_release < jewel";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
        detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    }

    get_pools_health(summary, detail);
  }
}
3750
3751 void OSDMonitor::dump_info(Formatter *f)
3752 {
3753 f->open_object_section("osdmap");
3754 osdmap.dump(f);
3755 f->close_section();
3756
3757 f->open_array_section("osd_metadata");
3758 for (int i=0; i<osdmap.get_max_osd(); ++i) {
3759 if (osdmap.exists(i)) {
3760 f->open_object_section("osd");
3761 f->dump_unsigned("id", i);
3762 dump_osd_metadata(i, f, NULL);
3763 f->close_section();
3764 }
3765 }
3766 f->close_section();
3767
3768 f->dump_unsigned("osdmap_first_committed", get_first_committed());
3769 f->dump_unsigned("osdmap_last_committed", get_last_committed());
3770
3771 f->open_object_section("crushmap");
3772 osdmap.crush->dump(f);
3773 f->close_section();
3774 }
3775
namespace {
  // keys accepted by "osd pool get"; order must stay stable since it is
  // part of the established enumerator values
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Return the set difference first \ second, i.e. every choice present in
  // `first` that does not also appear in `second`.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
                               const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto& choice : first) {
      if (second.count(choice) == 0)
        result.insert(choice);
    }
    return result;
  }
}
3807
3808
3809 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
3810 {
3811 op->mark_osdmon_event(__func__);
3812 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
3813 int r = 0;
3814 bufferlist rdata;
3815 stringstream ss, ds;
3816
3817 map<string, cmd_vartype> cmdmap;
3818 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
3819 string rs = ss.str();
3820 mon->reply_command(op, -EINVAL, rs, get_last_committed());
3821 return true;
3822 }
3823
3824 MonSession *session = m->get_session();
3825 if (!session) {
3826 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
3827 return true;
3828 }
3829
3830 string prefix;
3831 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
3832
3833 string format;
3834 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
3835 boost::scoped_ptr<Formatter> f(Formatter::create(format));
3836
3837 if (prefix == "osd stat") {
3838 osdmap.print_summary(f.get(), ds);
3839 if (f)
3840 f->flush(rdata);
3841 else
3842 rdata.append(ds);
3843 }
3844 else if (prefix == "osd perf" ||
3845 prefix == "osd blocked-by") {
3846 r = mon->pgservice->process_pg_command(prefix, cmdmap,
3847 osdmap, f.get(), &ss, &rdata);
3848 }
3849 else if (prefix == "osd dump" ||
3850 prefix == "osd tree" ||
3851 prefix == "osd ls" ||
3852 prefix == "osd getmap" ||
3853 prefix == "osd getcrushmap" ||
3854 prefix == "osd ls-tree") {
3855 string val;
3856
3857 epoch_t epoch = 0;
3858 int64_t epochnum;
3859 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
3860 epoch = epochnum;
3861
3862 bufferlist osdmap_bl;
3863 int err = get_version_full(epoch, osdmap_bl);
3864 if (err == -ENOENT) {
3865 r = -ENOENT;
3866 ss << "there is no map for epoch " << epoch;
3867 goto reply;
3868 }
3869 assert(err == 0);
3870 assert(osdmap_bl.length());
3871
3872 OSDMap *p;
3873 if (epoch == osdmap.get_epoch()) {
3874 p = &osdmap;
3875 } else {
3876 p = new OSDMap;
3877 p->decode(osdmap_bl);
3878 }
3879
3880 if (prefix == "osd dump") {
3881 stringstream ds;
3882 if (f) {
3883 f->open_object_section("osdmap");
3884 p->dump(f.get());
3885 f->close_section();
3886 f->flush(ds);
3887 } else {
3888 p->print(ds);
3889 }
3890 rdata.append(ds);
3891 if (!f)
3892 ds << " ";
3893 } else if (prefix == "osd ls") {
3894 if (f) {
3895 f->open_array_section("osds");
3896 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3897 if (osdmap.exists(i)) {
3898 f->dump_int("osd", i);
3899 }
3900 }
3901 f->close_section();
3902 f->flush(ds);
3903 } else {
3904 bool first = true;
3905 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3906 if (osdmap.exists(i)) {
3907 if (!first)
3908 ds << "\n";
3909 first = false;
3910 ds << i;
3911 }
3912 }
3913 }
3914 rdata.append(ds);
3915 } else if (prefix == "osd tree") {
3916 vector<string> states;
3917 cmd_getval(g_ceph_context, cmdmap, "states", states);
3918 unsigned filter = 0;
3919 for (auto& s : states) {
3920 if (s == "up") {
3921 filter |= OSDMap::DUMP_UP;
3922 } else if (s == "down") {
3923 filter |= OSDMap::DUMP_DOWN;
3924 } else if (s == "in") {
3925 filter |= OSDMap::DUMP_IN;
3926 } else if (s == "out") {
3927 filter |= OSDMap::DUMP_OUT;
3928 } else {
3929 ss << "unrecognized state '" << s << "'";
3930 r = -EINVAL;
3931 goto reply;
3932 }
3933 }
3934 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
3935 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT) ||
3936 (filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
3937 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) {
3938 ss << "cannot specify both up and down or both in and out";
3939 r = -EINVAL;
3940 goto reply;
3941 }
3942 if (f) {
3943 f->open_object_section("tree");
3944 p->print_tree(f.get(), NULL, filter);
3945 f->close_section();
3946 f->flush(ds);
3947 } else {
3948 p->print_tree(NULL, &ds, filter);
3949 }
3950 rdata.append(ds);
3951 } else if (prefix == "osd getmap") {
3952 rdata.append(osdmap_bl);
3953 ss << "got osdmap epoch " << p->get_epoch();
3954 } else if (prefix == "osd getcrushmap") {
3955 p->crush->encode(rdata, mon->get_quorum_con_features());
3956 ss << p->get_crush_version();
3957 } else if (prefix == "osd ls-tree") {
3958 string bucket_name;
3959 cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
3960 set<int> osds;
3961 r = p->get_osds_by_bucket_name(bucket_name, &osds);
3962 if (r == -ENOENT) {
3963 ss << "\"" << bucket_name << "\" does not exist";
3964 goto reply;
3965 } else if (r < 0) {
3966 ss << "can not parse bucket name:\"" << bucket_name << "\"";
3967 goto reply;
3968 }
3969
3970 if (f) {
3971 f->open_array_section("osds");
3972 for (auto &i : osds) {
3973 if (osdmap.exists(i)) {
3974 f->dump_int("osd", i);
3975 }
3976 }
3977 f->close_section();
3978 f->flush(ds);
3979 } else {
3980 bool first = true;
3981 for (auto &i : osds) {
3982 if (osdmap.exists(i)) {
3983 if (!first)
3984 ds << "\n";
3985 first = false;
3986 ds << i;
3987 }
3988 }
3989 }
3990
3991 rdata.append(ds);
3992 }
3993 if (p != &osdmap)
3994 delete p;
3995 } else if (prefix == "osd df") {
3996 string method;
3997 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
3998 print_osd_utilization(osdmap, mon->pgservice, ds,
3999 f.get(), method == "tree");
4000 rdata.append(ds);
4001 } else if (prefix == "osd getmaxosd") {
4002 if (f) {
4003 f->open_object_section("getmaxosd");
4004 f->dump_unsigned("epoch", osdmap.get_epoch());
4005 f->dump_int("max_osd", osdmap.get_max_osd());
4006 f->close_section();
4007 f->flush(rdata);
4008 } else {
4009 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4010 rdata.append(ds);
4011 }
4012 } else if (prefix == "osd utilization") {
4013 string out;
4014 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4015 if (f)
4016 f->flush(rdata);
4017 else
4018 rdata.append(out);
4019 r = 0;
4020 goto reply;
4021 } else if (prefix == "osd find") {
4022 int64_t osd;
4023 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4024 ss << "unable to parse osd id value '"
4025 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4026 r = -EINVAL;
4027 goto reply;
4028 }
4029 if (!osdmap.exists(osd)) {
4030 ss << "osd." << osd << " does not exist";
4031 r = -ENOENT;
4032 goto reply;
4033 }
4034 string format;
4035 cmd_getval(g_ceph_context, cmdmap, "format", format);
4036 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4037 f->open_object_section("osd_location");
4038 f->dump_int("osd", osd);
4039 f->dump_stream("ip") << osdmap.get_addr(osd);
4040 f->open_object_section("crush_location");
4041 map<string,string> loc = osdmap.crush->get_full_location(osd);
4042 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4043 f->dump_string(p->first.c_str(), p->second);
4044 f->close_section();
4045 f->close_section();
4046 f->flush(rdata);
4047 } else if (prefix == "osd metadata") {
4048 int64_t osd = -1;
4049 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
4050 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4051 ss << "unable to parse osd id value '"
4052 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4053 r = -EINVAL;
4054 goto reply;
4055 }
4056 if (osd >= 0 && !osdmap.exists(osd)) {
4057 ss << "osd." << osd << " does not exist";
4058 r = -ENOENT;
4059 goto reply;
4060 }
4061 string format;
4062 cmd_getval(g_ceph_context, cmdmap, "format", format);
4063 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4064 if (osd >= 0) {
4065 f->open_object_section("osd_metadata");
4066 f->dump_unsigned("id", osd);
4067 r = dump_osd_metadata(osd, f.get(), &ss);
4068 if (r < 0)
4069 goto reply;
4070 f->close_section();
4071 } else {
4072 r = 0;
4073 f->open_array_section("osd_metadata");
4074 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4075 if (osdmap.exists(i)) {
4076 f->open_object_section("osd");
4077 f->dump_unsigned("id", i);
4078 r = dump_osd_metadata(i, f.get(), NULL);
4079 if (r == -EINVAL || r == -ENOENT) {
4080 // Drop error, continue to get other daemons' metadata
4081 dout(4) << "No metadata for osd." << i << dendl;
4082 r = 0;
4083 } else if (r < 0) {
4084 // Unexpected error
4085 goto reply;
4086 }
4087 f->close_section();
4088 }
4089 }
4090 f->close_section();
4091 }
4092 f->flush(rdata);
4093 } else if (prefix == "osd versions") {
4094 if (!f)
4095 f.reset(Formatter::create("json-pretty"));
4096 count_metadata("ceph_version", f.get());
4097 f->flush(rdata);
4098 r = 0;
4099 } else if (prefix == "osd count-metadata") {
4100 if (!f)
4101 f.reset(Formatter::create("json-pretty"));
4102 string field;
4103 cmd_getval(g_ceph_context, cmdmap, "property", field);
4104 count_metadata(field, f.get());
4105 f->flush(rdata);
4106 r = 0;
4107 } else if (prefix == "osd map") {
4108 string poolstr, objstr, namespacestr;
4109 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4110 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4111 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4112
4113 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4114 if (pool < 0) {
4115 ss << "pool " << poolstr << " does not exist";
4116 r = -ENOENT;
4117 goto reply;
4118 }
4119 object_locator_t oloc(pool, namespacestr);
4120 object_t oid(objstr);
4121 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4122 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4123 vector<int> up, acting;
4124 int up_p, acting_p;
4125 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4126
4127 string fullobjname;
4128 if (!namespacestr.empty())
4129 fullobjname = namespacestr + string("/") + oid.name;
4130 else
4131 fullobjname = oid.name;
4132 if (f) {
4133 f->open_object_section("osd_map");
4134 f->dump_unsigned("epoch", osdmap.get_epoch());
4135 f->dump_string("pool", poolstr);
4136 f->dump_int("pool_id", pool);
4137 f->dump_stream("objname") << fullobjname;
4138 f->dump_stream("raw_pgid") << pgid;
4139 f->dump_stream("pgid") << mpgid;
4140 f->open_array_section("up");
4141 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4142 f->dump_int("osd", *p);
4143 f->close_section();
4144 f->dump_int("up_primary", up_p);
4145 f->open_array_section("acting");
4146 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4147 f->dump_int("osd", *p);
4148 f->close_section();
4149 f->dump_int("acting_primary", acting_p);
4150 f->close_section(); // osd_map
4151 f->flush(rdata);
4152 } else {
4153 ds << "osdmap e" << osdmap.get_epoch()
4154 << " pool '" << poolstr << "' (" << pool << ")"
4155 << " object '" << fullobjname << "' ->"
4156 << " pg " << pgid << " (" << mpgid << ")"
4157 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4158 << pg_vector_string(acting) << ", p" << acting_p << ")";
4159 rdata.append(ds);
4160 }
4161
4162 } else if (prefix == "pg map") {
4163 pg_t pgid;
4164 string pgidstr;
4165 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4166 if (!pgid.parse(pgidstr.c_str())) {
4167 ss << "invalid pgid '" << pgidstr << "'";
4168 r = -EINVAL;
4169 goto reply;
4170 }
4171 vector<int> up, acting;
4172 if (!osdmap.have_pg_pool(pgid.pool())) {
4173 ss << "pg '" << pgidstr << "' does not exist";
4174 r = -ENOENT;
4175 goto reply;
4176 }
4177 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4178 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4179 if (f) {
4180 f->open_object_section("pg_map");
4181 f->dump_unsigned("epoch", osdmap.get_epoch());
4182 f->dump_stream("raw_pgid") << pgid;
4183 f->dump_stream("pgid") << mpgid;
4184 f->open_array_section("up");
4185 for (auto osd : up) {
4186 f->dump_int("up_osd", osd);
4187 }
4188 f->close_section();
4189 f->open_array_section("acting");
4190 for (auto osd : acting) {
4191 f->dump_int("acting_osd", osd);
4192 }
4193 f->close_section();
4194 f->close_section();
4195 f->flush(rdata);
4196 } else {
4197 ds << "osdmap e" << osdmap.get_epoch()
4198 << " pg " << pgid << " (" << mpgid << ")"
4199 << " -> up " << up << " acting " << acting;
4200 rdata.append(ds);
4201 }
4202 goto reply;
4203
4204 } else if ((prefix == "osd scrub" ||
4205 prefix == "osd deep-scrub" ||
4206 prefix == "osd repair")) {
4207 string whostr;
4208 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4209 vector<string> pvec;
4210 get_str_vec(prefix, pvec);
4211
4212 if (whostr == "*") {
4213 ss << "osds ";
4214 int c = 0;
4215 for (int i = 0; i < osdmap.get_max_osd(); i++)
4216 if (osdmap.is_up(i)) {
4217 ss << (c++ ? "," : "") << i;
4218 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4219 pvec.back() == "repair",
4220 pvec.back() == "deep-scrub"),
4221 osdmap.get_inst(i));
4222 }
4223 r = 0;
4224 ss << " instructed to " << pvec.back();
4225 } else {
4226 long osd = parse_osd_id(whostr.c_str(), &ss);
4227 if (osd < 0) {
4228 r = -EINVAL;
4229 } else if (osdmap.is_up(osd)) {
4230 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4231 pvec.back() == "repair",
4232 pvec.back() == "deep-scrub"),
4233 osdmap.get_inst(osd));
4234 ss << "osd." << osd << " instructed to " << pvec.back();
4235 } else {
4236 ss << "osd." << osd << " is not up";
4237 r = -EAGAIN;
4238 }
4239 }
4240 } else if (prefix == "osd lspools") {
4241 int64_t auid;
4242 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4243 if (f)
4244 f->open_array_section("pools");
4245 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4246 p != osdmap.pools.end();
4247 ++p) {
4248 if (!auid || p->second.auid == (uint64_t)auid) {
4249 if (f) {
4250 f->open_object_section("pool");
4251 f->dump_int("poolnum", p->first);
4252 f->dump_string("poolname", osdmap.pool_name[p->first]);
4253 f->close_section();
4254 } else {
4255 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4256 }
4257 }
4258 }
4259 if (f) {
4260 f->close_section();
4261 f->flush(ds);
4262 }
4263 rdata.append(ds);
4264 } else if (prefix == "osd blacklist ls") {
4265 if (f)
4266 f->open_array_section("blacklist");
4267
4268 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4269 p != osdmap.blacklist.end();
4270 ++p) {
4271 if (f) {
4272 f->open_object_section("entry");
4273 f->dump_stream("addr") << p->first;
4274 f->dump_stream("until") << p->second;
4275 f->close_section();
4276 } else {
4277 stringstream ss;
4278 string s;
4279 ss << p->first << " " << p->second;
4280 getline(ss, s);
4281 s += "\n";
4282 rdata.append(s);
4283 }
4284 }
4285 if (f) {
4286 f->close_section();
4287 f->flush(rdata);
4288 }
4289 ss << "listed " << osdmap.blacklist.size() << " entries";
4290
4291 } else if (prefix == "osd pool ls") {
4292 string detail;
4293 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4294 if (!f && detail == "detail") {
4295 ostringstream ss;
4296 osdmap.print_pools(ss);
4297 rdata.append(ss.str());
4298 } else {
4299 if (f)
4300 f->open_array_section("pools");
4301 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4302 it != osdmap.get_pools().end();
4303 ++it) {
4304 if (f) {
4305 if (detail == "detail") {
4306 f->open_object_section("pool");
4307 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4308 it->second.dump(f.get());
4309 f->close_section();
4310 } else {
4311 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4312 }
4313 } else {
4314 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4315 }
4316 }
4317 if (f) {
4318 f->close_section();
4319 f->flush(rdata);
4320 }
4321 }
4322
4323 } else if (prefix == "osd crush get-tunable") {
4324 string tunable;
4325 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4326 ostringstream rss;
4327 if (f)
4328 f->open_object_section("tunable");
4329 if (tunable == "straw_calc_version") {
4330 if (f)
4331 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4332 else
4333 rss << osdmap.crush->get_straw_calc_version() << "\n";
4334 } else {
4335 r = -EINVAL;
4336 goto reply;
4337 }
4338 if (f) {
4339 f->close_section();
4340 f->flush(rdata);
4341 } else {
4342 rdata.append(rss.str());
4343 }
4344 r = 0;
4345
4346 } else if (prefix == "osd pool get") {
4347 string poolstr;
4348 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4349 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4350 if (pool < 0) {
4351 ss << "unrecognized pool '" << poolstr << "'";
4352 r = -ENOENT;
4353 goto reply;
4354 }
4355
4356 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4357 string var;
4358 cmd_getval(g_ceph_context, cmdmap, "var", var);
4359
4360 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4361 const choices_map_t ALL_CHOICES = {
4362 {"size", SIZE},
4363 {"min_size", MIN_SIZE},
4364 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4365 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
4366 {"crush_rule", CRUSH_RULE},
4367 {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
4368 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4369 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4370 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4371 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4372 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4373 {"use_gmt_hitset", USE_GMT_HITSET},
4374 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4375 {"target_max_bytes", TARGET_MAX_BYTES},
4376 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4377 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4378 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4379 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4380 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4381 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4382 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4383 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4384 {"fast_read", FAST_READ},
4385 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4386 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4387 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4388 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4389 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4390 {"recovery_priority", RECOVERY_PRIORITY},
4391 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4392 {"scrub_priority", SCRUB_PRIORITY},
4393 {"compression_mode", COMPRESSION_MODE},
4394 {"compression_algorithm", COMPRESSION_ALGORITHM},
4395 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4396 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4397 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4398 {"csum_type", CSUM_TYPE},
4399 {"csum_max_block", CSUM_MAX_BLOCK},
4400 {"csum_min_block", CSUM_MIN_BLOCK},
4401 };
4402
4403 typedef std::set<osd_pool_get_choices> choices_set_t;
4404
4405 const choices_set_t ONLY_TIER_CHOICES = {
4406 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4407 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4408 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4409 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4410 MIN_READ_RECENCY_FOR_PROMOTE,
4411 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4412 };
4413 const choices_set_t ONLY_ERASURE_CHOICES = {
4414 ERASURE_CODE_PROFILE
4415 };
4416
4417 choices_set_t selected_choices;
4418 if (var == "all") {
4419 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4420 it != ALL_CHOICES.end(); ++it) {
4421 selected_choices.insert(it->second);
4422 }
4423
4424 if(!p->is_tier()) {
4425 selected_choices = subtract_second_from_first(selected_choices,
4426 ONLY_TIER_CHOICES);
4427 }
4428
4429 if(!p->is_erasure()) {
4430 selected_choices = subtract_second_from_first(selected_choices,
4431 ONLY_ERASURE_CHOICES);
4432 }
4433 } else /* var != "all" */ {
4434 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4435 osd_pool_get_choices selected = found->second;
4436
4437 if (!p->is_tier() &&
4438 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4439 ss << "pool '" << poolstr
4440 << "' is not a tier pool: variable not applicable";
4441 r = -EACCES;
4442 goto reply;
4443 }
4444
4445 if (!p->is_erasure() &&
4446 ONLY_ERASURE_CHOICES.find(selected)
4447 != ONLY_ERASURE_CHOICES.end()) {
4448 ss << "pool '" << poolstr
4449 << "' is not a erasure pool: variable not applicable";
4450 r = -EACCES;
4451 goto reply;
4452 }
4453
4454 selected_choices.insert(selected);
4455 }
4456
4457 if (f) {
4458 for(choices_set_t::const_iterator it = selected_choices.begin();
4459 it != selected_choices.end(); ++it) {
4460 choices_map_t::const_iterator i;
4461 f->open_object_section("pool");
4462 f->dump_string("pool", poolstr);
4463 f->dump_int("pool_id", pool);
4464 switch(*it) {
4465 case PG_NUM:
4466 f->dump_int("pg_num", p->get_pg_num());
4467 break;
4468 case PGP_NUM:
4469 f->dump_int("pgp_num", p->get_pgp_num());
4470 break;
4471 case AUID:
4472 f->dump_int("auid", p->get_auid());
4473 break;
4474 case SIZE:
4475 f->dump_int("size", p->get_size());
4476 break;
4477 case MIN_SIZE:
4478 f->dump_int("min_size", p->get_min_size());
4479 break;
4480 case CRASH_REPLAY_INTERVAL:
4481 f->dump_int("crash_replay_interval",
4482 p->get_crash_replay_interval());
4483 break;
4484 case CRUSH_RULE:
4485 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
4486 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
4487 p->get_crush_rule()));
4488 } else {
4489 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
4490 }
4491 break;
4492 case HASHPSPOOL:
4493 case NODELETE:
4494 case NOPGCHANGE:
4495 case NOSIZECHANGE:
4496 case WRITE_FADVISE_DONTNEED:
4497 case NOSCRUB:
4498 case NODEEP_SCRUB:
4499 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4500 if (i->second == *it)
4501 break;
4502 }
4503 assert(i != ALL_CHOICES.end());
4504 f->dump_string(i->first.c_str(),
4505 p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4506 "true" : "false");
4507 break;
4508 case HIT_SET_PERIOD:
4509 f->dump_int("hit_set_period", p->hit_set_period);
4510 break;
4511 case HIT_SET_COUNT:
4512 f->dump_int("hit_set_count", p->hit_set_count);
4513 break;
4514 case HIT_SET_TYPE:
4515 f->dump_string("hit_set_type",
4516 HitSet::get_type_name(p->hit_set_params.get_type()));
4517 break;
4518 case HIT_SET_FPP:
4519 {
4520 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4521 BloomHitSet::Params *bloomp =
4522 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4523 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4524 } else if(var != "all") {
4525 f->close_section();
4526 ss << "hit set is not of type Bloom; " <<
4527 "invalid to get a false positive rate!";
4528 r = -EINVAL;
4529 goto reply;
4530 }
4531 }
4532 break;
4533 case USE_GMT_HITSET:
4534 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4535 break;
4536 case TARGET_MAX_OBJECTS:
4537 f->dump_unsigned("target_max_objects", p->target_max_objects);
4538 break;
4539 case TARGET_MAX_BYTES:
4540 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4541 break;
4542 case CACHE_TARGET_DIRTY_RATIO:
4543 f->dump_unsigned("cache_target_dirty_ratio_micro",
4544 p->cache_target_dirty_ratio_micro);
4545 f->dump_float("cache_target_dirty_ratio",
4546 ((float)p->cache_target_dirty_ratio_micro/1000000));
4547 break;
4548 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4549 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
4550 p->cache_target_dirty_high_ratio_micro);
4551 f->dump_float("cache_target_dirty_high_ratio",
4552 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
4553 break;
4554 case CACHE_TARGET_FULL_RATIO:
4555 f->dump_unsigned("cache_target_full_ratio_micro",
4556 p->cache_target_full_ratio_micro);
4557 f->dump_float("cache_target_full_ratio",
4558 ((float)p->cache_target_full_ratio_micro/1000000));
4559 break;
4560 case CACHE_MIN_FLUSH_AGE:
4561 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
4562 break;
4563 case CACHE_MIN_EVICT_AGE:
4564 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
4565 break;
4566 case ERASURE_CODE_PROFILE:
4567 f->dump_string("erasure_code_profile", p->erasure_code_profile);
4568 break;
4569 case MIN_READ_RECENCY_FOR_PROMOTE:
4570 f->dump_int("min_read_recency_for_promote",
4571 p->min_read_recency_for_promote);
4572 break;
4573 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4574 f->dump_int("min_write_recency_for_promote",
4575 p->min_write_recency_for_promote);
4576 break;
4577 case FAST_READ:
4578 f->dump_int("fast_read", p->fast_read);
4579 break;
4580 case HIT_SET_GRADE_DECAY_RATE:
4581 f->dump_int("hit_set_grade_decay_rate",
4582 p->hit_set_grade_decay_rate);
4583 break;
4584 case HIT_SET_SEARCH_LAST_N:
4585 f->dump_int("hit_set_search_last_n",
4586 p->hit_set_search_last_n);
4587 break;
4588 case SCRUB_MIN_INTERVAL:
4589 case SCRUB_MAX_INTERVAL:
4590 case DEEP_SCRUB_INTERVAL:
4591 case RECOVERY_PRIORITY:
4592 case RECOVERY_OP_PRIORITY:
4593 case SCRUB_PRIORITY:
4594 case COMPRESSION_MODE:
4595 case COMPRESSION_ALGORITHM:
4596 case COMPRESSION_REQUIRED_RATIO:
4597 case COMPRESSION_MAX_BLOB_SIZE:
4598 case COMPRESSION_MIN_BLOB_SIZE:
4599 case CSUM_TYPE:
4600 case CSUM_MAX_BLOCK:
4601 case CSUM_MIN_BLOCK:
4602 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4603 if (i->second == *it)
4604 break;
4605 }
4606 assert(i != ALL_CHOICES.end());
4607 if(*it == CSUM_TYPE) {
4608 int val;
4609 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
4610 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
4611 }
4612 else {
4613 p->opts.dump(i->first, f.get());
4614 }
4615 break;
4616 }
4617 f->close_section();
4618 f->flush(rdata);
4619 }
4620
4621 } else /* !f */ {
4622 for(choices_set_t::const_iterator it = selected_choices.begin();
4623 it != selected_choices.end(); ++it) {
4624 choices_map_t::const_iterator i;
4625 switch(*it) {
4626 case PG_NUM:
4627 ss << "pg_num: " << p->get_pg_num() << "\n";
4628 break;
4629 case PGP_NUM:
4630 ss << "pgp_num: " << p->get_pgp_num() << "\n";
4631 break;
4632 case AUID:
4633 ss << "auid: " << p->get_auid() << "\n";
4634 break;
4635 case SIZE:
4636 ss << "size: " << p->get_size() << "\n";
4637 break;
4638 case MIN_SIZE:
4639 ss << "min_size: " << p->get_min_size() << "\n";
4640 break;
4641 case CRASH_REPLAY_INTERVAL:
4642 ss << "crash_replay_interval: " <<
4643 p->get_crash_replay_interval() << "\n";
4644 break;
4645 case CRUSH_RULE:
4646 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
4647 ss << "crush_rule: " << osdmap.crush->get_rule_name(
4648 p->get_crush_rule()) << "\n";
4649 } else {
4650 ss << "crush_rule: " << p->get_crush_rule() << "\n";
4651 }
4652 break;
4653 case HIT_SET_PERIOD:
4654 ss << "hit_set_period: " << p->hit_set_period << "\n";
4655 break;
4656 case HIT_SET_COUNT:
4657 ss << "hit_set_count: " << p->hit_set_count << "\n";
4658 break;
4659 case HIT_SET_TYPE:
4660 ss << "hit_set_type: " <<
4661 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
4662 break;
4663 case HIT_SET_FPP:
4664 {
4665 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4666 BloomHitSet::Params *bloomp =
4667 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4668 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
4669 } else if(var != "all") {
4670 ss << "hit set is not of type Bloom; " <<
4671 "invalid to get a false positive rate!";
4672 r = -EINVAL;
4673 goto reply;
4674 }
4675 }
4676 break;
4677 case USE_GMT_HITSET:
4678 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
4679 break;
4680 case TARGET_MAX_OBJECTS:
4681 ss << "target_max_objects: " << p->target_max_objects << "\n";
4682 break;
4683 case TARGET_MAX_BYTES:
4684 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
4685 break;
4686 case CACHE_TARGET_DIRTY_RATIO:
4687 ss << "cache_target_dirty_ratio: "
4688 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
4689 break;
4690 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4691 ss << "cache_target_dirty_high_ratio: "
4692 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
4693 break;
4694 case CACHE_TARGET_FULL_RATIO:
4695 ss << "cache_target_full_ratio: "
4696 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
4697 break;
4698 case CACHE_MIN_FLUSH_AGE:
4699 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
4700 break;
4701 case CACHE_MIN_EVICT_AGE:
4702 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
4703 break;
4704 case ERASURE_CODE_PROFILE:
4705 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
4706 break;
4707 case MIN_READ_RECENCY_FOR_PROMOTE:
4708 ss << "min_read_recency_for_promote: " <<
4709 p->min_read_recency_for_promote << "\n";
4710 break;
4711 case HIT_SET_GRADE_DECAY_RATE:
4712 ss << "hit_set_grade_decay_rate: " <<
4713 p->hit_set_grade_decay_rate << "\n";
4714 break;
4715 case HIT_SET_SEARCH_LAST_N:
4716 ss << "hit_set_search_last_n: " <<
4717 p->hit_set_search_last_n << "\n";
4718 break;
4719 case HASHPSPOOL:
4720 case NODELETE:
4721 case NOPGCHANGE:
4722 case NOSIZECHANGE:
4723 case WRITE_FADVISE_DONTNEED:
4724 case NOSCRUB:
4725 case NODEEP_SCRUB:
4726 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4727 if (i->second == *it)
4728 break;
4729 }
4730 assert(i != ALL_CHOICES.end());
4731 ss << i->first << ": " <<
4732 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4733 "true" : "false") << "\n";
4734 break;
4735 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4736 ss << "min_write_recency_for_promote: " <<
4737 p->min_write_recency_for_promote << "\n";
4738 break;
4739 case FAST_READ:
4740 ss << "fast_read: " << p->fast_read << "\n";
4741 break;
4742 case SCRUB_MIN_INTERVAL:
4743 case SCRUB_MAX_INTERVAL:
4744 case DEEP_SCRUB_INTERVAL:
4745 case RECOVERY_PRIORITY:
4746 case RECOVERY_OP_PRIORITY:
4747 case SCRUB_PRIORITY:
4748 case COMPRESSION_MODE:
4749 case COMPRESSION_ALGORITHM:
4750 case COMPRESSION_REQUIRED_RATIO:
4751 case COMPRESSION_MAX_BLOB_SIZE:
4752 case COMPRESSION_MIN_BLOB_SIZE:
4753 case CSUM_TYPE:
4754 case CSUM_MAX_BLOCK:
4755 case CSUM_MIN_BLOCK:
4756 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4757 if (i->second == *it)
4758 break;
4759 }
4760 assert(i != ALL_CHOICES.end());
4761 {
4762 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4763 if (p->opts.is_set(key)) {
4764 if(key == pool_opts_t::CSUM_TYPE) {
4765 int val;
4766 p->opts.get(key, &val);
4767 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
4768 } else {
4769 ss << i->first << ": " << p->opts.get(key) << "\n";
4770 }
4771 }
4772 }
4773 break;
4774 }
4775 rdata.append(ss.str());
4776 ss.str("");
4777 }
4778 }
4779 r = 0;
4780 } else if (prefix == "osd pool stats") {
4781 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4782 osdmap, f.get(), &ss, &rdata);
4783 } else if (prefix == "osd pool get-quota") {
4784 string pool_name;
4785 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
4786
4787 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
4788 if (poolid < 0) {
4789 assert(poolid == -ENOENT);
4790 ss << "unrecognized pool '" << pool_name << "'";
4791 r = -ENOENT;
4792 goto reply;
4793 }
4794 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
4795
4796 if (f) {
4797 f->open_object_section("pool_quotas");
4798 f->dump_string("pool_name", pool_name);
4799 f->dump_unsigned("pool_id", poolid);
4800 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
4801 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
4802 f->close_section();
4803 f->flush(rdata);
4804 } else {
4805 stringstream rs;
4806 rs << "quotas for pool '" << pool_name << "':\n"
4807 << " max objects: ";
4808 if (p->quota_max_objects == 0)
4809 rs << "N/A";
4810 else
4811 rs << si_t(p->quota_max_objects) << " objects";
4812 rs << "\n"
4813 << " max bytes : ";
4814 if (p->quota_max_bytes == 0)
4815 rs << "N/A";
4816 else
4817 rs << si_t(p->quota_max_bytes) << "B";
4818 rdata.append(rs.str());
4819 }
4820 rdata.append("\n");
4821 r = 0;
4822 } else if (prefix == "osd crush rule list" ||
4823 prefix == "osd crush rule ls") {
4824 string format;
4825 cmd_getval(g_ceph_context, cmdmap, "format", format);
4826 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4827 f->open_array_section("rules");
4828 osdmap.crush->list_rules(f.get());
4829 f->close_section();
4830 ostringstream rs;
4831 f->flush(rs);
4832 rs << "\n";
4833 rdata.append(rs.str());
4834 } else if (prefix == "osd crush rule dump") {
4835 string name;
4836 cmd_getval(g_ceph_context, cmdmap, "name", name);
4837 string format;
4838 cmd_getval(g_ceph_context, cmdmap, "format", format);
4839 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4840 if (name == "") {
4841 f->open_array_section("rules");
4842 osdmap.crush->dump_rules(f.get());
4843 f->close_section();
4844 } else {
4845 int ruleno = osdmap.crush->get_rule_id(name);
4846 if (ruleno < 0) {
4847 ss << "unknown crush rule '" << name << "'";
4848 r = ruleno;
4849 goto reply;
4850 }
4851 osdmap.crush->dump_rule(ruleno, f.get());
4852 }
4853 ostringstream rs;
4854 f->flush(rs);
4855 rs << "\n";
4856 rdata.append(rs.str());
4857 } else if (prefix == "osd crush dump") {
4858 string format;
4859 cmd_getval(g_ceph_context, cmdmap, "format", format);
4860 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4861 f->open_object_section("crush_map");
4862 osdmap.crush->dump(f.get());
4863 f->close_section();
4864 ostringstream rs;
4865 f->flush(rs);
4866 rs << "\n";
4867 rdata.append(rs.str());
4868 } else if (prefix == "osd crush show-tunables") {
4869 string format;
4870 cmd_getval(g_ceph_context, cmdmap, "format", format);
4871 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4872 f->open_object_section("crush_map_tunables");
4873 osdmap.crush->dump_tunables(f.get());
4874 f->close_section();
4875 ostringstream rs;
4876 f->flush(rs);
4877 rs << "\n";
4878 rdata.append(rs.str());
4879 } else if (prefix == "osd crush tree") {
4880 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4881 f->open_array_section("crush_map_roots");
4882 osdmap.crush->dump_tree(f.get());
4883 f->close_section();
4884 f->flush(rdata);
4885 } else if (prefix == "osd crush class ls") {
4886 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4887 f->open_array_section("crush_classes");
4888 for (auto i : osdmap.crush->class_name)
4889 f->dump_string("class", i.second);
4890 f->close_section();
4891 f->flush(rdata);
4892 } else if (prefix == "osd erasure-code-profile ls") {
4893 const auto &profiles = osdmap.get_erasure_code_profiles();
4894 if (f)
4895 f->open_array_section("erasure-code-profiles");
4896 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
4897 if (f)
4898 f->dump_string("profile", i->first.c_str());
4899 else
4900 rdata.append(i->first + "\n");
4901 }
4902 if (f) {
4903 f->close_section();
4904 ostringstream rs;
4905 f->flush(rs);
4906 rs << "\n";
4907 rdata.append(rs.str());
4908 }
4909 } else if (prefix == "osd erasure-code-profile get") {
4910 string name;
4911 cmd_getval(g_ceph_context, cmdmap, "name", name);
4912 if (!osdmap.has_erasure_code_profile(name)) {
4913 ss << "unknown erasure code profile '" << name << "'";
4914 r = -ENOENT;
4915 goto reply;
4916 }
4917 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
4918 if (f)
4919 f->open_object_section("profile");
4920 for (map<string,string>::const_iterator i = profile.begin();
4921 i != profile.end();
4922 ++i) {
4923 if (f)
4924 f->dump_string(i->first.c_str(), i->second.c_str());
4925 else
4926 rdata.append(i->first + "=" + i->second + "\n");
4927 }
4928 if (f) {
4929 f->close_section();
4930 ostringstream rs;
4931 f->flush(rs);
4932 rs << "\n";
4933 rdata.append(rs.str());
4934 }
4935 } else {
4936 // try prepare update
4937 return false;
4938 }
4939
4940 reply:
4941 string rs;
4942 getline(ss, rs);
4943 mon->reply_command(op, r, rs, rdata, get_last_committed());
4944 return true;
4945 }
4946
4947 void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
4948 {
4949 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
4950 pending_inc.get_new_pool(pool_id, pool)->flags = flags;
4951 }
4952
4953 bool OSDMonitor::update_pools_status()
4954 {
4955 if (!mon->pgservice->is_readable())
4956 return false;
4957
4958 bool ret = false;
4959
4960 auto& pools = osdmap.get_pools();
4961 for (auto it = pools.begin(); it != pools.end(); ++it) {
4962 const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
4963 if (!pstat)
4964 continue;
4965 const object_stat_sum_t& sum = pstat->stats.sum;
4966 const pg_pool_t &pool = it->second;
4967 const string& pool_name = osdmap.get_pool_name(it->first);
4968
4969 bool pool_is_full =
4970 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
4971 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
4972
4973 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
4974 if (pool_is_full)
4975 continue;
4976
4977 mon->clog->info() << "pool '" << pool_name
4978 << "' no longer full; removing FULL flag";
4979
4980 update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
4981 ret = true;
4982 } else {
4983 if (!pool_is_full)
4984 continue;
4985
4986 if (pool.quota_max_bytes > 0 &&
4987 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
4988 mon->clog->warn() << "pool '" << pool_name << "' is full"
4989 << " (reached quota's max_bytes: "
4990 << si_t(pool.quota_max_bytes) << ")";
4991 }
4992 if (pool.quota_max_objects > 0 &&
4993 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
4994 mon->clog->warn() << "pool '" << pool_name << "' is full"
4995 << " (reached quota's max_objects: "
4996 << pool.quota_max_objects << ")";
4997 }
4998 update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
4999 ret = true;
5000 }
5001 }
5002 return ret;
5003 }
5004
// Emit health summary/detail entries for per-pool quota conditions:
// a WARN for every pool with the FULL flag set, plus WARN/ERR entries
// for pools approaching their object or byte quota.
void OSDMonitor::get_pools_health(
    list<pair<health_status_t,string> >& summary,
    list<pair<health_status_t,string> > *detail) const
{
  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    // skip pools for which the pg service has no stats yet
    const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
      // uncomment these asserts if/when we update the FULL flag on pg_stat update
      //assert((pool.quota_max_objects > 0) || (pool.quota_max_bytes > 0));

      stringstream ss;
      ss << "pool '" << pool_name << "' is full";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // thresholds are configured as percentages of the quota
    float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
    float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;

    if (pool.quota_max_objects > 0) {
      stringstream ss;
      health_status_t status = HEALTH_OK;
      if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
	// at/over quota: already reported via the FULL flag above
	// uncomment these asserts if/when we update the FULL flag on pg_stat update
	//assert(pool.has_flag(pg_pool_t::FLAG_FULL));
      } else if (crit_threshold > 0 &&
		 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << sum.num_objects << " objects"
	   << " (max " << pool.quota_max_objects << ")";
	status = HEALTH_ERR;
      } else if (warn_threshold > 0 &&
		 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << sum.num_objects << " objects"
	   << " (max " << pool.quota_max_objects << ")";
	status = HEALTH_WARN;
      }
      if (status != HEALTH_OK) {
	pair<health_status_t,string> s(status, ss.str());
	summary.push_back(s);
	if (detail)
	  detail->push_back(s);
      }
    }

    // same tiered reporting for the byte quota
    if (pool.quota_max_bytes > 0) {
      health_status_t status = HEALTH_OK;
      stringstream ss;
      if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	// uncomment these asserts if/when we update the FULL flag on pg_stat update
	//assert(pool.has_flag(pg_pool_t::FLAG_FULL));
      } else if (crit_threshold > 0 &&
		 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << si_t(sum.num_bytes) << " bytes"
	   << " (max " << si_t(pool.quota_max_bytes) << ")";
	status = HEALTH_ERR;
      } else if (warn_threshold > 0 &&
		 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
	ss << "pool '" << pool_name
	   << "' has " << si_t(sum.num_bytes) << " bytes"
	   << " (max " << si_t(pool.quota_max_bytes) << ")";
	status = HEALTH_WARN;
      }
      if (status != HEALTH_OK) {
	pair<health_status_t,string> s(status, ss.str());
	summary.push_back(s);
	if (detail)
	  detail->push_back(s);
      }
    }
  }
}
5087
5088
5089 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5090 {
5091 op->mark_osdmon_event(__func__);
5092 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5093 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5094 MonSession *session = m->get_session();
5095 if (!session)
5096 return -EPERM;
5097 string erasure_code_profile;
5098 stringstream ss;
5099 string rule_name;
5100 if (m->auid)
5101 return prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
5102 0, 0,
5103 erasure_code_profile,
5104 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5105 else
5106 return prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
5107 0, 0,
5108 erasure_code_profile,
5109 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5110 }
5111
5112 int OSDMonitor::crush_rename_bucket(const string& srcname,
5113 const string& dstname,
5114 ostream *ss)
5115 {
5116 int ret;
5117 //
5118 // Avoid creating a pending crush if it does not already exists and
5119 // the rename would fail.
5120 //
5121 if (!_have_pending_crush()) {
5122 ret = _get_stable_crush().can_rename_bucket(srcname,
5123 dstname,
5124 ss);
5125 if (ret)
5126 return ret;
5127 }
5128
5129 CrushWrapper newcrush;
5130 _get_pending_crush(newcrush);
5131
5132 ret = newcrush.rename_bucket(srcname,
5133 dstname,
5134 ss);
5135 if (ret)
5136 return ret;
5137
5138 pending_inc.crush.clear();
5139 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5140 *ss << "renamed bucket " << srcname << " into " << dstname;
5141 return 0;
5142 }
5143
5144 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5145 {
5146 string replacement = "";
5147
5148 if (plugin == "jerasure_generic" ||
5149 plugin == "jerasure_sse3" ||
5150 plugin == "jerasure_sse4" ||
5151 plugin == "jerasure_neon") {
5152 replacement = "jerasure";
5153 } else if (plugin == "shec_generic" ||
5154 plugin == "shec_sse3" ||
5155 plugin == "shec_sse4" ||
5156 plugin == "shec_neon") {
5157 replacement = "shec";
5158 }
5159
5160 if (replacement != "") {
5161 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5162 << plugin << " that has been deprecated. Please use "
5163 << replacement << " instead." << dendl;
5164 }
5165 }
5166
5167 int OSDMonitor::normalize_profile(const string& profilename,
5168 ErasureCodeProfile &profile,
5169 bool force,
5170 ostream *ss)
5171 {
5172 ErasureCodeInterfaceRef erasure_code;
5173 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5174 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5175 check_legacy_ec_plugin(plugin->second, profilename);
5176 int err = instance.factory(plugin->second,
5177 g_conf->get_val<std::string>("erasure_code_dir"),
5178 profile, &erasure_code, ss);
5179 if (err) {
5180 return err;
5181 }
5182
5183 err = erasure_code->init(profile, ss);
5184 if (err) {
5185 return err;
5186 }
5187
5188 auto it = profile.find("stripe_unit");
5189 if (it != profile.end()) {
5190 string err_str;
5191 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5192 if (!err_str.empty()) {
5193 *ss << "could not parse stripe_unit '" << it->second
5194 << "': " << err_str << std::endl;
5195 return -EINVAL;
5196 }
5197 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5198 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5199 if (chunk_size != stripe_unit) {
5200 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5201 << "alignment. Would be padded to " << chunk_size
5202 << std::endl;
5203 return -EINVAL;
5204 }
5205 if ((stripe_unit % 4096) != 0 && !force) {
5206 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5207 << "use --force to override this check" << std::endl;
5208 return -EINVAL;
5209 }
5210 }
5211 return 0;
5212 }
5213
5214 int OSDMonitor::crush_rule_create_erasure(const string &name,
5215 const string &profile,
5216 int *rule,
5217 ostream *ss)
5218 {
5219 int ruleid = osdmap.crush->get_rule_id(name);
5220 if (ruleid != -ENOENT) {
5221 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
5222 return -EEXIST;
5223 }
5224
5225 CrushWrapper newcrush;
5226 _get_pending_crush(newcrush);
5227
5228 ruleid = newcrush.get_rule_id(name);
5229 if (ruleid != -ENOENT) {
5230 *rule = newcrush.get_rule_mask_ruleset(ruleid);
5231 return -EALREADY;
5232 } else {
5233 ErasureCodeInterfaceRef erasure_code;
5234 int err = get_erasure_code(profile, &erasure_code, ss);
5235 if (err) {
5236 *ss << "failed to load plugin using profile " << profile << std::endl;
5237 return err;
5238 }
5239
5240 err = erasure_code->create_ruleset(name, newcrush, ss);
5241 erasure_code.reset();
5242 if (err < 0)
5243 return err;
5244 *rule = err;
5245 pending_inc.crush.clear();
5246 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5247 return 0;
5248 }
5249 }
5250
// Instantiate the erasure code implementation named by a committed
// profile.  Returns -EAGAIN while the profile is still pending, -EINVAL
// if the profile lacks a "plugin" entry, otherwise the factory's result.
int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
				 ErasureCodeInterfaceRef *erasure_code,
				 ostream *ss) const
{
  // profile still being proposed: caller must retry after it commits
  if (pending_inc.has_erasure_code_profile(erasure_code_profile))
    return -EAGAIN;
  ErasureCodeProfile profile =
    osdmap.get_erasure_code_profile(erasure_code_profile);
  ErasureCodeProfile::const_iterator plugin =
    profile.find("plugin");
  if (plugin == profile.end()) {
    *ss << "cannot determine the erasure code plugin"
	<< " because there is no 'plugin' entry in the erasure_code_profile "
	<< profile << std::endl;
    return -EINVAL;
  }
  // warn (in the monitor log) about deprecated per-arch plugin names
  check_legacy_ec_plugin(plugin->second, erasure_code_profile);
  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
  return instance.factory(plugin->second,
			  g_conf->get_val<std::string>("erasure_code_dir"),
			  profile, erasure_code, ss);
}
5273
5274 int OSDMonitor::check_cluster_features(uint64_t features,
5275 stringstream &ss)
5276 {
5277 stringstream unsupported_ss;
5278 int unsupported_count = 0;
5279 if ((mon->get_quorum_con_features() & features) != features) {
5280 unsupported_ss << "the monitor cluster";
5281 ++unsupported_count;
5282 }
5283
5284 set<int32_t> up_osds;
5285 osdmap.get_up_osds(up_osds);
5286 for (set<int32_t>::iterator it = up_osds.begin();
5287 it != up_osds.end(); ++it) {
5288 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5289 if ((xi.features & features) != features) {
5290 if (unsupported_count > 0)
5291 unsupported_ss << ", ";
5292 unsupported_ss << "osd." << *it;
5293 unsupported_count ++;
5294 }
5295 }
5296
5297 if (unsupported_count > 0) {
5298 ss << "features " << features << " unsupported by: "
5299 << unsupported_ss.str();
5300 return -ENOTSUP;
5301 }
5302
5303 // check pending osd state, too!
5304 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5305 pending_inc.new_xinfo.begin();
5306 p != pending_inc.new_xinfo.end(); ++p) {
5307 const osd_xinfo_t &xi = p->second;
5308 if ((xi.features & features) != features) {
5309 dout(10) << __func__ << " pending osd." << p->first
5310 << " features are insufficient; retry" << dendl;
5311 return -EAGAIN;
5312 }
5313 }
5314
5315 return 0;
5316 }
5317
// Check whether a candidate crush map is compatible with the cluster:
// builds a throwaway OSDMap with the new crush applied on top of the
// current pending state, then verifies client and mon/OSD feature
// requirements.  Returns false (with a reason in ss) on incompatibility.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
                                                 stringstream& ss)
{
  // simulate the proposed state without touching pending_inc
  OSDMap::Incremental new_pending = pending_inc;
  ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client > 0) {
    auto mv = newmap.get_min_compat_client();
    // reject a crush map that would demand newer clients than allowed
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << ceph_release_name(mv)
	 << " but require_min_compat_client is "
	 << ceph_release_name(newmap.require_min_compat_client);
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
5351
5352 bool OSDMonitor::erasure_code_profile_in_use(
5353 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5354 const string &profile,
5355 ostream *ss)
5356 {
5357 bool found = false;
5358 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5359 p != pools.end();
5360 ++p) {
5361 if (p->second.erasure_code_profile == profile) {
5362 *ss << osdmap.pool_name[p->first] << " ";
5363 found = true;
5364 }
5365 }
5366 if (found) {
5367 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5368 }
5369 return found;
5370 }
5371
5372 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
5373 map<string,string> *erasure_code_profile_map,
5374 ostream *ss)
5375 {
5376 int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
5377 *ss,
5378 erasure_code_profile_map);
5379 if (r)
5380 return r;
5381 assert((*erasure_code_profile_map).count("plugin"));
5382 string default_plugin = (*erasure_code_profile_map)["plugin"];
5383 map<string,string> user_map;
5384 for (vector<string>::const_iterator i = erasure_code_profile.begin();
5385 i != erasure_code_profile.end();
5386 ++i) {
5387 size_t equal = i->find('=');
5388 if (equal == string::npos) {
5389 user_map[*i] = string();
5390 (*erasure_code_profile_map)[*i] = string();
5391 } else {
5392 const string key = i->substr(0, equal);
5393 equal++;
5394 const string value = i->substr(equal);
5395 user_map[key] = value;
5396 (*erasure_code_profile_map)[key] = value;
5397 }
5398 }
5399
5400 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
5401 (*erasure_code_profile_map) = user_map;
5402
5403 return 0;
5404 }
5405
5406 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
5407 const string &erasure_code_profile,
5408 unsigned *size, unsigned *min_size,
5409 ostream *ss)
5410 {
5411 int err = 0;
5412 switch (pool_type) {
5413 case pg_pool_t::TYPE_REPLICATED:
5414 *size = g_conf->osd_pool_default_size;
5415 *min_size = g_conf->get_osd_pool_default_min_size();
5416 break;
5417 case pg_pool_t::TYPE_ERASURE:
5418 {
5419 ErasureCodeInterfaceRef erasure_code;
5420 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5421 if (err == 0) {
5422 *size = erasure_code->get_chunk_count();
5423 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
5424 }
5425 }
5426 break;
5427 default:
5428 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
5429 err = -EINVAL;
5430 break;
5431 }
5432 return err;
5433 }
5434
// Compute the stripe width for a new pool.  Ignored for replicated
// pools; for erasure pools it is data_chunks * aligned chunk size, where
// the stripe unit comes from the profile if set, else from configuration.
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
      // a profile-level stripe_unit overrides the configured default
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
	// the profile was validated when created; a parse failure here is a bug
	assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
5473
// Resolve the crush rule for a new pool.  When *crush_rule >= 0 it is
// only validated; otherwise it is looked up by name (replicated) or
// created from the erasure code profile (erasure).  Returns -EAGAIN when
// the caller must wait for a pending rule to be proposed.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  //Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// remap crush_rule_create_erasure()'s result: a rule that already
	// exists (committed) is success; a newly created or still-pending
	// rule means the caller must retry after the proposal commits.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // an explicit rule id was supplied: just check that it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
5535
5536 int OSDMonitor::get_crush_rule(const string &rule_name,
5537 int *crush_rule,
5538 ostream *ss)
5539 {
5540 int ret;
5541 ret = osdmap.crush->get_rule_id(rule_name);
5542 if (ret != -ENOENT) {
5543 // found it, use it
5544 *crush_rule = ret;
5545 } else {
5546 CrushWrapper newcrush;
5547 _get_pending_crush(newcrush);
5548
5549 ret = newcrush.get_rule_id(rule_name);
5550 if (ret != -ENOENT) {
5551 // found it, wait for it to be proposed
5552 dout(20) << __func__ << ": rule " << rule_name
5553 << " try again" << dendl;
5554 return -EAGAIN;
5555 } else {
5556 //Cannot find it , return error
5557 *ss << "specified rule " << rule_name << " doesn't exist";
5558 return ret;
5559 }
5560 }
5561 return 0;
5562 }
5563
/**
 * Stage the creation of a new pool in pending_inc.
 *
 * @param name The name of the new pool
 * @param auid The auid of the pool owner. Can be -1
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 const string &erasure_code_profile,
                                 const unsigned pool_type,
                                 const uint64_t expected_num_objects,
                                 FastReadType fast_read,
				 ostream *ss)
{
  if (name.length() == 0)
    return -EINVAL;
  // fall back to configured defaults for unspecified pg counts
  if (pg_num == 0)
    pg_num = g_conf->osd_pool_default_pg_num;
  if (pgp_num == 0)
    pgp_num = g_conf->osd_pool_default_pgp_num;
  if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf->mon_max_pool_pg_num
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read only makes sense for erasure-coded pools
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // resolve (or, for erasure pools, create) the crush rule to use
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << " prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // sanity-check the (pending) crush map before committing to the rule
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  ostringstream err;
  CrushTester tester(newcrush, err);
  // use the internal crush tester if crushtool config is empty
  if (g_conf->crushtool.empty()) {
    r = tester.test();
  } else {
    r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
				   osdmap.get_max_osd(),
				   g_conf->mon_lease,
				   crush_rule);
  }
  if (r) {
    dout(10) << " tester.test_with_crushtool returns " << r
	     << ": " << err.str() << dendl;
    *ss << "crushtool check failed with " << r << ": " << err.str();
    return r;
  }
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
  if (r) {
    dout(10) << " prepare_pool_size returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // map the FastReadType tristate onto the pool's boolean fast_read flag
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
        fread = false;
        break;
      case FAST_READ_ON:
        fread = true;
        break;
      case FAST_READ_DEFAULT:
        fread = g_conf->mon_osd_pool_ec_fast_read;
        break;
      default:
        *ss << "invalid fast_read setting: " << fast_read;
        return -EINVAL;
    }
  }

  // if a pool by this name is already pending creation, treat as success
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // allocate a new pool id and stage the pool in the pending incremental
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->type = pool_type;
  pi->fast_read = fread;
  // apply configured default pool flags
  pi->flags = g_conf->osd_pool_default_flags;
  if (g_conf->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // GMT hitsets require every up OSD to advertise the feature bit
  if (g_conf->osd_pool_use_gmt_hitset &&
      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  pi->set_pg_num(pg_num);
  pi->set_pgp_num(pgp_num);
  pi->last_change = pending_inc.epoch;
  pi->auid = auid;
  pi->erasure_code_profile = erasure_code_profile;
  pi->stripe_width = stripe_width;
  // cache-tier defaults ("micro" fields are ratios scaled by 1e6)
  pi->cache_target_dirty_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
5723
5724 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
5725 {
5726 op->mark_osdmon_event(__func__);
5727 ostringstream ss;
5728 if (pending_inc.new_flags < 0)
5729 pending_inc.new_flags = osdmap.get_flags();
5730 pending_inc.new_flags |= flag;
5731 ss << OSDMap::get_flag_string(flag) << " is set";
5732 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5733 get_last_committed() + 1));
5734 return true;
5735 }
5736
5737 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
5738 {
5739 op->mark_osdmon_event(__func__);
5740 ostringstream ss;
5741 if (pending_inc.new_flags < 0)
5742 pending_inc.new_flags = osdmap.get_flags();
5743 pending_inc.new_flags &= ~flag;
5744 ss << OSDMap::get_flag_string(flag) << " is unset";
5745 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5746 get_last_committed() + 1));
5747 return true;
5748 }
5749
5750 int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
5751 stringstream& ss)
5752 {
5753 string poolstr;
5754 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
5755 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5756 if (pool < 0) {
5757 ss << "unrecognized pool '" << poolstr << "'";
5758 return -ENOENT;
5759 }
5760 string var;
5761 cmd_getval(g_ceph_context, cmdmap, "var", var);
5762
5763 pg_pool_t p = *osdmap.get_pg_pool(pool);
5764 if (pending_inc.new_pools.count(pool))
5765 p = pending_inc.new_pools[pool];
5766
5767 // accept val as a json string in the normal case (current
5768 // generation monitor). parse out int or float values from the
5769 // string as needed. however, if it is not a string, try to pull
5770 // out an int, in case an older monitor with an older json schema is
5771 // forwarding a request.
5772 string val;
5773 string interr, floaterr;
5774 int64_t n = 0;
5775 double f = 0;
5776 int64_t uf = 0; // micro-f
5777 if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
5778 // wasn't a string; maybe an older mon forwarded json with an int?
5779 if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
5780 return -EINVAL; // no value!
5781 } else {
5782 // we got a string. see if it contains an int.
5783 n = strict_strtoll(val.c_str(), 10, &interr);
5784 // or a float
5785 f = strict_strtod(val.c_str(), &floaterr);
5786 uf = llrintl(f * (double)1000000.0);
5787 }
5788
5789 if (!p.is_tier() &&
5790 (var == "hit_set_type" || var == "hit_set_period" ||
5791 var == "hit_set_count" || var == "hit_set_fpp" ||
5792 var == "target_max_objects" || var == "target_max_bytes" ||
5793 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
5794 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
5795 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
5796 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
5797 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
5798 return -EACCES;
5799 }
5800
5801 if (var == "size") {
5802 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
5803 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
5804 return -EPERM;
5805 }
5806 if (p.type == pg_pool_t::TYPE_ERASURE) {
5807 ss << "can not change the size of an erasure-coded pool";
5808 return -ENOTSUP;
5809 }
5810 if (interr.length()) {
5811 ss << "error parsing integer value '" << val << "': " << interr;
5812 return -EINVAL;
5813 }
5814 if (n <= 0 || n > 10) {
5815 ss << "pool size must be between 1 and 10";
5816 return -EINVAL;
5817 }
5818 p.size = n;
5819 if (n < p.min_size)
5820 p.min_size = n;
5821 } else if (var == "min_size") {
5822 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
5823 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
5824 return -EPERM;
5825 }
5826 if (interr.length()) {
5827 ss << "error parsing integer value '" << val << "': " << interr;
5828 return -EINVAL;
5829 }
5830
5831 if (p.type != pg_pool_t::TYPE_ERASURE) {
5832 if (n < 1 || n > p.size) {
5833 ss << "pool min_size must be between 1 and " << (int)p.size;
5834 return -EINVAL;
5835 }
5836 } else {
5837 ErasureCodeInterfaceRef erasure_code;
5838 int k;
5839 stringstream tmp;
5840 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
5841 if (err == 0) {
5842 k = erasure_code->get_data_chunk_count();
5843 } else {
5844 ss << __func__ << " get_erasure_code failed: " << tmp.rdbuf();
5845 return err;
5846 }
5847
5848 if (n < k || n > p.size) {
5849 ss << "pool min_size must be between " << k << " and " << (int)p.size;
5850 return -EINVAL;
5851 }
5852 }
5853 p.min_size = n;
5854 } else if (var == "auid") {
5855 if (interr.length()) {
5856 ss << "error parsing integer value '" << val << "': " << interr;
5857 return -EINVAL;
5858 }
5859 p.auid = n;
5860 } else if (var == "crash_replay_interval") {
5861 if (interr.length()) {
5862 ss << "error parsing integer value '" << val << "': " << interr;
5863 return -EINVAL;
5864 }
5865 p.crash_replay_interval = n;
5866 } else if (var == "pg_num") {
5867 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
5868 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
5869 return -EPERM;
5870 }
5871 if (interr.length()) {
5872 ss << "error parsing integer value '" << val << "': " << interr;
5873 return -EINVAL;
5874 }
5875 if (n <= (int)p.get_pg_num()) {
5876 ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
5877 if (n < (int)p.get_pg_num())
5878 return -EEXIST;
5879 return 0;
5880 }
5881 string force;
5882 cmd_getval(g_ceph_context,cmdmap, "force", force);
5883 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
5884 force != "--yes-i-really-mean-it") {
5885 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
5886 return -EPERM;
5887 }
5888 int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
5889 int64_t new_pgs = n - p.get_pg_num();
5890 if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
5891 ss << "specified pg_num " << n << " is too large (creating "
5892 << new_pgs << " new PGs on ~" << expected_osds
5893 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
5894 << ')';
5895 return -E2BIG;
5896 }
5897 p.set_pg_num(n);
5898 // force pre-luminous clients to resend their ops, since they
5899 // don't understand that split PGs now form a new interval.
5900 p.last_force_op_resend_preluminous = pending_inc.epoch;
5901 } else if (var == "pgp_num") {
5902 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
5903 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
5904 return -EPERM;
5905 }
5906 if (interr.length()) {
5907 ss << "error parsing integer value '" << val << "': " << interr;
5908 return -EINVAL;
5909 }
5910 if (n <= 0) {
5911 ss << "specified pgp_num must > 0, but you set to " << n;
5912 return -EINVAL;
5913 }
5914 if (n > (int)p.get_pg_num()) {
5915 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
5916 return -EINVAL;
5917 }
5918 p.set_pgp_num(n);
5919 } else if (var == "crush_rule") {
5920 int id = osdmap.crush->get_rule_id(val);
5921 if (id == -ENOENT) {
5922 ss << "crush rule " << val << " does not exist";
5923 return -ENOENT;
5924 }
5925 if (id < 0) {
5926 ss << cpp_strerror(id);
5927 return -ENOENT;
5928 }
5929 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
5930 return -EINVAL;
5931 }
5932 p.crush_rule = id;
5933 } else if (var == "nodelete" || var == "nopgchange" ||
5934 var == "nosizechange" || var == "write_fadvise_dontneed" ||
5935 var == "noscrub" || var == "nodeep-scrub") {
5936 uint64_t flag = pg_pool_t::get_flag_by_name(var);
5937 // make sure we only compare against 'n' if we didn't receive a string
5938 if (val == "true" || (interr.empty() && n == 1)) {
5939 p.set_flag(flag);
5940 } else if (val == "false" || (interr.empty() && n == 0)) {
5941 p.unset_flag(flag);
5942 } else {
5943 ss << "expecting value 'true', 'false', '0', or '1'";
5944 return -EINVAL;
5945 }
5946 } else if (var == "hashpspool") {
5947 uint64_t flag = pg_pool_t::get_flag_by_name(var);
5948 string force;
5949 cmd_getval(g_ceph_context, cmdmap, "force", force);
5950 if (force != "--yes-i-really-mean-it") {
5951 ss << "are you SURE? this will remap all placement groups in this pool,"
5952 " this triggers large data movement,"
5953 " pass --yes-i-really-mean-it if you really do.";
5954 return -EPERM;
5955 }
5956 // make sure we only compare against 'n' if we didn't receive a string
5957 if (val == "true" || (interr.empty() && n == 1)) {
5958 p.set_flag(flag);
5959 } else if (val == "false" || (interr.empty() && n == 0)) {
5960 p.unset_flag(flag);
5961 } else {
5962 ss << "expecting value 'true', 'false', '0', or '1'";
5963 return -EINVAL;
5964 }
5965 } else if (var == "hit_set_type") {
5966 if (val == "none")
5967 p.hit_set_params = HitSet::Params();
5968 else {
5969 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
5970 if (err)
5971 return err;
5972 if (val == "bloom") {
5973 BloomHitSet::Params *bsp = new BloomHitSet::Params;
5974 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
5975 p.hit_set_params = HitSet::Params(bsp);
5976 } else if (val == "explicit_hash")
5977 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
5978 else if (val == "explicit_object")
5979 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
5980 else {
5981 ss << "unrecognized hit_set type '" << val << "'";
5982 return -EINVAL;
5983 }
5984 }
5985 } else if (var == "hit_set_period") {
5986 if (interr.length()) {
5987 ss << "error parsing integer value '" << val << "': " << interr;
5988 return -EINVAL;
5989 }
5990 p.hit_set_period = n;
5991 } else if (var == "hit_set_count") {
5992 if (interr.length()) {
5993 ss << "error parsing integer value '" << val << "': " << interr;
5994 return -EINVAL;
5995 }
5996 p.hit_set_count = n;
5997 } else if (var == "hit_set_fpp") {
5998 if (floaterr.length()) {
5999 ss << "error parsing floating point value '" << val << "': " << floaterr;
6000 return -EINVAL;
6001 }
6002 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
6003 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
6004 return -EINVAL;
6005 }
6006 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
6007 bloomp->set_fpp(f);
6008 } else if (var == "use_gmt_hitset") {
6009 if (val == "true" || (interr.empty() && n == 1)) {
6010 if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
6011 ss << "not all OSDs support GMT hit set.";
6012 return -EINVAL;
6013 }
6014 p.use_gmt_hitset = true;
6015 } else {
6016 ss << "expecting value 'true' or '1'";
6017 return -EINVAL;
6018 }
6019 } else if (var == "allow_ec_overwrites") {
6020 if (!p.is_erasure()) {
6021 ss << "ec overwrites can only be enabled for an erasure coded pool";
6022 return -EINVAL;
6023 }
6024 if (val == "true" || (interr.empty() && n == 1)) {
6025 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
6026 } else if (val == "false" || (interr.empty() && n == 0)) {
6027 ss << "ec overwrites cannot be disabled once enabled";
6028 return -EINVAL;
6029 } else {
6030 ss << "expecting value 'true', 'false', '0', or '1'";
6031 return -EINVAL;
6032 }
6033 stringstream err;
6034 if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
6035 !is_pool_currently_all_bluestore(pool, p, &err)) {
6036 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
6037 return -EINVAL;
6038 }
6039 } else if (var == "target_max_objects") {
6040 if (interr.length()) {
6041 ss << "error parsing int '" << val << "': " << interr;
6042 return -EINVAL;
6043 }
6044 p.target_max_objects = n;
6045 } else if (var == "target_max_bytes") {
6046 if (interr.length()) {
6047 ss << "error parsing int '" << val << "': " << interr;
6048 return -EINVAL;
6049 }
6050 p.target_max_bytes = n;
6051 } else if (var == "cache_target_dirty_ratio") {
6052 if (floaterr.length()) {
6053 ss << "error parsing float '" << val << "': " << floaterr;
6054 return -EINVAL;
6055 }
6056 if (f < 0 || f > 1.0) {
6057 ss << "value must be in the range 0..1";
6058 return -ERANGE;
6059 }
6060 p.cache_target_dirty_ratio_micro = uf;
6061 } else if (var == "cache_target_dirty_high_ratio") {
6062 if (floaterr.length()) {
6063 ss << "error parsing float '" << val << "': " << floaterr;
6064 return -EINVAL;
6065 }
6066 if (f < 0 || f > 1.0) {
6067 ss << "value must be in the range 0..1";
6068 return -ERANGE;
6069 }
6070 p.cache_target_dirty_high_ratio_micro = uf;
6071 } else if (var == "cache_target_full_ratio") {
6072 if (floaterr.length()) {
6073 ss << "error parsing float '" << val << "': " << floaterr;
6074 return -EINVAL;
6075 }
6076 if (f < 0 || f > 1.0) {
6077 ss << "value must be in the range 0..1";
6078 return -ERANGE;
6079 }
6080 p.cache_target_full_ratio_micro = uf;
6081 } else if (var == "cache_min_flush_age") {
6082 if (interr.length()) {
6083 ss << "error parsing int '" << val << "': " << interr;
6084 return -EINVAL;
6085 }
6086 p.cache_min_flush_age = n;
6087 } else if (var == "cache_min_evict_age") {
6088 if (interr.length()) {
6089 ss << "error parsing int '" << val << "': " << interr;
6090 return -EINVAL;
6091 }
6092 p.cache_min_evict_age = n;
6093 } else if (var == "min_read_recency_for_promote") {
6094 if (interr.length()) {
6095 ss << "error parsing integer value '" << val << "': " << interr;
6096 return -EINVAL;
6097 }
6098 p.min_read_recency_for_promote = n;
6099 } else if (var == "hit_set_grade_decay_rate") {
6100 if (interr.length()) {
6101 ss << "error parsing integer value '" << val << "': " << interr;
6102 return -EINVAL;
6103 }
6104 if (n > 100 || n < 0) {
6105 ss << "value out of range,valid range is 0 - 100";
6106 return -EINVAL;
6107 }
6108 p.hit_set_grade_decay_rate = n;
6109 } else if (var == "hit_set_search_last_n") {
6110 if (interr.length()) {
6111 ss << "error parsing integer value '" << val << "': " << interr;
6112 return -EINVAL;
6113 }
6114 if (n > p.hit_set_count || n < 0) {
6115 ss << "value out of range,valid range is 0 - hit_set_count";
6116 return -EINVAL;
6117 }
6118 p.hit_set_search_last_n = n;
6119 } else if (var == "min_write_recency_for_promote") {
6120 if (interr.length()) {
6121 ss << "error parsing integer value '" << val << "': " << interr;
6122 return -EINVAL;
6123 }
6124 p.min_write_recency_for_promote = n;
6125 } else if (var == "fast_read") {
6126 if (p.is_replicated()) {
6127 ss << "fast read is not supported in replication pool";
6128 return -EINVAL;
6129 }
6130 if (val == "true" || (interr.empty() && n == 1)) {
6131 p.fast_read = true;
6132 } else if (val == "false" || (interr.empty() && n == 0)) {
6133 p.fast_read = false;
6134 } else {
6135 ss << "expecting value 'true', 'false', '0', or '1'";
6136 return -EINVAL;
6137 }
6138 } else if (pool_opts_t::is_opt_name(var)) {
6139 if (var == "compression_mode") {
6140 auto cmode = Compressor::get_comp_mode_type(val);
6141 if (!cmode) {
6142 ss << "unrecognized compression mode '" << val << "'";
6143 return EINVAL;
6144 }
6145 } else if (var == "compression_algorithm") {
6146 auto alg = Compressor::get_comp_alg_type(val);
6147 if (!alg) {
6148 ss << "unrecognized compression_algorithm '" << val << "'";
6149 return EINVAL;
6150 }
6151 } else if (var == "compression_required_ratio") {
6152 if (floaterr.length()) {
6153 ss << "error parsing float value '" << val << "': " << floaterr;
6154 return -EINVAL;
6155 }
6156 if (f < 0 || f>1) {
6157 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
6158 return EINVAL;
6159 }
6160 } else if (var == "csum_type") {
6161 auto t = val != "unset" ? Checksummer::get_csum_string_type(val) : 0;
6162 if (t < 0 ) {
6163 ss << "unrecognized csum_type '" << val << "'";
6164 return EINVAL;
6165 }
6166 //preserve csum_type numeric value
6167 n = t;
6168 interr.clear();
6169 } else if (var == "compression_max_blob_size" ||
6170 var == "compression_min_blob_size" ||
6171 var == "csum_max_block" ||
6172 var == "csum_min_block") {
6173 if (interr.length()) {
6174 ss << "error parsing int value '" << val << "': " << interr;
6175 return -EINVAL;
6176 }
6177 }
6178
6179 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
6180 switch (desc.type) {
6181 case pool_opts_t::STR:
6182 if (val.empty()) {
6183 p.opts.unset(desc.key);
6184 } else {
6185 p.opts.set(desc.key, static_cast<std::string>(val));
6186 }
6187 break;
6188 case pool_opts_t::INT:
6189 if (interr.length()) {
6190 ss << "error parsing integer value '" << val << "': " << interr;
6191 return -EINVAL;
6192 }
6193 if (n == 0) {
6194 p.opts.unset(desc.key);
6195 } else {
6196 p.opts.set(desc.key, static_cast<int>(n));
6197 }
6198 break;
6199 case pool_opts_t::DOUBLE:
6200 if (floaterr.length()) {
6201 ss << "error parsing floating point value '" << val << "': " << floaterr;
6202 return -EINVAL;
6203 }
6204 if (f == 0) {
6205 p.opts.unset(desc.key);
6206 } else {
6207 p.opts.set(desc.key, static_cast<double>(f));
6208 }
6209 break;
6210 default:
6211 assert(!"unknown type");
6212 }
6213 } else {
6214 ss << "unrecognized variable '" << var << "'";
6215 return -EINVAL;
6216 }
6217 ss << "set pool " << pool << " " << var << " to " << val;
6218 p.last_change = pending_inc.epoch;
6219 pending_inc.new_pools[pool] = p;
6220 return 0;
6221 }
6222
6223 int OSDMonitor::_prepare_command_osd_crush_remove(
6224 CrushWrapper &newcrush,
6225 int32_t id,
6226 int32_t ancestor,
6227 bool has_ancestor,
6228 bool unlink_only)
6229 {
6230 int err = 0;
6231
6232 if (has_ancestor) {
6233 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
6234 unlink_only);
6235 } else {
6236 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
6237 }
6238 return err;
6239 }
6240
// Stage the (already modified) crush map into the pending incremental so
// it is encoded with the feature set of the current quorum and shipped
// with the next proposed osdmap epoch.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
6246
6247 int OSDMonitor::prepare_command_osd_crush_remove(
6248 CrushWrapper &newcrush,
6249 int32_t id,
6250 int32_t ancestor,
6251 bool has_ancestor,
6252 bool unlink_only)
6253 {
6254 int err = _prepare_command_osd_crush_remove(
6255 newcrush, id, ancestor,
6256 has_ancestor, unlink_only);
6257
6258 if (err < 0)
6259 return err;
6260
6261 assert(err == 0);
6262 do_osd_crush_remove(newcrush);
6263
6264 return 0;
6265 }
6266
6267 int OSDMonitor::prepare_command_osd_remove(int32_t id)
6268 {
6269 if (osdmap.is_up(id)) {
6270 return -EBUSY;
6271 }
6272
6273 pending_inc.new_state[id] = osdmap.get_state(id);
6274 pending_inc.new_uuid[id] = uuid_d();
6275 pending_metadata_rm.insert(id);
6276 pending_metadata.erase(id);
6277
6278 return 0;
6279 }
6280
6281 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
6282 {
6283 assert(existing_id);
6284 *existing_id = -1;
6285
6286 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
6287 if (!osdmap.exists(i) &&
6288 pending_inc.new_up_client.count(i) == 0 &&
6289 (pending_inc.new_state.count(i) == 0 ||
6290 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
6291 *existing_id = i;
6292 return -1;
6293 }
6294 }
6295
6296 if (pending_inc.new_max_osd < 0) {
6297 return osdmap.get_max_osd();
6298 }
6299 return pending_inc.new_max_osd;
6300 }
6301
// Create (or re-acknowledge) an osd in the pending incremental.  On return,
// *new_id holds the id the osd will use.  Three paths are possible: the
// uuid already maps to an osd (reuse its id), the caller supplied an id for
// a fresh uuid (honor it), or neither (allocate/recycle an id).  Validation
// is assumed to have happened beforehand; violations assert.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already belongs to an osd; any caller-supplied id must match.
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycling a free slot below max_osd; mark it OUT so it does not
    // receive data until it boots and is weighted in.
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;

  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

 out:
  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id, even on the goto paths above.
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
6363
// Validate an osd creation request against the committed and pending maps.
//
// Return convention:
//   0         - creation may proceed (nothing conflicting found);
//   EEXIST    - positive: an osd with this uuid already exists at a
//               compatible id (stored in *existing_id); the operation is
//               idempotent rather than an error;
//   -EEXIST   - uuid is already bound to a *different* id;
//   -EAGAIN   - an osd with this uuid/id is pending in the current proposal;
//   -EINVAL   - id is in use by an osd with a different uuid (only when
//               check_osd_exists is set).
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
6434
6435 int OSDMonitor::prepare_command_osd_create(
6436 const int32_t id,
6437 const uuid_d& uuid,
6438 int32_t* existing_id,
6439 stringstream& ss)
6440 {
6441 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
6442 assert(existing_id);
6443
6444 if (uuid.is_zero()) {
6445 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
6446 }
6447
6448 return validate_osd_create(id, uuid, true, existing_id, ss);
6449 }
6450
// Handle `osd new`: create a brand-new osd, recreate a destroyed one, or
// recognize an already-existing osd idempotently.  Requires a uuid; id and
// secrets (cephx / lockbox / dm-crypt, via `-i secrets.json`) are optional.
// Returns 0 on a new creation, positive EEXIST when the request was fully
// idempotent, or a negative errno on error.  Must run with paxos plugged,
// since it may update the auth monitor and config-key service as well.
int OSDMonitor::prepare_command_osd_new(
  MonOpRequestRef op,
  const map<string,cmd_vartype>& cmdmap,
  const map<string,string>& secrets,
  stringstream &ss,
  Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // skip the "id exists" check when recreating a destroyed osd: reusing
  // that id is precisely the point.
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // _allocate_osd_id found a recyclable slot instead of a new id.
        assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    assert(id >= 0);
    assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && secrets.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no secrets -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = (!secrets.empty());

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (secrets.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = secrets.at("cephx_secret");

    bool has_lockbox_secret = (secrets.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (secrets.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key must be supplied together or not
    // at all.
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = secrets.at("cephx_lockbox_secret");
      dmcrypt_key = secrets.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  assert(!has_secrets || !cephx_secret.empty());
  assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    assert(!cephx_secret.empty());
    assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    assert(0 == err);

    if (has_lockbox) {
      assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    // reuse the destroyed osd's slot: weight it OUT and flip the
    // DESTROYED/NEW bits in the pending state.
    assert(id >= 0);
    assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
    pending_inc.new_uuid[id] = uuid;
  } else {
    assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, &new_id);
    assert(new_id >= 0);
    assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
6698
6699 bool OSDMonitor::prepare_command(MonOpRequestRef op)
6700 {
6701 op->mark_osdmon_event(__func__);
6702 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
6703 stringstream ss;
6704 map<string, cmd_vartype> cmdmap;
6705 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
6706 string rs = ss.str();
6707 mon->reply_command(op, -EINVAL, rs, get_last_committed());
6708 return true;
6709 }
6710
6711 MonSession *session = m->get_session();
6712 if (!session) {
6713 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
6714 return true;
6715 }
6716
6717 return prepare_command_impl(op, cmdmap);
6718 }
6719
6720 static int parse_reweights(CephContext *cct,
6721 const map<string,cmd_vartype> &cmdmap,
6722 const OSDMap& osdmap,
6723 map<int32_t, uint32_t>* weights)
6724 {
6725 string weights_str;
6726 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
6727 return -EINVAL;
6728 }
6729 std::replace(begin(weights_str), end(weights_str), '\'', '"');
6730 json_spirit::mValue json_value;
6731 if (!json_spirit::read(weights_str, json_value)) {
6732 return -EINVAL;
6733 }
6734 if (json_value.type() != json_spirit::obj_type) {
6735 return -EINVAL;
6736 }
6737 const auto obj = json_value.get_obj();
6738 try {
6739 for (auto& osd_weight : obj) {
6740 auto osd_id = std::stoi(osd_weight.first);
6741 if (!osdmap.exists(osd_id)) {
6742 return -ENOENT;
6743 }
6744 if (osd_weight.second.type() != json_spirit::str_type) {
6745 return -EINVAL;
6746 }
6747 auto weight = std::stoul(osd_weight.second.get_str());
6748 weights->insert({osd_id, weight});
6749 }
6750 } catch (const std::logic_error& e) {
6751 return -EINVAL;
6752 }
6753 return 0;
6754 }
6755
// Mark an osd as destroyed: revoke its auth entities, remove its
// config-key (lockbox) data, and queue the DESTROYED state plus a zeroed
// uuid in the pending incremental.  Idempotent at every step: already-gone
// auth/config-key entries are tolerated, and an already-destroyed osd is a
// no-op.  Returns 0 on success, -ENOENT if the osd does not exist at all.
// The caller is responsible for proposing the pending map (see note at the
// bottom) — hence the requirement that paxos be plugged.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // validate first so that the update phase below cannot fail part-way.
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already gone; nothing to revoke.
      idempotent_auth = true;
      err = 0;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // config-key entries already gone; nothing to remove.
    assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  assert(err == 0);
  return 0;
}
6828
// Purge osd.<id> completely: remove it from the crush map, mark it
// DESTROYED (which also wipes its cephx keys and config-key entries via
// prepare_command_osd_destroy()), and finally remove it from the osdmap.
//
// Preconditions (asserted below): paxos is plugged, and the osd is not
// up.  The caller is responsible for propose_pending() afterwards, so
// every change made here lands in a single proposal.
//
// Returns:
//   0        on success (pending_inc now carries the purge),
//   -ENOENT  if the osd does not exist and all steps were idempotent,
//   <0       if the crush removal check or the destroy step fails
//            before any pending state has been touched.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  // purging an up osd would leave a running daemon with no map entry
  assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  // work on a copy of the (pending) crush map; it is only committed by
  // do_osd_crush_remove() at the very end, once nothing can fail.
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  // tracks whether every step so far was a no-op, so we can report
  // -ENOENT for a repeat purge of an already-gone osd.
  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal (mutates only our local copy).
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush: fine, possibly a replayed purge.
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        // osd vanished from the map: treat as idempotent, keep going.
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy actually did work, so this purge is not a replay.
      may_be_idempotent = false;
    }
  }
  assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: drop the osd from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  assert(0 == err);

  // step 4: commit the crush change we validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
6897
6898 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
6899 map<string,cmd_vartype> &cmdmap)
6900 {
6901 op->mark_osdmon_event(__func__);
6902 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
6903 bool ret = false;
6904 stringstream ss;
6905 string rs;
6906 bufferlist rdata;
6907 int err = 0;
6908
6909 string format;
6910 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
6911 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6912
6913 string prefix;
6914 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
6915
6916 int64_t osdid;
6917 string name;
6918 bool osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
6919 if (osdid_present) {
6920 ostringstream oss;
6921 oss << "osd." << osdid;
6922 name = oss.str();
6923 }
6924
6925 // Even if there's a pending state with changes that could affect
6926 // a command, considering that said state isn't yet committed, we
6927 // just don't care about those changes if the command currently being
6928 // handled acts as a no-op against the current committed state.
6929 // In a nutshell, we assume this command happens *before*.
6930 //
6931 // Let me make this clearer:
6932 //
6933 // - If we have only one client, and that client issues some
6934 // operation that would conflict with this operation but is
6935 // still on the pending state, then we would be sure that said
6936 // operation wouldn't have returned yet, so the client wouldn't
6937 // issue this operation (unless the client didn't wait for the
6938 // operation to finish, and that would be the client's own fault).
6939 //
6940 // - If we have more than one client, each client will observe
6941 // whatever is the state at the moment of the commit. So, if we
6942 // have two clients, one issuing an unlink and another issuing a
6943 // link, and if the link happens while the unlink is still on the
6944 // pending state, from the link's point-of-view this is a no-op.
6945 // If different clients are issuing conflicting operations and
6946 // they care about that, then the clients should make sure they
6947 // enforce some kind of concurrency mechanism -- from our
6948 // perspective that's what Douglas Adams would call an SEP.
6949 //
6950 // This should be used as a general guideline for most commands handled
6951 // in this function. Adapt as you see fit, but please bear in mind that
6952 // this is the expected behavior.
6953
6954
6955 if (prefix == "osd setcrushmap" ||
6956 (prefix == "osd crush set" && !osdid_present)) {
6957 if (pending_inc.crush.length()) {
6958 dout(10) << __func__ << " waiting for pending crush update " << dendl;
6959 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
6960 return true;
6961 }
6962 dout(10) << "prepare_command setting new crush map" << dendl;
6963 bufferlist data(m->get_data());
6964 CrushWrapper crush;
6965 try {
6966 bufferlist::iterator bl(data.begin());
6967 crush.decode(bl);
6968 }
6969 catch (const std::exception &e) {
6970 err = -EINVAL;
6971 ss << "Failed to parse crushmap: " << e.what();
6972 goto reply;
6973 }
6974
6975 int64_t prior_version = 0;
6976 if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
6977 if (prior_version == osdmap.get_crush_version() - 1) {
6978 // see if we are a resend of the last update. this is imperfect
6979 // (multiple racing updaters may not both get reliable success)
6980 // but we expect crush updaters (via this interface) to be rare-ish.
6981 bufferlist current, proposed;
6982 osdmap.crush->encode(current, mon->get_quorum_con_features());
6983 crush.encode(proposed, mon->get_quorum_con_features());
6984 if (current.contents_equal(proposed)) {
6985 dout(10) << __func__
6986 << " proposed matches current and version equals previous"
6987 << dendl;
6988 err = 0;
6989 ss << osdmap.get_crush_version();
6990 goto reply;
6991 }
6992 }
6993 if (prior_version != osdmap.get_crush_version()) {
6994 err = -EPERM;
6995 ss << "prior_version " << prior_version << " != crush version "
6996 << osdmap.get_crush_version();
6997 goto reply;
6998 }
6999 }
7000
7001 if (crush.has_legacy_rulesets()) {
7002 err = -EINVAL;
7003 ss << "crush maps with ruleset != ruleid are no longer allowed";
7004 goto reply;
7005 }
7006 if (!validate_crush_against_features(&crush, ss)) {
7007 err = -EINVAL;
7008 goto reply;
7009 }
7010
7011 const auto& osdmap_pools = osdmap.get_pools();
7012 for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
7013 const int64_t pool_id = pit->first;
7014 const pg_pool_t &pool = pit->second;
7015 int ruleno = pool.get_crush_rule();
7016 if (!crush.rule_exists(ruleno)) {
7017 ss << " the crush rule no "<< ruleno << " for pool id " << pool_id << " is in use";
7018 err = -EINVAL;
7019 goto reply;
7020 }
7021 }
7022
7023 // sanity check: test some inputs to make sure this map isn't totally broken
7024 dout(10) << " testing map" << dendl;
7025 stringstream ess;
7026 CrushTester tester(crush, ess);
7027 // XXX: Use mon_lease as a timeout value for crushtool.
7028 // If the crushtool consistently takes longer than 'mon_lease' seconds,
7029 // then we would consistently trigger an election before the command
7030 // finishes, having a flapping monitor unable to hold quorum.
7031 int r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
7032 osdmap.get_max_osd(),
7033 g_conf->mon_lease);
7034 if (r < 0) {
7035 derr << "error on crush map: " << ess.str() << dendl;
7036 ss << "Failed crushmap test: " << ess.str();
7037 err = r;
7038 goto reply;
7039 }
7040
7041 dout(10) << " result " << ess.str() << dendl;
7042
7043 pending_inc.crush = data;
7044 ss << osdmap.get_crush_version() + 1;
7045 goto update;
7046
7047 } else if (prefix == "osd crush set-device-class") {
7048 if (!osdmap.exists(osdid)) {
7049 err = -ENOENT;
7050 ss << name << " does not exist. create it before updating the crush map";
7051 goto reply;
7052 }
7053
7054 string device_class;
7055 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7056 err = -EINVAL; // no value!
7057 goto reply;
7058 }
7059
7060 CrushWrapper newcrush;
7061 _get_pending_crush(newcrush);
7062
7063 string action;
7064 if (newcrush.item_exists(osdid)) {
7065 action = "updating";
7066 } else {
7067 action = "creating";
7068 newcrush.set_item_name(osdid, name);
7069 }
7070
7071 dout(5) << action << " crush item id " << osdid << " name '"
7072 << name << "' device_class " << device_class << dendl;
7073 err = newcrush.update_device_class(g_ceph_context, osdid, device_class, name);
7074
7075 if (err < 0)
7076 goto reply;
7077
7078 if (err == 0 && !_have_pending_crush()) {
7079 ss << "set-device-class item id " << osdid << " name '" << name << "' device_class "
7080 << device_class << " : no change";
7081 goto reply;
7082 }
7083
7084 pending_inc.crush.clear();
7085 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7086 ss << "set-device-class item id " << osdid << " name '" << name << "' device_class "
7087 << device_class;
7088 getline(ss, rs);
7089 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7090 get_last_committed() + 1));
7091 return true;
7092
7093 } else if (prefix == "osd crush add-bucket") {
7094 // osd crush add-bucket <name> <type>
7095 string name, typestr;
7096 cmd_getval(g_ceph_context, cmdmap, "name", name);
7097 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
7098
7099 if (!_have_pending_crush() &&
7100 _get_stable_crush().name_exists(name)) {
7101 ss << "bucket '" << name << "' already exists";
7102 goto reply;
7103 }
7104
7105 CrushWrapper newcrush;
7106 _get_pending_crush(newcrush);
7107
7108 if (newcrush.name_exists(name)) {
7109 ss << "bucket '" << name << "' already exists";
7110 goto update;
7111 }
7112 int type = newcrush.get_type_id(typestr);
7113 if (type < 0) {
7114 ss << "type '" << typestr << "' does not exist";
7115 err = -EINVAL;
7116 goto reply;
7117 }
7118 if (type == 0) {
7119 ss << "type '" << typestr << "' is for devices, not buckets";
7120 err = -EINVAL;
7121 goto reply;
7122 }
7123 int bucketno;
7124 err = newcrush.add_bucket(0, 0,
7125 CRUSH_HASH_DEFAULT, type, 0, NULL,
7126 NULL, &bucketno);
7127 if (err < 0) {
7128 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
7129 goto reply;
7130 }
7131 err = newcrush.set_item_name(bucketno, name);
7132 if (err < 0) {
7133 ss << "error setting bucket name to '" << name << "'";
7134 goto reply;
7135 }
7136
7137 pending_inc.crush.clear();
7138 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7139 ss << "added bucket " << name << " type " << typestr
7140 << " to crush map";
7141 goto update;
7142 } else if (prefix == "osd crush rename-bucket") {
7143 string srcname, dstname;
7144 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
7145 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
7146
7147 err = crush_rename_bucket(srcname, dstname, &ss);
7148 if (err == -EALREADY) // equivalent to success for idempotency
7149 err = 0;
7150 if (err)
7151 goto reply;
7152 else
7153 goto update;
7154 } else if (prefix == "osd crush class create") {
7155 string device_class;
7156 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7157 err = -EINVAL; // no value!
7158 goto reply;
7159 }
7160
7161 if (!_have_pending_crush() &&
7162 _get_stable_crush().class_exists(device_class)) {
7163 ss << "class '" << device_class << "' already exists";
7164 goto reply;
7165 }
7166
7167 CrushWrapper newcrush;
7168 _get_pending_crush(newcrush);
7169
7170 if (newcrush.class_exists(name)) {
7171 ss << "class '" << device_class << "' already exists";
7172 goto update;
7173 }
7174
7175 int class_id = newcrush.get_or_create_class_id(device_class);
7176
7177 pending_inc.crush.clear();
7178 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7179 ss << "created class " << device_class << " with id " << class_id
7180 << " to crush map";
7181 goto update;
7182
7183 } else if (prefix == "osd crush class rm") {
7184 string device_class;
7185 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7186 err = -EINVAL; // no value!
7187 goto reply;
7188 }
7189
7190 CrushWrapper newcrush;
7191 _get_pending_crush(newcrush);
7192
7193 if (!newcrush.class_exists(device_class)) {
7194 err = -ENOENT;
7195 ss << "class '" << device_class << "' does not exist";
7196 goto reply;
7197 }
7198
7199 int class_id = newcrush.get_class_id(device_class);
7200
7201 if (newcrush.class_is_in_use(class_id)) {
7202 err = -EBUSY;
7203 ss << "class '" << device_class << "' is in use";
7204 goto reply;
7205 }
7206
7207 err = newcrush.remove_class_name(device_class);
7208 if (err < 0) {
7209 ss << "class '" << device_class << "' cannot be removed '"
7210 << cpp_strerror(err) << "'";
7211 goto reply;
7212 }
7213
7214 pending_inc.crush.clear();
7215 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7216 ss << "removed class " << device_class << " with id " << class_id
7217 << " from crush map";
7218 goto update;
7219
7220 } else if (osdid_present &&
7221 (prefix == "osd crush set" || prefix == "osd crush add")) {
7222 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
7223 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
7224 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
7225
7226 if (!osdmap.exists(osdid)) {
7227 err = -ENOENT;
7228 ss << name << " does not exist. create it before updating the crush map";
7229 goto reply;
7230 }
7231
7232 double weight;
7233 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7234 ss << "unable to parse weight value '"
7235 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7236 err = -EINVAL;
7237 goto reply;
7238 }
7239
7240 string args;
7241 vector<string> argvec;
7242 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7243 map<string,string> loc;
7244 CrushWrapper::parse_loc_map(argvec, &loc);
7245
7246 if (prefix == "osd crush set"
7247 && !_get_stable_crush().item_exists(osdid)) {
7248 err = -ENOENT;
7249 ss << "unable to set item id " << osdid << " name '" << name
7250 << "' weight " << weight << " at location " << loc
7251 << ": does not exist";
7252 goto reply;
7253 }
7254
7255 dout(5) << "adding/updating crush item id " << osdid << " name '"
7256 << name << "' weight " << weight << " at location "
7257 << loc << dendl;
7258 CrushWrapper newcrush;
7259 _get_pending_crush(newcrush);
7260
7261 string action;
7262 if (prefix == "osd crush set" ||
7263 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
7264 action = "set";
7265 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
7266 } else {
7267 action = "add";
7268 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
7269 if (err == 0)
7270 err = 1;
7271 }
7272
7273 if (err < 0)
7274 goto reply;
7275
7276 if (err == 0 && !_have_pending_crush()) {
7277 ss << action << " item id " << osdid << " name '" << name << "' weight "
7278 << weight << " at location " << loc << ": no change";
7279 goto reply;
7280 }
7281
7282 pending_inc.crush.clear();
7283 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7284 ss << action << " item id " << osdid << " name '" << name << "' weight "
7285 << weight << " at location " << loc << " to crush map";
7286 getline(ss, rs);
7287 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7288 get_last_committed() + 1));
7289 return true;
7290
7291 } else if (prefix == "osd crush create-or-move") {
7292 do {
7293 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
7294 if (!osdmap.exists(osdid)) {
7295 err = -ENOENT;
7296 ss << name << " does not exist. create it before updating the crush map";
7297 goto reply;
7298 }
7299
7300 double weight;
7301 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7302 ss << "unable to parse weight value '"
7303 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7304 err = -EINVAL;
7305 goto reply;
7306 }
7307
7308 string args;
7309 vector<string> argvec;
7310 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7311 map<string,string> loc;
7312 CrushWrapper::parse_loc_map(argvec, &loc);
7313
7314 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
7315 << " at location " << loc << dendl;
7316
7317 CrushWrapper newcrush;
7318 _get_pending_crush(newcrush);
7319
7320 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
7321 if (err == 0) {
7322 ss << "create-or-move updated item name '" << name << "' weight " << weight
7323 << " at location " << loc << " to crush map";
7324 break;
7325 }
7326 if (err > 0) {
7327 pending_inc.crush.clear();
7328 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7329 ss << "create-or-move updating item name '" << name << "' weight " << weight
7330 << " at location " << loc << " to crush map";
7331 getline(ss, rs);
7332 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7333 get_last_committed() + 1));
7334 return true;
7335 }
7336 } while (false);
7337
7338 } else if (prefix == "osd crush move") {
7339 do {
7340 // osd crush move <name> <loc1> [<loc2> ...]
7341
7342 string args;
7343 vector<string> argvec;
7344 cmd_getval(g_ceph_context, cmdmap, "name", name);
7345 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7346 map<string,string> loc;
7347 CrushWrapper::parse_loc_map(argvec, &loc);
7348
7349 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
7350 CrushWrapper newcrush;
7351 _get_pending_crush(newcrush);
7352
7353 if (!newcrush.name_exists(name)) {
7354 err = -ENOENT;
7355 ss << "item " << name << " does not exist";
7356 break;
7357 }
7358 int id = newcrush.get_item_id(name);
7359
7360 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
7361 if (id >= 0) {
7362 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
7363 } else {
7364 err = newcrush.move_bucket(g_ceph_context, id, loc);
7365 }
7366 if (err >= 0) {
7367 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7368 pending_inc.crush.clear();
7369 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7370 getline(ss, rs);
7371 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7372 get_last_committed() + 1));
7373 return true;
7374 }
7375 } else {
7376 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7377 err = 0;
7378 }
7379 } while (false);
7380 } else if (prefix == "osd crush swap-bucket") {
7381 string source, dest, force;
7382 cmd_getval(g_ceph_context, cmdmap, "source", source);
7383 cmd_getval(g_ceph_context, cmdmap, "dest", dest);
7384 cmd_getval(g_ceph_context, cmdmap, "force", force);
7385 CrushWrapper newcrush;
7386 _get_pending_crush(newcrush);
7387 if (!newcrush.name_exists(source)) {
7388 ss << "source item " << source << " does not exist";
7389 err = -ENOENT;
7390 goto reply;
7391 }
7392 if (!newcrush.name_exists(dest)) {
7393 ss << "dest item " << dest << " does not exist";
7394 err = -ENOENT;
7395 goto reply;
7396 }
7397 int sid = newcrush.get_item_id(source);
7398 int did = newcrush.get_item_id(dest);
7399 int sparent;
7400 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
7401 force != "--yes-i-really-mean-it") {
7402 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
7403 err = -EPERM;
7404 goto reply;
7405 }
7406 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
7407 force != "--yes-i-really-mean-it") {
7408 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
7409 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
7410 << "; pass --yes-i-really-mean-it to proceed anyway";
7411 err = -EPERM;
7412 goto reply;
7413 }
7414 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
7415 if (r < 0) {
7416 ss << "failed to swap bucket contents: " << cpp_strerror(r);
7417 goto reply;
7418 }
7419 ss << "swapped bucket of " << source << " to " << dest;
7420 pending_inc.crush.clear();
7421 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7422 wait_for_finished_proposal(op,
7423 new Monitor::C_Command(mon, op, err, ss.str(),
7424 get_last_committed() + 1));
7425 return true;
7426 } else if (prefix == "osd crush link") {
7427 // osd crush link <name> <loc1> [<loc2> ...]
7428 string name;
7429 cmd_getval(g_ceph_context, cmdmap, "name", name);
7430 vector<string> argvec;
7431 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7432 map<string,string> loc;
7433 CrushWrapper::parse_loc_map(argvec, &loc);
7434
7435 // Need an explicit check for name_exists because get_item_id returns
7436 // 0 on unfound.
7437 int id = osdmap.crush->get_item_id(name);
7438 if (!osdmap.crush->name_exists(name)) {
7439 err = -ENOENT;
7440 ss << "item " << name << " does not exist";
7441 goto reply;
7442 } else {
7443 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
7444 }
7445 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
7446 ss << "no need to move item id " << id << " name '" << name
7447 << "' to location " << loc << " in crush map";
7448 err = 0;
7449 goto reply;
7450 }
7451
7452 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
7453 CrushWrapper newcrush;
7454 _get_pending_crush(newcrush);
7455
7456 if (!newcrush.name_exists(name)) {
7457 err = -ENOENT;
7458 ss << "item " << name << " does not exist";
7459 goto reply;
7460 } else {
7461 int id = newcrush.get_item_id(name);
7462 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
7463 err = newcrush.link_bucket(g_ceph_context, id, loc);
7464 if (err >= 0) {
7465 ss << "linked item id " << id << " name '" << name
7466 << "' to location " << loc << " in crush map";
7467 pending_inc.crush.clear();
7468 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7469 } else {
7470 ss << "cannot link item id " << id << " name '" << name
7471 << "' to location " << loc;
7472 goto reply;
7473 }
7474 } else {
7475 ss << "no need to move item id " << id << " name '" << name
7476 << "' to location " << loc << " in crush map";
7477 err = 0;
7478 }
7479 }
7480 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
7481 get_last_committed() + 1));
7482 return true;
7483 } else if (prefix == "osd crush rm" ||
7484 prefix == "osd crush remove" ||
7485 prefix == "osd crush unlink") {
7486 do {
7487 // osd crush rm <id> [ancestor]
7488 CrushWrapper newcrush;
7489 _get_pending_crush(newcrush);
7490
7491 string name;
7492 cmd_getval(g_ceph_context, cmdmap, "name", name);
7493
7494 if (!osdmap.crush->name_exists(name)) {
7495 err = 0;
7496 ss << "device '" << name << "' does not appear in the crush map";
7497 break;
7498 }
7499 if (!newcrush.name_exists(name)) {
7500 err = 0;
7501 ss << "device '" << name << "' does not appear in the crush map";
7502 getline(ss, rs);
7503 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7504 get_last_committed() + 1));
7505 return true;
7506 }
7507 int id = newcrush.get_item_id(name);
7508 int ancestor = 0;
7509
7510 bool unlink_only = prefix == "osd crush unlink";
7511 string ancestor_str;
7512 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
7513 if (!newcrush.name_exists(ancestor_str)) {
7514 err = -ENOENT;
7515 ss << "ancestor item '" << ancestor_str
7516 << "' does not appear in the crush map";
7517 break;
7518 }
7519 ancestor = newcrush.get_item_id(ancestor_str);
7520 }
7521
7522 err = prepare_command_osd_crush_remove(
7523 newcrush,
7524 id, ancestor,
7525 (ancestor < 0), unlink_only);
7526
7527 if (err == -ENOENT) {
7528 ss << "item " << id << " does not appear in that position";
7529 err = 0;
7530 break;
7531 }
7532 if (err == 0) {
7533 ss << "removed item id " << id << " name '" << name << "' from crush map";
7534 getline(ss, rs);
7535 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7536 get_last_committed() + 1));
7537 return true;
7538 }
7539 } while (false);
7540
7541 } else if (prefix == "osd crush reweight-all") {
7542 // osd crush reweight-all
7543 CrushWrapper newcrush;
7544 _get_pending_crush(newcrush);
7545
7546 newcrush.reweight(g_ceph_context);
7547 pending_inc.crush.clear();
7548 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7549 ss << "reweighted crush hierarchy";
7550 getline(ss, rs);
7551 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7552 get_last_committed() + 1));
7553 return true;
7554 } else if (prefix == "osd crush reweight") {
7555 // osd crush reweight <name> <weight>
7556 CrushWrapper newcrush;
7557 _get_pending_crush(newcrush);
7558
7559 string name;
7560 cmd_getval(g_ceph_context, cmdmap, "name", name);
7561 if (!newcrush.name_exists(name)) {
7562 err = -ENOENT;
7563 ss << "device '" << name << "' does not appear in the crush map";
7564 goto reply;
7565 }
7566
7567 int id = newcrush.get_item_id(name);
7568 if (id < 0) {
7569 ss << "device '" << name << "' is not a leaf in the crush map";
7570 err = -EINVAL;
7571 goto reply;
7572 }
7573 double w;
7574 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
7575 ss << "unable to parse weight value '"
7576 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7577 err = -EINVAL;
7578 goto reply;
7579 }
7580
7581 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
7582 if (err < 0)
7583 goto reply;
7584 pending_inc.crush.clear();
7585 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7586 ss << "reweighted item id " << id << " name '" << name << "' to " << w
7587 << " in crush map";
7588 getline(ss, rs);
7589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7590 get_last_committed() + 1));
7591 return true;
7592 } else if (prefix == "osd crush reweight-subtree") {
7593 // osd crush reweight-subtree <name> <weight>
7594 CrushWrapper newcrush;
7595 _get_pending_crush(newcrush);
7596
7597 string name;
7598 cmd_getval(g_ceph_context, cmdmap, "name", name);
7599 if (!newcrush.name_exists(name)) {
7600 err = -ENOENT;
7601 ss << "device '" << name << "' does not appear in the crush map";
7602 goto reply;
7603 }
7604
7605 int id = newcrush.get_item_id(name);
7606 if (id >= 0) {
7607 ss << "device '" << name << "' is not a subtree in the crush map";
7608 err = -EINVAL;
7609 goto reply;
7610 }
7611 double w;
7612 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
7613 ss << "unable to parse weight value '"
7614 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7615 err = -EINVAL;
7616 goto reply;
7617 }
7618
7619 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
7620 if (err < 0)
7621 goto reply;
7622 pending_inc.crush.clear();
7623 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7624 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
7625 << " in crush map";
7626 getline(ss, rs);
7627 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7628 get_last_committed() + 1));
7629 return true;
7630 } else if (prefix == "osd crush tunables") {
7631 CrushWrapper newcrush;
7632 _get_pending_crush(newcrush);
7633
7634 err = 0;
7635 string profile;
7636 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
7637 if (profile == "legacy" || profile == "argonaut") {
7638 newcrush.set_tunables_legacy();
7639 } else if (profile == "bobtail") {
7640 newcrush.set_tunables_bobtail();
7641 } else if (profile == "firefly") {
7642 newcrush.set_tunables_firefly();
7643 } else if (profile == "hammer") {
7644 newcrush.set_tunables_hammer();
7645 } else if (profile == "jewel") {
7646 newcrush.set_tunables_jewel();
7647 } else if (profile == "optimal") {
7648 newcrush.set_tunables_optimal();
7649 } else if (profile == "default") {
7650 newcrush.set_tunables_default();
7651 } else {
7652 ss << "unrecognized profile '" << profile << "'";
7653 err = -EINVAL;
7654 goto reply;
7655 }
7656
7657 if (!validate_crush_against_features(&newcrush, ss)) {
7658 err = -EINVAL;
7659 goto reply;
7660 }
7661
7662 pending_inc.crush.clear();
7663 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7664 ss << "adjusted tunables profile to " << profile;
7665 getline(ss, rs);
7666 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7667 get_last_committed() + 1));
7668 return true;
7669 } else if (prefix == "osd crush set-tunable") {
7670 CrushWrapper newcrush;
7671 _get_pending_crush(newcrush);
7672
7673 err = 0;
7674 string tunable;
7675 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
7676
7677 int64_t value = -1;
7678 if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
7679 err = -EINVAL;
7680 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
7681 goto reply;
7682 }
7683
7684 if (tunable == "straw_calc_version") {
7685 if (value < 0 || value > 1) {
7686 ss << "value must be 0 or 1; got " << value;
7687 err = -EINVAL;
7688 goto reply;
7689 }
7690 newcrush.set_straw_calc_version(value);
7691 } else {
7692 ss << "unrecognized tunable '" << tunable << "'";
7693 err = -EINVAL;
7694 goto reply;
7695 }
7696
7697 if (!validate_crush_against_features(&newcrush, ss)) {
7698 err = -EINVAL;
7699 goto reply;
7700 }
7701
7702 pending_inc.crush.clear();
7703 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7704 ss << "adjusted tunable " << tunable << " to " << value;
7705 getline(ss, rs);
7706 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7707 get_last_committed() + 1));
7708 return true;
7709
7710 } else if (prefix == "osd crush rule create-simple") {
7711 string name, root, type, mode;
7712 cmd_getval(g_ceph_context, cmdmap, "name", name);
7713 cmd_getval(g_ceph_context, cmdmap, "root", root);
7714 cmd_getval(g_ceph_context, cmdmap, "type", type);
7715 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
7716 if (mode == "")
7717 mode = "firstn";
7718
7719 if (osdmap.crush->rule_exists(name)) {
7720 // The name is uniquely associated to a ruleid and the rule it contains
7721 // From the user point of view, the rule is more meaningful.
7722 ss << "rule " << name << " already exists";
7723 err = 0;
7724 goto reply;
7725 }
7726
7727 CrushWrapper newcrush;
7728 _get_pending_crush(newcrush);
7729
7730 if (newcrush.rule_exists(name)) {
7731 // The name is uniquely associated to a ruleid and the rule it contains
7732 // From the user point of view, the rule is more meaningful.
7733 ss << "rule " << name << " already exists";
7734 err = 0;
7735 } else {
7736 int ruleno = newcrush.add_simple_rule(name, root, type, mode,
7737 pg_pool_t::TYPE_REPLICATED, &ss);
7738 if (ruleno < 0) {
7739 err = ruleno;
7740 goto reply;
7741 }
7742
7743 pending_inc.crush.clear();
7744 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7745 }
7746 getline(ss, rs);
7747 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7748 get_last_committed() + 1));
7749 return true;
7750
7751 } else if (prefix == "osd erasure-code-profile rm") {
7752 string name;
7753 cmd_getval(g_ceph_context, cmdmap, "name", name);
7754
7755 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
7756 goto wait;
7757
7758 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
7759 err = -EBUSY;
7760 goto reply;
7761 }
7762
7763 if (osdmap.has_erasure_code_profile(name) ||
7764 pending_inc.new_erasure_code_profiles.count(name)) {
7765 if (osdmap.has_erasure_code_profile(name)) {
7766 pending_inc.old_erasure_code_profiles.push_back(name);
7767 } else {
7768 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
7769 pending_inc.new_erasure_code_profiles.erase(name);
7770 }
7771
7772 getline(ss, rs);
7773 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7774 get_last_committed() + 1));
7775 return true;
7776 } else {
7777 ss << "erasure-code-profile " << name << " does not exist";
7778 err = 0;
7779 goto reply;
7780 }
7781
7782 } else if (prefix == "osd erasure-code-profile set") {
7783 string name;
7784 cmd_getval(g_ceph_context, cmdmap, "name", name);
7785 vector<string> profile;
7786 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
7787 bool force;
7788 if (profile.size() > 0 && profile.back() == "--force") {
7789 profile.pop_back();
7790 force = true;
7791 } else {
7792 force = false;
7793 }
7794 map<string,string> profile_map;
7795 err = parse_erasure_code_profile(profile, &profile_map, &ss);
7796 if (err)
7797 goto reply;
7798 if (profile_map.find("plugin") == profile_map.end()) {
7799 ss << "erasure-code-profile " << profile_map
7800 << " must contain a plugin entry" << std::endl;
7801 err = -EINVAL;
7802 goto reply;
7803 }
7804 string plugin = profile_map["plugin"];
7805
7806 if (pending_inc.has_erasure_code_profile(name)) {
7807 dout(20) << "erasure code profile " << name << " try again" << dendl;
7808 goto wait;
7809 } else {
7810 if (plugin == "isa" || plugin == "lrc") {
7811 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
7812 if (err == -EAGAIN)
7813 goto wait;
7814 if (err)
7815 goto reply;
7816 } else if (plugin == "shec") {
7817 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
7818 if (err == -EAGAIN)
7819 goto wait;
7820 if (err)
7821 goto reply;
7822 }
7823 err = normalize_profile(name, profile_map, force, &ss);
7824 if (err)
7825 goto reply;
7826
7827 if (osdmap.has_erasure_code_profile(name)) {
7828 ErasureCodeProfile existing_profile_map =
7829 osdmap.get_erasure_code_profile(name);
7830 err = normalize_profile(name, existing_profile_map, force, &ss);
7831 if (err)
7832 goto reply;
7833
7834 if (existing_profile_map == profile_map) {
7835 err = 0;
7836 goto reply;
7837 }
7838 if (!force) {
7839 err = -EPERM;
7840 ss << "will not override erasure code profile " << name
7841 << " because the existing profile "
7842 << existing_profile_map
7843 << " is different from the proposed profile "
7844 << profile_map;
7845 goto reply;
7846 }
7847 }
7848
7849 dout(20) << "erasure code profile set " << name << "="
7850 << profile_map << dendl;
7851 pending_inc.set_erasure_code_profile(name, profile_map);
7852 }
7853
7854 getline(ss, rs);
7855 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7856 get_last_committed() + 1));
7857 return true;
7858
7859 } else if (prefix == "osd crush rule create-erasure") {
7860 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
7861 if (err == -EAGAIN)
7862 goto wait;
7863 if (err)
7864 goto reply;
7865 string name, poolstr;
7866 cmd_getval(g_ceph_context, cmdmap, "name", name);
7867 string profile;
7868 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
7869 if (profile == "")
7870 profile = "default";
7871 if (profile == "default") {
7872 if (!osdmap.has_erasure_code_profile(profile)) {
7873 if (pending_inc.has_erasure_code_profile(profile)) {
7874 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
7875 goto wait;
7876 }
7877
7878 map<string,string> profile_map;
7879 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
7880 profile_map,
7881 &ss);
7882 if (err)
7883 goto reply;
7884 err = normalize_profile(name, profile_map, true, &ss);
7885 if (err)
7886 goto reply;
7887 dout(20) << "erasure code profile set " << profile << "="
7888 << profile_map << dendl;
7889 pending_inc.set_erasure_code_profile(profile, profile_map);
7890 goto wait;
7891 }
7892 }
7893
7894 int rule;
7895 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7896 if (err < 0) {
7897 switch(err) {
7898 case -EEXIST: // return immediately
7899 ss << "rule " << name << " already exists";
7900 err = 0;
7901 goto reply;
7902 break;
7903 case -EALREADY: // wait for pending to be proposed
7904 ss << "rule " << name << " already exists";
7905 err = 0;
7906 break;
7907 default: // non-recoverable error
7908 goto reply;
7909 break;
7910 }
7911 } else {
7912 ss << "created rule " << name << " at " << rule;
7913 }
7914
7915 getline(ss, rs);
7916 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7917 get_last_committed() + 1));
7918 return true;
7919
7920 } else if (prefix == "osd crush rule rm") {
7921 string name;
7922 cmd_getval(g_ceph_context, cmdmap, "name", name);
7923
7924 if (!osdmap.crush->rule_exists(name)) {
7925 ss << "rule " << name << " does not exist";
7926 err = 0;
7927 goto reply;
7928 }
7929
7930 CrushWrapper newcrush;
7931 _get_pending_crush(newcrush);
7932
7933 if (!newcrush.rule_exists(name)) {
7934 ss << "rule " << name << " does not exist";
7935 err = 0;
7936 } else {
7937 int ruleno = newcrush.get_rule_id(name);
7938 assert(ruleno >= 0);
7939
7940 // make sure it is not in use.
7941 // FIXME: this is ok in some situations, but let's not bother with that
7942 // complexity now.
7943 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
7944 if (osdmap.crush_ruleset_in_use(ruleset)) {
7945 ss << "crush ruleset " << name << " " << ruleset << " is in use";
7946 err = -EBUSY;
7947 goto reply;
7948 }
7949
7950 err = newcrush.remove_rule(ruleno);
7951 if (err < 0) {
7952 goto reply;
7953 }
7954
7955 pending_inc.crush.clear();
7956 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7957 }
7958 getline(ss, rs);
7959 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7960 get_last_committed() + 1));
7961 return true;
7962
7963 } else if (prefix == "osd setmaxosd") {
7964 int64_t newmax;
7965 if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
7966 ss << "unable to parse 'newmax' value '"
7967 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
7968 err = -EINVAL;
7969 goto reply;
7970 }
7971
7972 if (newmax > g_conf->mon_max_osd) {
7973 err = -ERANGE;
7974 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
7975 << g_conf->mon_max_osd << ")";
7976 goto reply;
7977 }
7978
7979 // Don't allow shrinking OSD number as this will cause data loss
7980 // and may cause kernel crashes.
7981 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
7982 if (newmax < osdmap.get_max_osd()) {
7983 // Check if the OSDs exist between current max and new value.
7984 // If there are any OSDs exist, then don't allow shrinking number
7985 // of OSDs.
7986 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
7987 if (osdmap.exists(i)) {
7988 err = -EBUSY;
7989 ss << "cannot shrink max_osd to " << newmax
7990 << " because osd." << i << " (and possibly others) still in use";
7991 goto reply;
7992 }
7993 }
7994 }
7995
7996 pending_inc.new_max_osd = newmax;
7997 ss << "set new max_osd = " << pending_inc.new_max_osd;
7998 getline(ss, rs);
7999 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8000 get_last_committed() + 1));
8001 return true;
8002
8003 } else if (prefix == "osd set-full-ratio" ||
8004 prefix == "osd set-backfillfull-ratio" ||
8005 prefix == "osd set-nearfull-ratio") {
8006 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8007 ss << "you must complete the upgrade and set require_osd_release ="
8008 << "luminous before using the new interface";
8009 err = -EPERM;
8010 goto reply;
8011 }
8012 double n;
8013 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
8014 ss << "unable to parse 'ratio' value '"
8015 << cmd_vartype_stringify(cmdmap["who"]) << "'";
8016 err = -EINVAL;
8017 goto reply;
8018 }
8019 if (prefix == "osd set-full-ratio")
8020 pending_inc.new_full_ratio = n;
8021 else if (prefix == "osd set-backfillfull-ratio")
8022 pending_inc.new_backfillfull_ratio = n;
8023 else if (prefix == "osd set-nearfull-ratio")
8024 pending_inc.new_nearfull_ratio = n;
8025 ss << prefix << " " << n;
8026 getline(ss, rs);
8027 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8028 get_last_committed() + 1));
8029 return true;
8030 } else if (prefix == "osd set-require-min-compat-client") {
8031 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8032 ss << "you must complete the upgrade and set require_osd_release ="
8033 << "luminous before using the new interface";
8034 err = -EPERM;
8035 goto reply;
8036 }
8037 string v;
8038 cmd_getval(g_ceph_context, cmdmap, "version", v);
8039 int vno = ceph_release_from_name(v.c_str());
8040 if (vno <= 0) {
8041 ss << "version " << v << " is not recognized";
8042 err = -EINVAL;
8043 goto reply;
8044 }
8045 OSDMap newmap;
8046 newmap.deepish_copy_from(osdmap);
8047 newmap.apply_incremental(pending_inc);
8048 newmap.require_min_compat_client = vno;
8049 auto mvno = newmap.get_min_compat_client();
8050 if (vno < mvno) {
8051 ss << "osdmap current utilizes features that require "
8052 << ceph_release_name(mvno)
8053 << "; cannot set require_min_compat_client below that to "
8054 << ceph_release_name(vno);
8055 err = -EPERM;
8056 goto reply;
8057 }
8058 string sure;
8059 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
8060 if (sure != "--yes-i-really-mean-it") {
8061 FeatureMap m;
8062 mon->get_combined_feature_map(&m);
8063 uint64_t features = ceph_release_features(vno);
8064 bool first = true;
8065 bool ok = true;
8066 for (int type : {
8067 CEPH_ENTITY_TYPE_CLIENT,
8068 CEPH_ENTITY_TYPE_MDS,
8069 CEPH_ENTITY_TYPE_MGR }) {
8070 auto p = m.m.find(type);
8071 if (p == m.m.end()) {
8072 continue;
8073 }
8074 for (auto& q : p->second) {
8075 uint64_t missing = ~q.first & features;
8076 if (missing) {
8077 if (first) {
8078 ss << "cannot set require_min_compat_client to " << v << ": ";
8079 } else {
8080 ss << "; ";
8081 }
8082 first = false;
8083 ss << q.second << " connected " << ceph_entity_type_name(type)
8084 << "(s) look like " << ceph_release_name(
8085 ceph_release_from_features(q.first))
8086 << " (missing 0x" << std::hex << missing << std::dec << ")";
8087 ok = false;
8088 }
8089 }
8090 }
8091 if (!ok) {
8092 ss << "; add --yes-i-really-mean-it to do it anyway";
8093 err = -EPERM;
8094 goto reply;
8095 }
8096 }
8097 ss << "set require_min_compat_client to " << ceph_release_name(vno);
8098 pending_inc.new_require_min_compat_client = vno;
8099 getline(ss, rs);
8100 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8101 get_last_committed() + 1));
8102 return true;
8103 } else if (prefix == "osd pause") {
8104 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8105
8106 } else if (prefix == "osd unpause") {
8107 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8108
8109 } else if (prefix == "osd set") {
8110 string key;
8111 cmd_getval(g_ceph_context, cmdmap, "key", key);
8112 if (key == "full")
8113 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
8114 else if (key == "pause")
8115 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8116 else if (key == "noup")
8117 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
8118 else if (key == "nodown")
8119 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
8120 else if (key == "noout")
8121 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
8122 else if (key == "noin")
8123 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
8124 else if (key == "nobackfill")
8125 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
8126 else if (key == "norebalance")
8127 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
8128 else if (key == "norecover")
8129 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
8130 else if (key == "noscrub")
8131 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
8132 else if (key == "nodeep-scrub")
8133 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
8134 else if (key == "notieragent")
8135 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
8136 else if (key == "sortbitwise") {
8137 if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
8138 return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
8139 } else {
8140 ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
8141 err = -EPERM;
8142 goto reply;
8143 }
8144 } else if (key == "require_jewel_osds") {
8145 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8146 ss << "the sortbitwise flag must be set before require_jewel_osds";
8147 err = -EPERM;
8148 goto reply;
8149 } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
8150 ss << "require_osd_release is already >= jewel";
8151 err = 0;
8152 goto reply;
8153 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)) {
8154 return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
8155 } else {
8156 ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
8157 err = -EPERM;
8158 }
8159 } else if (key == "require_kraken_osds") {
8160 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8161 ss << "the sortbitwise flag must be set before require_kraken_osds";
8162 err = -EPERM;
8163 goto reply;
8164 } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
8165 ss << "require_osd_release is already >= kraken";
8166 err = 0;
8167 goto reply;
8168 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)) {
8169 bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
8170 // ensure JEWEL is also set
8171 pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
8172 return r;
8173 } else {
8174 ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
8175 err = -EPERM;
8176 }
8177 } else {
8178 ss << "unrecognized flag '" << key << "'";
8179 err = -EINVAL;
8180 }
8181
8182 } else if (prefix == "osd unset") {
8183 string key;
8184 cmd_getval(g_ceph_context, cmdmap, "key", key);
8185 if (key == "full")
8186 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
8187 else if (key == "pause")
8188 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8189 else if (key == "noup")
8190 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
8191 else if (key == "nodown")
8192 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
8193 else if (key == "noout")
8194 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
8195 else if (key == "noin")
8196 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
8197 else if (key == "nobackfill")
8198 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
8199 else if (key == "norebalance")
8200 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
8201 else if (key == "norecover")
8202 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
8203 else if (key == "noscrub")
8204 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
8205 else if (key == "nodeep-scrub")
8206 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
8207 else if (key == "notieragent")
8208 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
8209 else if (key == "sortbitwise") {
8210 ss << "the sortbitwise flag is required and cannot be unset";
8211 err = -EPERM;
8212 } else {
8213 ss << "unrecognized flag '" << key << "'";
8214 err = -EINVAL;
8215 }
8216
8217 } else if (prefix == "osd require-osd-release") {
8218 string release;
8219 cmd_getval(g_ceph_context, cmdmap, "release", release);
8220 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8221 ss << "the sortbitwise flag must be set first";
8222 err = -EPERM;
8223 goto reply;
8224 }
8225 int rel = ceph_release_from_name(release.c_str());
8226 if (rel <= 0) {
8227 ss << "unrecognized release " << release;
8228 err = -EINVAL;
8229 goto reply;
8230 }
8231 if (rel < CEPH_RELEASE_LUMINOUS) {
8232 ss << "use this command only for luminous and later";
8233 err = -EINVAL;
8234 goto reply;
8235 }
8236 if (rel == CEPH_RELEASE_LUMINOUS) {
8237 if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
8238 ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
8239 err = -EPERM;
8240 goto reply;
8241 }
8242 } else {
8243 ss << "not supported for this release yet";
8244 err = -EPERM;
8245 goto reply;
8246 }
8247 if (rel < osdmap.require_osd_release) {
8248 ss << "require_osd_release cannot be lowered once it has been set";
8249 err = -EPERM;
8250 goto reply;
8251 }
8252 pending_inc.new_require_osd_release = rel;
8253 goto update;
8254 } else if (prefix == "osd cluster_snap") {
8255 // ** DISABLE THIS FOR NOW **
8256 ss << "cluster snapshot currently disabled (broken implementation)";
8257 // ** DISABLE THIS FOR NOW **
8258
8259 } else if (prefix == "osd down" ||
8260 prefix == "osd out" ||
8261 prefix == "osd in" ||
8262 prefix == "osd rm") {
8263
8264 bool any = false;
8265 bool stop = false;
8266 bool verbose = true;
8267
8268 vector<string> idvec;
8269 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
8270 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8271 set<int> osds;
8272
8273 // wildcard?
8274 if (j == 0 &&
8275 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8276 if (prefix == "osd in") {
8277 // touch out osds only
8278 osdmap.get_out_osds(osds);
8279 } else {
8280 osdmap.get_all_osds(osds);
8281 }
8282 stop = true;
8283 verbose = false; // so the output is less noisy.
8284 } else {
8285 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8286 if (osd < 0) {
8287 ss << "invalid osd id" << osd;
8288 err = -EINVAL;
8289 continue;
8290 } else if (!osdmap.exists(osd)) {
8291 ss << "osd." << osd << " does not exist. ";
8292 continue;
8293 }
8294
8295 osds.insert(osd);
8296 }
8297
8298 for (auto &osd : osds) {
8299 if (prefix == "osd down") {
8300 if (osdmap.is_down(osd)) {
8301 if (verbose)
8302 ss << "osd." << osd << " is already down. ";
8303 } else {
8304 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
8305 ss << "marked down osd." << osd << ". ";
8306 any = true;
8307 }
8308 } else if (prefix == "osd out") {
8309 if (osdmap.is_out(osd)) {
8310 if (verbose)
8311 ss << "osd." << osd << " is already out. ";
8312 } else {
8313 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
8314 if (osdmap.osd_weight[osd]) {
8315 if (pending_inc.new_xinfo.count(osd) == 0) {
8316 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
8317 }
8318 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
8319 }
8320 ss << "marked out osd." << osd << ". ";
8321 any = true;
8322 }
8323 } else if (prefix == "osd in") {
8324 if (osdmap.is_in(osd)) {
8325 if (verbose)
8326 ss << "osd." << osd << " is already in. ";
8327 } else {
8328 if (osdmap.osd_xinfo[osd].old_weight > 0) {
8329 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
8330 if (pending_inc.new_xinfo.count(osd) == 0) {
8331 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
8332 }
8333 pending_inc.new_xinfo[osd].old_weight = 0;
8334 } else {
8335 pending_inc.new_weight[osd] = CEPH_OSD_IN;
8336 }
8337 ss << "marked in osd." << osd << ". ";
8338 any = true;
8339 }
8340 } else if (prefix == "osd rm") {
8341 err = prepare_command_osd_remove(osd);
8342
8343 if (err == -EBUSY) {
8344 if (any)
8345 ss << ", ";
8346 ss << "osd." << osd << " is still up; must be down before removal. ";
8347 } else {
8348 assert(err == 0);
8349 if (any) {
8350 ss << ", osd." << osd;
8351 } else {
8352 ss << "removed osd." << osd;
8353 }
8354 any = true;
8355 }
8356 }
8357 }
8358 }
8359 if (any) {
8360 getline(ss, rs);
8361 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
8362 get_last_committed() + 1));
8363 return true;
8364 }
8365 } else if (prefix == "osd add-noup" ||
8366 prefix == "osd add-nodown" ||
8367 prefix == "osd add-noin" ||
8368 prefix == "osd add-noout") {
8369
8370 enum {
8371 OP_NOUP,
8372 OP_NODOWN,
8373 OP_NOIN,
8374 OP_NOOUT,
8375 } option;
8376
8377 if (prefix == "osd add-noup") {
8378 option = OP_NOUP;
8379 } else if (prefix == "osd add-nodown") {
8380 option = OP_NODOWN;
8381 } else if (prefix == "osd add-noin") {
8382 option = OP_NOIN;
8383 } else {
8384 option = OP_NOOUT;
8385 }
8386
8387 bool any = false;
8388 bool stop = false;
8389
8390 vector<string> idvec;
8391 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
8392 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8393
8394 set<int> osds;
8395
8396 // wildcard?
8397 if (j == 0 &&
8398 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8399 osdmap.get_all_osds(osds);
8400 stop = true;
8401 } else {
8402 // try traditional single osd way
8403
8404 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8405 if (osd < 0) {
8406 // ss has reason for failure
8407 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8408 err = -EINVAL;
8409 continue;
8410 }
8411
8412 osds.insert(osd);
8413 }
8414
8415 for (auto &osd : osds) {
8416
8417 if (!osdmap.exists(osd)) {
8418 ss << "osd." << osd << " does not exist. ";
8419 continue;
8420 }
8421
8422 switch (option) {
8423 case OP_NOUP:
8424 if (osdmap.is_up(osd)) {
8425 ss << "osd." << osd << " is already up. ";
8426 continue;
8427 }
8428
8429 if (osdmap.is_noup(osd)) {
8430 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
8431 any = true;
8432 } else {
8433 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
8434 any = true;
8435 }
8436
8437 break;
8438
8439 case OP_NODOWN:
8440 if (osdmap.is_down(osd)) {
8441 ss << "osd." << osd << " is already down. ";
8442 continue;
8443 }
8444
8445 if (osdmap.is_nodown(osd)) {
8446 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
8447 any = true;
8448 } else {
8449 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
8450 any = true;
8451 }
8452
8453 break;
8454
8455 case OP_NOIN:
8456 if (osdmap.is_in(osd)) {
8457 ss << "osd." << osd << " is already in. ";
8458 continue;
8459 }
8460
8461 if (osdmap.is_noin(osd)) {
8462 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
8463 any = true;
8464 } else {
8465 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
8466 any = true;
8467 }
8468
8469 break;
8470
8471 case OP_NOOUT:
8472 if (osdmap.is_out(osd)) {
8473 ss << "osd." << osd << " is already out. ";
8474 continue;
8475 }
8476
8477 if (osdmap.is_noout(osd)) {
8478 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
8479 any = true;
8480 } else {
8481 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
8482 any = true;
8483 }
8484
8485 break;
8486
8487 default:
8488 assert(0 == "invalid option");
8489 }
8490 }
8491 }
8492
8493 if (any) {
8494 getline(ss, rs);
8495 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
8496 get_last_committed() + 1));
8497 return true;
8498 }
8499 } else if (prefix == "osd rm-noup" ||
8500 prefix == "osd rm-nodown" ||
8501 prefix == "osd rm-noin" ||
8502 prefix == "osd rm-noout") {
8503
8504 enum {
8505 OP_NOUP,
8506 OP_NODOWN,
8507 OP_NOIN,
8508 OP_NOOUT,
8509 } option;
8510
8511 if (prefix == "osd rm-noup") {
8512 option = OP_NOUP;
8513 } else if (prefix == "osd rm-nodown") {
8514 option = OP_NODOWN;
8515 } else if (prefix == "osd rm-noin") {
8516 option = OP_NOIN;
8517 } else {
8518 option = OP_NOOUT;
8519 }
8520
8521 bool any = false;
8522 bool stop = false;
8523
8524 vector<string> idvec;
8525 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
8526
8527 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8528
8529 vector<int> osds;
8530
8531 // wildcard?
8532 if (j == 0 &&
8533 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8534
8535 // touch previous noup/nodown/noin/noout osds only
8536 switch (option) {
8537 case OP_NOUP:
8538 osdmap.get_noup_osds(&osds);
8539 break;
8540 case OP_NODOWN:
8541 osdmap.get_nodown_osds(&osds);
8542 break;
8543 case OP_NOIN:
8544 osdmap.get_noin_osds(&osds);
8545 break;
8546 case OP_NOOUT:
8547 osdmap.get_noout_osds(&osds);
8548 break;
8549 default:
8550 assert(0 == "invalid option");
8551 }
8552
8553 // cancel any pending noup/nodown/noin/noout requests too
8554 vector<int> pending_state_osds;
8555 (void) pending_inc.get_pending_state_osds(&pending_state_osds);
8556 for (auto &p : pending_state_osds) {
8557
8558 switch (option) {
8559 case OP_NOUP:
8560 if (!osdmap.is_noup(p) &&
8561 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
8562 any = true;
8563 }
8564 break;
8565
8566 case OP_NODOWN:
8567 if (!osdmap.is_nodown(p) &&
8568 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
8569 any = true;
8570 }
8571 break;
8572
8573 case OP_NOIN:
8574 if (!osdmap.is_noin(p) &&
8575 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
8576 any = true;
8577 }
8578 break;
8579
8580 case OP_NOOUT:
8581 if (!osdmap.is_noout(p) &&
8582 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
8583 any = true;
8584 }
8585 break;
8586
8587 default:
8588 assert(0 == "invalid option");
8589 }
8590 }
8591
8592 stop = true;
8593 } else {
8594 // try traditional single osd way
8595
8596 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8597 if (osd < 0) {
8598 // ss has reason for failure
8599 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8600 err = -EINVAL;
8601 continue;
8602 }
8603
8604 osds.push_back(osd);
8605 }
8606
8607 for (auto &osd : osds) {
8608
8609 if (!osdmap.exists(osd)) {
8610 ss << "osd." << osd << " does not exist. ";
8611 continue;
8612 }
8613
8614 switch (option) {
8615 case OP_NOUP:
8616 if (osdmap.is_noup(osd)) {
8617 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
8618 any = true;
8619 } else if (pending_inc.pending_osd_state_clear(
8620 osd, CEPH_OSD_NOUP)) {
8621 any = true;
8622 }
8623 break;
8624
8625 case OP_NODOWN:
8626 if (osdmap.is_nodown(osd)) {
8627 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
8628 any = true;
8629 } else if (pending_inc.pending_osd_state_clear(
8630 osd, CEPH_OSD_NODOWN)) {
8631 any = true;
8632 }
8633 break;
8634
8635 case OP_NOIN:
8636 if (osdmap.is_noin(osd)) {
8637 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
8638 any = true;
8639 } else if (pending_inc.pending_osd_state_clear(
8640 osd, CEPH_OSD_NOIN)) {
8641 any = true;
8642 }
8643 break;
8644
8645 case OP_NOOUT:
8646 if (osdmap.is_noout(osd)) {
8647 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
8648 any = true;
8649 } else if (pending_inc.pending_osd_state_clear(
8650 osd, CEPH_OSD_NOOUT)) {
8651 any = true;
8652 }
8653 break;
8654
8655 default:
8656 assert(0 == "invalid option");
8657 }
8658 }
8659 }
8660
8661 if (any) {
8662 getline(ss, rs);
8663 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
8664 get_last_committed() + 1));
8665 return true;
8666 }
8667 } else if (prefix == "osd pg-temp") {
8668 string pgidstr;
8669 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8670 ss << "unable to parse 'pgid' value '"
8671 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8672 err = -EINVAL;
8673 goto reply;
8674 }
8675 pg_t pgid;
8676 if (!pgid.parse(pgidstr.c_str())) {
8677 ss << "invalid pgid '" << pgidstr << "'";
8678 err = -EINVAL;
8679 goto reply;
8680 }
8681 if (!osdmap.pg_exists(pgid)) {
8682 ss << "pg " << pgid << " does not exist";
8683 err = -ENOENT;
8684 goto reply;
8685 }
8686 if (pending_inc.new_pg_temp.count(pgid)) {
8687 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
8688 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8689 return true;
8690 }
8691
8692 vector<int64_t> id_vec;
8693 vector<int32_t> new_pg_temp;
8694 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
8695 ss << "unable to parse 'id' value(s) '"
8696 << cmd_vartype_stringify(cmdmap["id"]) << "'";
8697 err = -EINVAL;
8698 goto reply;
8699 }
8700 for (auto osd : id_vec) {
8701 if (!osdmap.exists(osd)) {
8702 ss << "osd." << osd << " does not exist";
8703 err = -ENOENT;
8704 goto reply;
8705 }
8706 new_pg_temp.push_back(osd);
8707 }
8708
8709 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
8710 new_pg_temp.begin(), new_pg_temp.end());
8711 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
8712 goto update;
8713 } else if (prefix == "osd primary-temp") {
8714 string pgidstr;
8715 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8716 ss << "unable to parse 'pgid' value '"
8717 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8718 err = -EINVAL;
8719 goto reply;
8720 }
8721 pg_t pgid;
8722 if (!pgid.parse(pgidstr.c_str())) {
8723 ss << "invalid pgid '" << pgidstr << "'";
8724 err = -EINVAL;
8725 goto reply;
8726 }
8727 if (!osdmap.pg_exists(pgid)) {
8728 ss << "pg " << pgid << " does not exist";
8729 err = -ENOENT;
8730 goto reply;
8731 }
8732
8733 int64_t osd;
8734 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
8735 ss << "unable to parse 'id' value '"
8736 << cmd_vartype_stringify(cmdmap["id"]) << "'";
8737 err = -EINVAL;
8738 goto reply;
8739 }
8740 if (osd != -1 && !osdmap.exists(osd)) {
8741 ss << "osd." << osd << " does not exist";
8742 err = -ENOENT;
8743 goto reply;
8744 }
8745
8746 if (osdmap.require_min_compat_client > 0 &&
8747 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
8748 ss << "require_min_compat_client "
8749 << ceph_release_name(osdmap.require_min_compat_client)
8750 << " < firefly, which is required for primary-temp";
8751 err = -EPERM;
8752 goto reply;
8753 } else if (!g_conf->mon_osd_allow_primary_temp) {
8754 ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
8755 err = -EPERM;
8756 goto reply;
8757 }
8758
8759 pending_inc.new_primary_temp[pgid] = osd;
8760 ss << "set " << pgid << " primary_temp mapping to " << osd;
8761 goto update;
8762 } else if (prefix == "osd pg-upmap") {
8763 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8764 ss << "you must complete the upgrade and set require_osd_release ="
8765 << "luminous before using the new interface";
8766 err = -EPERM;
8767 goto reply;
8768 }
8769 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8770 ss << "min_compat_client "
8771 << ceph_release_name(osdmap.require_min_compat_client)
8772 << " < luminous, which is required for pg-upmap";
8773 err = -EPERM;
8774 goto reply;
8775 }
8776 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
8777 if (err == -EAGAIN)
8778 goto wait;
8779 if (err < 0)
8780 goto reply;
8781 string pgidstr;
8782 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8783 ss << "unable to parse 'pgid' value '"
8784 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8785 err = -EINVAL;
8786 goto reply;
8787 }
8788 pg_t pgid;
8789 if (!pgid.parse(pgidstr.c_str())) {
8790 ss << "invalid pgid '" << pgidstr << "'";
8791 err = -EINVAL;
8792 goto reply;
8793 }
8794 if (!osdmap.pg_exists(pgid)) {
8795 ss << "pg " << pgid << " does not exist";
8796 err = -ENOENT;
8797 goto reply;
8798 }
8799 if (pending_inc.new_pg_upmap.count(pgid) ||
8800 pending_inc.old_pg_upmap.count(pgid)) {
8801 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
8802 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8803 return true;
8804 }
8805 vector<int64_t> id_vec;
8806 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
8807 ss << "unable to parse 'id' value(s) '"
8808 << cmd_vartype_stringify(cmdmap["id"]) << "'";
8809 err = -EINVAL;
8810 goto reply;
8811 }
8812 vector<int32_t> new_pg_upmap;
8813 for (auto osd : id_vec) {
8814 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
8815 ss << "osd." << osd << " does not exist";
8816 err = -ENOENT;
8817 goto reply;
8818 }
8819 new_pg_upmap.push_back(osd);
8820 }
8821
8822 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
8823 new_pg_upmap.begin(), new_pg_upmap.end());
8824 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
8825 goto update;
8826 } else if (prefix == "osd rm-pg-upmap") {
8827 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8828 ss << "you must complete the upgrade and set require_osd_release ="
8829 << "luminous before using the new interface";
8830 err = -EPERM;
8831 goto reply;
8832 }
8833 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8834 ss << "require_min_compat_client "
8835 << ceph_release_name(osdmap.require_min_compat_client)
8836 << " < luminous, which is required for pg-upmap";
8837 err = -EPERM;
8838 goto reply;
8839 }
8840 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
8841 if (err == -EAGAIN)
8842 goto wait;
8843 if (err < 0)
8844 goto reply;
8845 string pgidstr;
8846 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8847 ss << "unable to parse 'pgid' value '"
8848 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8849 err = -EINVAL;
8850 goto reply;
8851 }
8852 pg_t pgid;
8853 if (!pgid.parse(pgidstr.c_str())) {
8854 ss << "invalid pgid '" << pgidstr << "'";
8855 err = -EINVAL;
8856 goto reply;
8857 }
8858 if (pending_inc.new_pg_upmap.count(pgid) ||
8859 pending_inc.old_pg_upmap.count(pgid)) {
8860 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
8861 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8862 return true;
8863 }
8864
8865 pending_inc.old_pg_upmap.insert(pgid);
8866 ss << "clear " << pgid << " pg_upmap mapping";
8867 goto update;
8868 } else if (prefix == "osd pg-upmap-items") {
8869 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8870 ss << "you must complete the upgrade and set require_osd_release ="
8871 << "luminous before using the new interface";
8872 err = -EPERM;
8873 goto reply;
8874 }
8875 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8876 ss << "require_min_compat_client "
8877 << ceph_release_name(osdmap.require_min_compat_client)
8878 << " < luminous, which is required for pg-upmap";
8879 err = -EPERM;
8880 goto reply;
8881 }
8882 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
8883 if (err == -EAGAIN)
8884 goto wait;
8885 if (err < 0)
8886 goto reply;
8887 string pgidstr;
8888 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8889 ss << "unable to parse 'pgid' value '"
8890 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8891 err = -EINVAL;
8892 goto reply;
8893 }
8894 pg_t pgid;
8895 if (!pgid.parse(pgidstr.c_str())) {
8896 ss << "invalid pgid '" << pgidstr << "'";
8897 err = -EINVAL;
8898 goto reply;
8899 }
8900 if (!osdmap.pg_exists(pgid)) {
8901 ss << "pg " << pgid << " does not exist";
8902 err = -ENOENT;
8903 goto reply;
8904 }
8905 if (pending_inc.new_pg_upmap_items.count(pgid) ||
8906 pending_inc.old_pg_upmap_items.count(pgid)) {
8907 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
8908 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8909 return true;
8910 }
8911 vector<int64_t> id_vec;
8912 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
8913 ss << "unable to parse 'id' value(s) '"
8914 << cmd_vartype_stringify(cmdmap["id"]) << "'";
8915 err = -EINVAL;
8916 goto reply;
8917 }
8918 if (id_vec.size() % 2) {
8919 ss << "you must specify pairs of osd ids to be remapped";
8920 err = -EINVAL;
8921 goto reply;
8922 }
8923 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
8924 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
8925 int from = *p++;
8926 int to = *p;
8927 if (!osdmap.exists(from)) {
8928 ss << "osd." << from << " does not exist";
8929 err = -ENOENT;
8930 goto reply;
8931 }
8932 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
8933 ss << "osd." << to << " does not exist";
8934 err = -ENOENT;
8935 goto reply;
8936 }
8937 new_pg_upmap_items.push_back(make_pair(from, to));
8938 }
8939
8940 pending_inc.new_pg_upmap_items[pgid] =
8941 mempool::osdmap::vector<pair<int32_t,int32_t>>(
8942 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
8943 ss << "set " << pgid << " pg_upmap_items mapping to " << new_pg_upmap_items;
8944 goto update;
8945 } else if (prefix == "osd rm-pg-upmap-items") {
8946 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8947 ss << "you must complete the upgrade and set require_osd_release ="
8948 << "luminous before using the new interface";
8949 err = -EPERM;
8950 goto reply;
8951 }
8952 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8953 ss << "require_min_compat_client "
8954 << ceph_release_name(osdmap.require_min_compat_client)
8955 << " < luminous, which is required for pg-upmap";
8956 err = -EPERM;
8957 goto reply;
8958 }
8959 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
8960 if (err == -EAGAIN)
8961 goto wait;
8962 if (err < 0)
8963 goto reply;
8964 string pgidstr;
8965 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8966 ss << "unable to parse 'pgid' value '"
8967 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8968 err = -EINVAL;
8969 goto reply;
8970 }
8971 pg_t pgid;
8972 if (!pgid.parse(pgidstr.c_str())) {
8973 ss << "invalid pgid '" << pgidstr << "'";
8974 err = -EINVAL;
8975 goto reply;
8976 }
8977 if (pending_inc.new_pg_upmap_items.count(pgid) ||
8978 pending_inc.old_pg_upmap_items.count(pgid)) {
8979 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
8980 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8981 return true;
8982 }
8983
8984 pending_inc.old_pg_upmap_items.insert(pgid);
8985 ss << "clear " << pgid << " pg_upmap_items mapping";
8986 goto update;
8987 } else if (prefix == "osd primary-affinity") {
8988 int64_t id;
8989 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
8990 ss << "invalid osd id value '"
8991 << cmd_vartype_stringify(cmdmap["id"]) << "'";
8992 err = -EINVAL;
8993 goto reply;
8994 }
8995 double w;
8996 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8997 ss << "unable to parse 'weight' value '"
8998 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8999 err = -EINVAL;
9000 goto reply;
9001 }
9002 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
9003 if (ww < 0L) {
9004 ss << "weight must be >= 0";
9005 err = -EINVAL;
9006 goto reply;
9007 }
9008 if (osdmap.require_min_compat_client > 0 &&
9009 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
9010 ss << "require_min_compat_client "
9011 << ceph_release_name(osdmap.require_min_compat_client)
9012 << " < firefly, which is required for primary-affinity";
9013 err = -EPERM;
9014 goto reply;
9015 } else if (!g_conf->mon_osd_allow_primary_affinity) {
9016 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
9017 err = -EPERM;
9018 goto reply;
9019 }
9020 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
9021 if (err == -EAGAIN)
9022 goto wait;
9023 if (err < 0)
9024 goto reply;
9025 if (osdmap.exists(id)) {
9026 pending_inc.new_primary_affinity[id] = ww;
9027 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
9028 getline(ss, rs);
9029 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9030 get_last_committed() + 1));
9031 return true;
9032 } else {
9033 ss << "osd." << id << " does not exist";
9034 err = -ENOENT;
9035 goto reply;
9036 }
9037 } else if (prefix == "osd reweight") {
9038 int64_t id;
9039 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9040 ss << "unable to parse osd id value '"
9041 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9042 err = -EINVAL;
9043 goto reply;
9044 }
9045 double w;
9046 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
9047 ss << "unable to parse weight value '"
9048 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
9049 err = -EINVAL;
9050 goto reply;
9051 }
9052 long ww = (int)((double)CEPH_OSD_IN*w);
9053 if (ww < 0L) {
9054 ss << "weight must be >= 0";
9055 err = -EINVAL;
9056 goto reply;
9057 }
9058 if (osdmap.exists(id)) {
9059 pending_inc.new_weight[id] = ww;
9060 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
9061 getline(ss, rs);
9062 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9063 get_last_committed() + 1));
9064 return true;
9065 } else {
9066 ss << "osd." << id << " does not exist";
9067 err = -ENOENT;
9068 goto reply;
9069 }
9070 } else if (prefix == "osd reweightn") {
9071 map<int32_t, uint32_t> weights;
9072 err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
9073 if (err) {
9074 ss << "unable to parse 'weights' value '"
9075 << cmd_vartype_stringify(cmdmap["weights"]) << "'";
9076 goto reply;
9077 }
9078 pending_inc.new_weight.insert(weights.begin(), weights.end());
9079 wait_for_finished_proposal(
9080 op,
9081 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
9082 return true;
9083 } else if (prefix == "osd lost") {
9084 int64_t id;
9085 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9086 ss << "unable to parse osd id value '"
9087 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9088 err = -EINVAL;
9089 goto reply;
9090 }
9091 string sure;
9092 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
9093 ss << "are you SURE? this might mean real, permanent data loss. pass "
9094 "--yes-i-really-mean-it if you really do.";
9095 err = -EPERM;
9096 goto reply;
9097 } else if (!osdmap.exists(id)) {
9098 ss << "osd." << id << " does not exist";
9099 err = -ENOENT;
9100 goto reply;
9101 } else if (!osdmap.is_down(id)) {
9102 ss << "osd." << id << " is not down";
9103 err = -EBUSY;
9104 goto reply;
9105 } else {
9106 epoch_t e = osdmap.get_info(id).down_at;
9107 pending_inc.new_lost[id] = e;
9108 ss << "marked osd lost in epoch " << e;
9109 getline(ss, rs);
9110 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9111 get_last_committed() + 1));
9112 return true;
9113 }
9114
9115 } else if (prefix == "osd destroy" || prefix == "osd purge") {
9116 /* Destroying an OSD means that we don't expect to further make use of
9117 * the OSDs data (which may even become unreadable after this operation),
9118 * and that we are okay with scrubbing all its cephx keys and config-key
9119 * data (which may include lockbox keys, thus rendering the osd's data
9120 * unreadable).
9121 *
9122 * The OSD will not be removed. Instead, we will mark it as destroyed,
9123 * such that a subsequent call to `create` will not reuse the osd id.
9124 * This will play into being able to recreate the OSD, at the same
9125 * crush location, with minimal data movement.
9126 */
9127
9128 // make sure authmon is writeable.
9129 if (!mon->authmon()->is_writeable()) {
9130 dout(10) << __func__ << " waiting for auth mon to be writeable for "
9131 << "osd destroy" << dendl;
9132 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9133 return false;
9134 }
9135
9136 int64_t id;
9137 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9138 ss << "unable to parse osd id value '"
9139 << cmd_vartype_stringify(cmdmap["id"]) << "";
9140 err = -EINVAL;
9141 goto reply;
9142 }
9143
9144 bool is_destroy = (prefix == "osd destroy");
9145 if (!is_destroy) {
9146 assert("osd purge" == prefix);
9147 }
9148
9149 string sure;
9150 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
9151 sure != "--yes-i-really-mean-it") {
9152 ss << "Are you SURE? This will mean real, permanent data loss, as well "
9153 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
9154 << "really do.";
9155 err = -EPERM;
9156 goto reply;
9157 } else if (is_destroy && !osdmap.exists(id)) {
9158 ss << "osd." << id << " does not exist";
9159 err = -ENOENT;
9160 goto reply;
9161 } else if (osdmap.is_up(id)) {
9162 ss << "osd." << id << " is not `down`.";
9163 err = -EBUSY;
9164 goto reply;
9165 } else if (is_destroy && osdmap.is_destroyed(id)) {
9166 ss << "destroyed osd." << id;
9167 err = 0;
9168 goto reply;
9169 }
9170
9171 bool goto_reply = false;
9172
9173 paxos->plug();
9174 if (is_destroy) {
9175 err = prepare_command_osd_destroy(id, ss);
9176 // we checked above that it should exist.
9177 assert(err != -ENOENT);
9178 } else {
9179 err = prepare_command_osd_purge(id, ss);
9180 if (err == -ENOENT) {
9181 err = 0;
9182 ss << "osd." << id << " does not exist.";
9183 goto_reply = true;
9184 }
9185 }
9186 paxos->unplug();
9187
9188 if (err < 0 || goto_reply) {
9189 goto reply;
9190 }
9191
9192 if (is_destroy) {
9193 ss << "destroyed osd." << id;
9194 } else {
9195 ss << "purged osd." << id;
9196 }
9197
9198 getline(ss, rs);
9199 wait_for_finished_proposal(op,
9200 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
9201 force_immediate_propose();
9202 return true;
9203
9204 } else if (prefix == "osd new") {
9205
9206 // make sure authmon is writeable.
9207 if (!mon->authmon()->is_writeable()) {
9208 dout(10) << __func__ << " waiting for auth mon to be writeable for "
9209 << "osd destroy" << dendl;
9210 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9211 return false;
9212 }
9213
9214 map<string,string> secrets_map;
9215
9216 bufferlist bl = m->get_data();
9217 string secrets_json = bl.to_str();
9218 dout(20) << __func__ << " osd new json = " << secrets_json << dendl;
9219
9220 err = get_json_str_map(secrets_json, ss, &secrets_map);
9221 if (err < 0)
9222 goto reply;
9223
9224 dout(20) << __func__ << " osd new secrets " << secrets_map << dendl;
9225
9226 paxos->plug();
9227 err = prepare_command_osd_new(op, cmdmap, secrets_map, ss, f.get());
9228 paxos->unplug();
9229
9230 if (err < 0) {
9231 goto reply;
9232 }
9233
9234 if (f) {
9235 f->flush(rdata);
9236 } else {
9237 rdata.append(ss);
9238 }
9239
9240 if (err == EEXIST) {
9241 // idempotent operation
9242 err = 0;
9243 goto reply;
9244 }
9245
9246 wait_for_finished_proposal(op,
9247 new Monitor::C_Command(mon, op, 0, rs, rdata,
9248 get_last_committed() + 1));
9249 force_immediate_propose();
9250 return true;
9251
9252 } else if (prefix == "osd create") {
9253
9254 // optional id provided?
9255 int64_t id = -1, cmd_id = -1;
9256 if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
9257 if (cmd_id < 0) {
9258 ss << "invalid osd id value '" << cmd_id << "'";
9259 err = -EINVAL;
9260 goto reply;
9261 }
9262 dout(10) << " osd create got id " << cmd_id << dendl;
9263 }
9264
9265 uuid_d uuid;
9266 string uuidstr;
9267 if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
9268 if (!uuid.parse(uuidstr.c_str())) {
9269 ss << "invalid uuid value '" << uuidstr << "'";
9270 err = -EINVAL;
9271 goto reply;
9272 }
9273 // we only care about the id if we also have the uuid, to
9274 // ensure the operation's idempotency.
9275 id = cmd_id;
9276 }
9277
9278 int32_t new_id = -1;
9279 err = prepare_command_osd_create(id, uuid, &new_id, ss);
9280 if (err < 0) {
9281 if (err == -EAGAIN) {
9282 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9283 return true;
9284 }
9285 // a check has failed; reply to the user.
9286 goto reply;
9287
9288 } else if (err == EEXIST) {
9289 // this is an idempotent operation; we can go ahead and reply.
9290 if (f) {
9291 f->open_object_section("created_osd");
9292 f->dump_int("osdid", new_id);
9293 f->close_section();
9294 f->flush(rdata);
9295 } else {
9296 ss << new_id;
9297 rdata.append(ss);
9298 }
9299 err = 0;
9300 goto reply;
9301 }
9302
9303 do_osd_create(id, uuid, &new_id);
9304
9305 if (f) {
9306 f->open_object_section("created_osd");
9307 f->dump_int("osdid", new_id);
9308 f->close_section();
9309 f->flush(rdata);
9310 } else {
9311 ss << new_id;
9312 rdata.append(ss);
9313 }
9314 wait_for_finished_proposal(op,
9315 new Monitor::C_Command(mon, op, 0, rs, rdata,
9316 get_last_committed() + 1));
9317 return true;
9318
9319 } else if (prefix == "osd blacklist clear") {
9320 pending_inc.new_blacklist.clear();
9321 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
9322 osdmap.get_blacklist(&blacklist);
9323 for (const auto &entry : blacklist) {
9324 pending_inc.old_blacklist.push_back(entry.first);
9325 }
9326 ss << " removed all blacklist entries";
9327 getline(ss, rs);
9328 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9329 get_last_committed() + 1));
9330 return true;
9331 } else if (prefix == "osd blacklist") {
9332 string addrstr;
9333 cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
9334 entity_addr_t addr;
9335 if (!addr.parse(addrstr.c_str(), 0)) {
9336 ss << "unable to parse address " << addrstr;
9337 err = -EINVAL;
9338 goto reply;
9339 }
9340 else {
9341 string blacklistop;
9342 cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
9343 if (blacklistop == "add") {
9344 utime_t expires = ceph_clock_now();
9345 double d;
9346 // default one hour
9347 cmd_getval(g_ceph_context, cmdmap, "expire", d, double(60*60));
9348 expires += d;
9349
9350 pending_inc.new_blacklist[addr] = expires;
9351 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
9352 getline(ss, rs);
9353 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9354 get_last_committed() + 1));
9355 return true;
9356 } else if (blacklistop == "rm") {
9357 if (osdmap.is_blacklisted(addr) ||
9358 pending_inc.new_blacklist.count(addr)) {
9359 if (osdmap.is_blacklisted(addr))
9360 pending_inc.old_blacklist.push_back(addr);
9361 else
9362 pending_inc.new_blacklist.erase(addr);
9363 ss << "un-blacklisting " << addr;
9364 getline(ss, rs);
9365 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9366 get_last_committed() + 1));
9367 return true;
9368 }
9369 ss << addr << " isn't blacklisted";
9370 err = 0;
9371 goto reply;
9372 }
9373 }
9374 } else if (prefix == "osd pool mksnap") {
9375 string poolstr;
9376 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9377 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
9378 if (pool < 0) {
9379 ss << "unrecognized pool '" << poolstr << "'";
9380 err = -ENOENT;
9381 goto reply;
9382 }
9383 string snapname;
9384 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
9385 const pg_pool_t *p = osdmap.get_pg_pool(pool);
9386 if (p->is_unmanaged_snaps_mode()) {
9387 ss << "pool " << poolstr << " is in unmanaged snaps mode";
9388 err = -EINVAL;
9389 goto reply;
9390 } else if (p->snap_exists(snapname.c_str())) {
9391 ss << "pool " << poolstr << " snap " << snapname << " already exists";
9392 err = 0;
9393 goto reply;
9394 } else if (p->is_tier()) {
9395 ss << "pool " << poolstr << " is a cache tier";
9396 err = -EINVAL;
9397 goto reply;
9398 }
9399 pg_pool_t *pp = 0;
9400 if (pending_inc.new_pools.count(pool))
9401 pp = &pending_inc.new_pools[pool];
9402 if (!pp) {
9403 pp = &pending_inc.new_pools[pool];
9404 *pp = *p;
9405 }
9406 if (pp->snap_exists(snapname.c_str())) {
9407 ss << "pool " << poolstr << " snap " << snapname << " already exists";
9408 } else {
9409 pp->add_snap(snapname.c_str(), ceph_clock_now());
9410 pp->set_snap_epoch(pending_inc.epoch);
9411 ss << "created pool " << poolstr << " snap " << snapname;
9412 }
9413 getline(ss, rs);
9414 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9415 get_last_committed() + 1));
9416 return true;
9417 } else if (prefix == "osd pool rmsnap") {
9418 string poolstr;
9419 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9420 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
9421 if (pool < 0) {
9422 ss << "unrecognized pool '" << poolstr << "'";
9423 err = -ENOENT;
9424 goto reply;
9425 }
9426 string snapname;
9427 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
9428 const pg_pool_t *p = osdmap.get_pg_pool(pool);
9429 if (p->is_unmanaged_snaps_mode()) {
9430 ss << "pool " << poolstr << " is in unmanaged snaps mode";
9431 err = -EINVAL;
9432 goto reply;
9433 } else if (!p->snap_exists(snapname.c_str())) {
9434 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
9435 err = 0;
9436 goto reply;
9437 }
9438 pg_pool_t *pp = 0;
9439 if (pending_inc.new_pools.count(pool))
9440 pp = &pending_inc.new_pools[pool];
9441 if (!pp) {
9442 pp = &pending_inc.new_pools[pool];
9443 *pp = *p;
9444 }
9445 snapid_t sn = pp->snap_exists(snapname.c_str());
9446 if (sn) {
9447 pp->remove_snap(sn);
9448 pp->set_snap_epoch(pending_inc.epoch);
9449 ss << "removed pool " << poolstr << " snap " << snapname;
9450 } else {
9451 ss << "already removed pool " << poolstr << " snap " << snapname;
9452 }
9453 getline(ss, rs);
9454 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9455 get_last_committed() + 1));
9456 return true;
9457 } else if (prefix == "osd pool create") {
9458 int64_t pg_num;
9459 int64_t pgp_num;
9460 cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
9461 cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
9462
9463 string pool_type_str;
9464 cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
9465 if (pool_type_str.empty())
9466 pool_type_str = pg_pool_t::get_default_type();
9467
9468 string poolstr;
9469 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9470 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9471 if (pool_id >= 0) {
9472 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9473 if (pool_type_str != p->get_type_name()) {
9474 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
9475 err = -EINVAL;
9476 } else {
9477 ss << "pool '" << poolstr << "' already exists";
9478 err = 0;
9479 }
9480 goto reply;
9481 }
9482
9483 int pool_type;
9484 if (pool_type_str == "replicated") {
9485 pool_type = pg_pool_t::TYPE_REPLICATED;
9486 } else if (pool_type_str == "erasure") {
9487 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
9488 CEPH_FEATURE_OSD_ERASURE_CODES,
9489 ss);
9490 if (err == -EAGAIN)
9491 goto wait;
9492 if (err)
9493 goto reply;
9494 pool_type = pg_pool_t::TYPE_ERASURE;
9495 } else {
9496 ss << "unknown pool type '" << pool_type_str << "'";
9497 err = -EINVAL;
9498 goto reply;
9499 }
9500
9501 bool implicit_rule_creation = false;
9502 string rule_name;
9503 cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
9504 string erasure_code_profile;
9505 cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
9506
9507 if (pool_type == pg_pool_t::TYPE_ERASURE) {
9508 if (erasure_code_profile == "")
9509 erasure_code_profile = "default";
9510 //handle the erasure code profile
9511 if (erasure_code_profile == "default") {
9512 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
9513 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
9514 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
9515 goto wait;
9516 }
9517
9518 map<string,string> profile_map;
9519 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
9520 profile_map,
9521 &ss);
9522 if (err)
9523 goto reply;
9524 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
9525 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
9526 goto wait;
9527 }
9528 }
9529 if (rule_name == "") {
9530 implicit_rule_creation = true;
9531 if (erasure_code_profile == "default") {
9532 rule_name = "erasure-code";
9533 } else {
9534 dout(1) << "implicitly use rule named after the pool: "
9535 << poolstr << dendl;
9536 rule_name = poolstr;
9537 }
9538 }
9539 } else {
9540 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
9541 rule_name = erasure_code_profile;
9542 }
9543
9544 if (!implicit_rule_creation && rule_name != "") {
9545 int rule;
9546 err = get_crush_rule(rule_name, &rule, &ss);
9547 if (err == -EAGAIN) {
9548 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9549 return true;
9550 }
9551 if (err)
9552 goto reply;
9553 }
9554
9555 int64_t expected_num_objects;
9556 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
9557 if (expected_num_objects < 0) {
9558 ss << "'expected_num_objects' must be non-negative";
9559 err = -EINVAL;
9560 goto reply;
9561 }
9562
9563 int64_t fast_read_param;
9564 cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
9565 FastReadType fast_read = FAST_READ_DEFAULT;
9566 if (fast_read_param == 0)
9567 fast_read = FAST_READ_OFF;
9568 else if (fast_read_param > 0)
9569 fast_read = FAST_READ_ON;
9570
9571 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
9572 -1, // default crush rule
9573 rule_name,
9574 pg_num, pgp_num,
9575 erasure_code_profile, pool_type,
9576 (uint64_t)expected_num_objects,
9577 fast_read,
9578 &ss);
9579 if (err < 0) {
9580 switch(err) {
9581 case -EEXIST:
9582 ss << "pool '" << poolstr << "' already exists";
9583 break;
9584 case -EAGAIN:
9585 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9586 return true;
9587 case -ERANGE:
9588 goto reply;
9589 default:
9590 goto reply;
9591 break;
9592 }
9593 } else {
9594 ss << "pool '" << poolstr << "' created";
9595 }
9596 getline(ss, rs);
9597 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9598 get_last_committed() + 1));
9599 return true;
9600
9601 } else if (prefix == "osd pool delete" ||
9602 prefix == "osd pool rm") {
9603 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
9604 string poolstr, poolstr2, sure;
9605 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9606 cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
9607 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9608 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
9609 if (pool < 0) {
9610 ss << "pool '" << poolstr << "' does not exist";
9611 err = 0;
9612 goto reply;
9613 }
9614
9615 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
9616 if (poolstr2 != poolstr ||
9617 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
9618 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
9619 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
9620 << "followed by --yes-i-really-really-mean-it.";
9621 err = -EPERM;
9622 goto reply;
9623 }
9624 err = _prepare_remove_pool(pool, &ss, force_no_fake);
9625 if (err == -EAGAIN) {
9626 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9627 return true;
9628 }
9629 if (err < 0)
9630 goto reply;
9631 goto update;
9632 } else if (prefix == "osd pool rename") {
9633 string srcpoolstr, destpoolstr;
9634 cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
9635 cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
9636 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
9637 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
9638
9639 if (pool_src < 0) {
9640 if (pool_dst >= 0) {
9641 // src pool doesn't exist, dst pool does exist: to ensure idempotency
9642 // of operations, assume this rename succeeded, as it is not changing
9643 // the current state. Make sure we output something understandable
9644 // for whoever is issuing the command, if they are paying attention,
9645 // in case it was not intentional; or to avoid a "wtf?" and a bug
9646 // report in case it was intentional, while expecting a failure.
9647 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
9648 << destpoolstr << "' does -- assuming successful rename";
9649 err = 0;
9650 } else {
9651 ss << "unrecognized pool '" << srcpoolstr << "'";
9652 err = -ENOENT;
9653 }
9654 goto reply;
9655 } else if (pool_dst >= 0) {
9656 // source pool exists and so does the destination pool
9657 ss << "pool '" << destpoolstr << "' already exists";
9658 err = -EEXIST;
9659 goto reply;
9660 }
9661
9662 int ret = _prepare_rename_pool(pool_src, destpoolstr);
9663 if (ret == 0) {
9664 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
9665 } else {
9666 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
9667 << cpp_strerror(ret);
9668 }
9669 getline(ss, rs);
9670 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
9671 get_last_committed() + 1));
9672 return true;
9673
9674 } else if (prefix == "osd pool set") {
9675 err = prepare_command_pool_set(cmdmap, ss);
9676 if (err == -EAGAIN)
9677 goto wait;
9678 if (err < 0)
9679 goto reply;
9680
9681 getline(ss, rs);
9682 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9683 get_last_committed() + 1));
9684 return true;
9685 } else if (prefix == "osd tier add") {
9686 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
9687 if (err == -EAGAIN)
9688 goto wait;
9689 if (err)
9690 goto reply;
9691 string poolstr;
9692 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9693 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9694 if (pool_id < 0) {
9695 ss << "unrecognized pool '" << poolstr << "'";
9696 err = -ENOENT;
9697 goto reply;
9698 }
9699 string tierpoolstr;
9700 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
9701 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
9702 if (tierpool_id < 0) {
9703 ss << "unrecognized pool '" << tierpoolstr << "'";
9704 err = -ENOENT;
9705 goto reply;
9706 }
9707 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9708 assert(p);
9709 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
9710 assert(tp);
9711
9712 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
9713 goto reply;
9714 }
9715
9716 // make sure new tier is empty
9717 string force_nonempty;
9718 cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
9719 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
9720 if (pstats && pstats->stats.sum.num_objects != 0 &&
9721 force_nonempty != "--force-nonempty") {
9722 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
9723 err = -ENOTEMPTY;
9724 goto reply;
9725 }
9726 if (tp->ec_pool()) {
9727 ss << "tier pool '" << tierpoolstr
9728 << "' is an ec pool, which cannot be a tier";
9729 err = -ENOTSUP;
9730 goto reply;
9731 }
9732 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
9733 ((force_nonempty != "--force-nonempty") ||
9734 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
9735 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
9736 err = -ENOTEMPTY;
9737 goto reply;
9738 }
9739 // go
9740 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
9741 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
9742 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
9743 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9744 return true;
9745 }
9746 np->tiers.insert(tierpool_id);
9747 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
9748 ntp->tier_of = pool_id;
9749 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
9750 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
9751 get_last_committed() + 1));
9752 return true;
9753 } else if (prefix == "osd tier remove" ||
9754 prefix == "osd tier rm") {
9755 string poolstr;
9756 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9757 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9758 if (pool_id < 0) {
9759 ss << "unrecognized pool '" << poolstr << "'";
9760 err = -ENOENT;
9761 goto reply;
9762 }
9763 string tierpoolstr;
9764 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
9765 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
9766 if (tierpool_id < 0) {
9767 ss << "unrecognized pool '" << tierpoolstr << "'";
9768 err = -ENOENT;
9769 goto reply;
9770 }
9771 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9772 assert(p);
9773 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
9774 assert(tp);
9775
9776 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
9777 goto reply;
9778 }
9779
9780 if (p->tiers.count(tierpool_id) == 0) {
9781 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
9782 err = 0;
9783 goto reply;
9784 }
9785 if (tp->tier_of != pool_id) {
9786 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
9787 << osdmap.get_pool_name(tp->tier_of) << "': "
9788 // be scary about it; this is an inconsistency and bells must go off
9789 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
9790 err = -EINVAL;
9791 goto reply;
9792 }
9793 if (p->read_tier == tierpool_id) {
9794 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
9795 err = -EBUSY;
9796 goto reply;
9797 }
9798 // go
9799 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
9800 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
9801 if (np->tiers.count(tierpool_id) == 0 ||
9802 ntp->tier_of != pool_id ||
9803 np->read_tier == tierpool_id) {
9804 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9805 return true;
9806 }
9807 np->tiers.erase(tierpool_id);
9808 ntp->clear_tier();
9809 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
9810 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
9811 get_last_committed() + 1));
9812 return true;
9813 } else if (prefix == "osd tier set-overlay") {
9814 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
9815 if (err == -EAGAIN)
9816 goto wait;
9817 if (err)
9818 goto reply;
9819 string poolstr;
9820 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9821 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9822 if (pool_id < 0) {
9823 ss << "unrecognized pool '" << poolstr << "'";
9824 err = -ENOENT;
9825 goto reply;
9826 }
9827 string overlaypoolstr;
9828 cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
9829 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
9830 if (overlaypool_id < 0) {
9831 ss << "unrecognized pool '" << overlaypoolstr << "'";
9832 err = -ENOENT;
9833 goto reply;
9834 }
9835 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9836 assert(p);
9837 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
9838 assert(overlay_p);
9839 if (p->tiers.count(overlaypool_id) == 0) {
9840 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
9841 err = -EINVAL;
9842 goto reply;
9843 }
9844 if (p->read_tier == overlaypool_id) {
9845 err = 0;
9846 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
9847 goto reply;
9848 }
9849 if (p->has_read_tier()) {
9850 ss << "pool '" << poolstr << "' has overlay '"
9851 << osdmap.get_pool_name(p->read_tier)
9852 << "'; please remove-overlay first";
9853 err = -EINVAL;
9854 goto reply;
9855 }
9856
9857 // go
9858 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
9859 np->read_tier = overlaypool_id;
9860 np->write_tier = overlaypool_id;
9861 np->set_last_force_op_resend(pending_inc.epoch);
9862 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
9863 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
9864 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
9865 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
9866 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
9867 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
9868 get_last_committed() + 1));
9869 return true;
9870 } else if (prefix == "osd tier remove-overlay" ||
9871 prefix == "osd tier rm-overlay") {
9872 string poolstr;
9873 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9874 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9875 if (pool_id < 0) {
9876 ss << "unrecognized pool '" << poolstr << "'";
9877 err = -ENOENT;
9878 goto reply;
9879 }
9880 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9881 assert(p);
9882 if (!p->has_read_tier()) {
9883 err = 0;
9884 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
9885 goto reply;
9886 }
9887
9888 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
9889 goto reply;
9890 }
9891
9892 // go
9893 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
9894 if (np->has_read_tier()) {
9895 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
9896 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
9897 nop->set_last_force_op_resend(pending_inc.epoch);
9898 }
9899 if (np->has_write_tier()) {
9900 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
9901 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
9902 nop->set_last_force_op_resend(pending_inc.epoch);
9903 }
9904 np->clear_read_tier();
9905 np->clear_write_tier();
9906 np->set_last_force_op_resend(pending_inc.epoch);
9907 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
9908 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
9909 get_last_committed() + 1));
9910 return true;
9911 } else if (prefix == "osd tier cache-mode") {
9912 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
9913 if (err == -EAGAIN)
9914 goto wait;
9915 if (err)
9916 goto reply;
9917 string poolstr;
9918 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9919 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9920 if (pool_id < 0) {
9921 ss << "unrecognized pool '" << poolstr << "'";
9922 err = -ENOENT;
9923 goto reply;
9924 }
9925 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9926 assert(p);
9927 if (!p->is_tier()) {
9928 ss << "pool '" << poolstr << "' is not a tier";
9929 err = -EINVAL;
9930 goto reply;
9931 }
9932 string modestr;
9933 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
9934 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9935 if (mode < 0) {
9936 ss << "'" << modestr << "' is not a valid cache mode";
9937 err = -EINVAL;
9938 goto reply;
9939 }
9940
9941 string sure;
9942 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9943 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
9944 mode != pg_pool_t::CACHEMODE_NONE &&
9945 mode != pg_pool_t::CACHEMODE_PROXY &&
9946 mode != pg_pool_t::CACHEMODE_READPROXY) &&
9947 sure != "--yes-i-really-mean-it") {
9948 ss << "'" << modestr << "' is not a well-supported cache mode and may "
9949 << "corrupt your data. pass --yes-i-really-mean-it to force.";
9950 err = -EPERM;
9951 goto reply;
9952 }
9953
9954 // pool already has this cache-mode set and there are no pending changes
9955 if (p->cache_mode == mode &&
9956 (pending_inc.new_pools.count(pool_id) == 0 ||
9957 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
9958 ss << "set cache-mode for pool '" << poolstr << "'"
9959 << " to " << pg_pool_t::get_cache_mode_name(mode);
9960 err = 0;
9961 goto reply;
9962 }
9963
9964 /* Mode description:
9965 *
9966 * none: No cache-mode defined
9967 * forward: Forward all reads and writes to base pool
9968 * writeback: Cache writes, promote reads from base pool
9969 * readonly: Forward writes to base pool
9970 * readforward: Writes are in writeback mode, Reads are in forward mode
9971 * proxy: Proxy all reads and writes to base pool
9972 * readproxy: Writes are in writeback mode, Reads are in proxy mode
9973 *
9974 * Hence, these are the allowed transitions:
9975 *
9976 * none -> any
9977 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
9978 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
9979 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
9980 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
9981 * writeback -> readforward || readproxy || forward || proxy
9982 * readonly -> any
9983 */
9984
9985 // We check if the transition is valid against the current pool mode, as
9986 // it is the only committed state thus far. We will blantly squash
9987 // whatever mode is on the pending state.
9988
9989 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
9990 (mode != pg_pool_t::CACHEMODE_FORWARD &&
9991 mode != pg_pool_t::CACHEMODE_PROXY &&
9992 mode != pg_pool_t::CACHEMODE_READFORWARD &&
9993 mode != pg_pool_t::CACHEMODE_READPROXY)) {
9994 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
9995 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
9996 << "' pool; only '"
9997 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
9998 << "','"
9999 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
10000 << "','"
10001 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
10002 << "','"
10003 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
10004 << "' allowed.";
10005 err = -EINVAL;
10006 goto reply;
10007 }
10008 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
10009 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10010 mode != pg_pool_t::CACHEMODE_FORWARD &&
10011 mode != pg_pool_t::CACHEMODE_PROXY &&
10012 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10013
10014 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
10015 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10016 mode != pg_pool_t::CACHEMODE_FORWARD &&
10017 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10018 mode != pg_pool_t::CACHEMODE_PROXY)) ||
10019
10020 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
10021 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10022 mode != pg_pool_t::CACHEMODE_FORWARD &&
10023 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10024 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10025
10026 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
10027 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10028 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10029 mode != pg_pool_t::CACHEMODE_PROXY &&
10030 mode != pg_pool_t::CACHEMODE_READPROXY))) {
10031
10032 const pool_stat_t* pstats =
10033 mon->pgservice->get_pool_stat(pool_id);
10034
10035 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
10036 ss << "unable to set cache-mode '"
10037 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
10038 << "': dirty objects found";
10039 err = -EBUSY;
10040 goto reply;
10041 }
10042 }
10043 // go
10044 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10045 np->cache_mode = mode;
10046 // set this both when moving to and from cache_mode NONE. this is to
10047 // capture legacy pools that were set up before this flag existed.
10048 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
10049 ss << "set cache-mode for pool '" << poolstr
10050 << "' to " << pg_pool_t::get_cache_mode_name(mode);
10051 if (mode == pg_pool_t::CACHEMODE_NONE) {
10052 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
10053 assert(base_pool);
10054 if (base_pool->read_tier == pool_id ||
10055 base_pool->write_tier == pool_id)
10056 ss <<" (WARNING: pool is still configured as read or write tier)";
10057 }
10058 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10059 get_last_committed() + 1));
10060 return true;
10061 } else if (prefix == "osd tier add-cache") {
10062 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10063 if (err == -EAGAIN)
10064 goto wait;
10065 if (err)
10066 goto reply;
10067 string poolstr;
10068 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10069 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10070 if (pool_id < 0) {
10071 ss << "unrecognized pool '" << poolstr << "'";
10072 err = -ENOENT;
10073 goto reply;
10074 }
10075 string tierpoolstr;
10076 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10077 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10078 if (tierpool_id < 0) {
10079 ss << "unrecognized pool '" << tierpoolstr << "'";
10080 err = -ENOENT;
10081 goto reply;
10082 }
10083 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10084 assert(p);
10085 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10086 assert(tp);
10087
10088 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10089 goto reply;
10090 }
10091
10092 int64_t size = 0;
10093 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
10094 ss << "unable to parse 'size' value '"
10095 << cmd_vartype_stringify(cmdmap["size"]) << "'";
10096 err = -EINVAL;
10097 goto reply;
10098 }
10099 // make sure new tier is empty
10100 const pool_stat_t *pstats =
10101 mon->pgservice->get_pool_stat(tierpool_id);
10102 if (pstats && pstats->stats.sum.num_objects != 0) {
10103 ss << "tier pool '" << tierpoolstr << "' is not empty";
10104 err = -ENOTEMPTY;
10105 goto reply;
10106 }
10107 string modestr = g_conf->osd_tier_default_cache_mode;
10108 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
10109 if (mode < 0) {
10110 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
10111 err = -EINVAL;
10112 goto reply;
10113 }
10114 HitSet::Params hsp;
10115 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
10116 BloomHitSet::Params *bsp = new BloomHitSet::Params;
10117 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
10118 hsp = HitSet::Params(bsp);
10119 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
10120 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
10121 }
10122 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
10123 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
10124 } else {
10125 ss << "osd tier cache default hit set type '" <<
10126 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
10127 err = -EINVAL;
10128 goto reply;
10129 }
10130 // go
10131 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10132 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10133 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10134 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10135 return true;
10136 }
10137 np->tiers.insert(tierpool_id);
10138 np->read_tier = np->write_tier = tierpool_id;
10139 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10140 np->set_last_force_op_resend(pending_inc.epoch);
10141 ntp->set_last_force_op_resend(pending_inc.epoch);
10142 ntp->tier_of = pool_id;
10143 ntp->cache_mode = mode;
10144 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
10145 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
10146 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
10147 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
10148 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
10149 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
10150 ntp->hit_set_params = hsp;
10151 ntp->target_max_bytes = size;
10152 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
10153 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10154 get_last_committed() + 1));
10155 return true;
10156 } else if (prefix == "osd pool set-quota") {
10157 string poolstr;
10158 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10159 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10160 if (pool_id < 0) {
10161 ss << "unrecognized pool '" << poolstr << "'";
10162 err = -ENOENT;
10163 goto reply;
10164 }
10165
10166 string field;
10167 cmd_getval(g_ceph_context, cmdmap, "field", field);
10168 if (field != "max_objects" && field != "max_bytes") {
10169 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
10170 err = -EINVAL;
10171 goto reply;
10172 }
10173
10174 // val could contain unit designations, so we treat as a string
10175 string val;
10176 cmd_getval(g_ceph_context, cmdmap, "val", val);
10177 stringstream tss;
10178 int64_t value = unit_to_bytesize(val, &tss);
10179 if (value < 0) {
10180 ss << "error parsing value '" << value << "': " << tss.str();
10181 err = value;
10182 goto reply;
10183 }
10184
10185 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
10186 if (field == "max_objects") {
10187 pi->quota_max_objects = value;
10188 } else if (field == "max_bytes") {
10189 pi->quota_max_bytes = value;
10190 } else {
10191 assert(0 == "unrecognized option");
10192 }
10193 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
10194 rs = ss.str();
10195 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10196 get_last_committed() + 1));
10197 return true;
10198
10199 } else if (prefix == "osd reweight-by-pg" ||
10200 prefix == "osd reweight-by-utilization" ||
10201 prefix == "osd test-reweight-by-pg" ||
10202 prefix == "osd test-reweight-by-utilization") {
10203 bool by_pg =
10204 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
10205 bool dry_run =
10206 prefix == "osd test-reweight-by-pg" ||
10207 prefix == "osd test-reweight-by-utilization";
10208 int64_t oload;
10209 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
10210 set<int64_t> pools;
10211 vector<string> poolnamevec;
10212 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
10213 for (unsigned j = 0; j < poolnamevec.size(); j++) {
10214 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
10215 if (pool < 0) {
10216 ss << "pool '" << poolnamevec[j] << "' does not exist";
10217 err = -ENOENT;
10218 goto reply;
10219 }
10220 pools.insert(pool);
10221 }
10222 double max_change = g_conf->mon_reweight_max_change;
10223 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
10224 if (max_change <= 0.0) {
10225 ss << "max_change " << max_change << " must be positive";
10226 err = -EINVAL;
10227 goto reply;
10228 }
10229 int64_t max_osds = g_conf->mon_reweight_max_osds;
10230 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
10231 if (max_osds <= 0) {
10232 ss << "max_osds " << max_osds << " must be positive";
10233 err = -EINVAL;
10234 goto reply;
10235 }
10236 string no_increasing;
10237 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
10238 string out_str;
10239 mempool::osdmap::map<int32_t, uint32_t> new_weights;
10240 err = mon->pgservice->reweight_by_utilization(osdmap,
10241 oload,
10242 max_change,
10243 max_osds,
10244 by_pg,
10245 pools.empty() ? NULL : &pools,
10246 no_increasing == "--no-increasing",
10247 &new_weights,
10248 &ss, &out_str, f.get());
10249 if (err >= 0) {
10250 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
10251 }
10252 if (f)
10253 f->flush(rdata);
10254 else
10255 rdata.append(out_str);
10256 if (err < 0) {
10257 ss << "FAILED reweight-by-pg";
10258 } else if (err == 0 || dry_run) {
10259 ss << "no change";
10260 } else {
10261 ss << "SUCCESSFUL reweight-by-pg";
10262 pending_inc.new_weight = std::move(new_weights);
10263 wait_for_finished_proposal(
10264 op,
10265 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
10266 return true;
10267 }
10268 } else {
10269 err = -EINVAL;
10270 }
10271
10272 reply:
10273 getline(ss, rs);
10274 if (err < 0 && rs.length() == 0)
10275 rs = cpp_strerror(err);
10276 mon->reply_command(op, err, rs, rdata, get_last_committed());
10277 return ret;
10278
10279 update:
10280 getline(ss, rs);
10281 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10282 get_last_committed() + 1));
10283 return true;
10284
10285 wait:
10286 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10287 return true;
10288 }
10289
10290 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
10291 {
10292 op->mark_osdmon_event(__func__);
10293 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10294
10295 if (m->fsid != mon->monmap->fsid) {
10296 dout(0) << __func__ << " drop message on fsid " << m->fsid
10297 << " != " << mon->monmap->fsid << " for " << *m << dendl;
10298 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
10299 return true;
10300 }
10301
10302 if (m->op == POOL_OP_CREATE)
10303 return preprocess_pool_op_create(op);
10304
10305 if (!osdmap.get_pg_pool(m->pool)) {
10306 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
10307 _pool_op_reply(op, 0, osdmap.get_epoch());
10308 return true;
10309 }
10310
10311 // check if the snap and snapname exist
10312 bool snap_exists = false;
10313 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
10314 if (p->snap_exists(m->name.c_str()))
10315 snap_exists = true;
10316
10317 switch (m->op) {
10318 case POOL_OP_CREATE_SNAP:
10319 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
10320 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
10321 return true;
10322 }
10323 if (snap_exists) {
10324 _pool_op_reply(op, 0, osdmap.get_epoch());
10325 return true;
10326 }
10327 return false;
10328 case POOL_OP_CREATE_UNMANAGED_SNAP:
10329 if (p->is_pool_snaps_mode()) {
10330 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
10331 return true;
10332 }
10333 return false;
10334 case POOL_OP_DELETE_SNAP:
10335 if (p->is_unmanaged_snaps_mode()) {
10336 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
10337 return true;
10338 }
10339 if (!snap_exists) {
10340 _pool_op_reply(op, 0, osdmap.get_epoch());
10341 return true;
10342 }
10343 return false;
10344 case POOL_OP_DELETE_UNMANAGED_SNAP:
10345 if (p->is_pool_snaps_mode()) {
10346 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
10347 return true;
10348 }
10349 if (p->is_removed_snap(m->snapid)) {
10350 _pool_op_reply(op, 0, osdmap.get_epoch());
10351 return true;
10352 }
10353 return false;
10354 case POOL_OP_DELETE:
10355 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
10356 _pool_op_reply(op, 0, osdmap.get_epoch());
10357 return true;
10358 }
10359 return false;
10360 case POOL_OP_AUID_CHANGE:
10361 return false;
10362 default:
10363 ceph_abort();
10364 break;
10365 }
10366
10367 return false;
10368 }
10369
10370 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
10371 {
10372 op->mark_osdmon_event(__func__);
10373 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10374 MonSession *session = m->get_session();
10375 if (!session) {
10376 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
10377 return true;
10378 }
10379 if (!session->is_capable("osd", MON_CAP_W)) {
10380 dout(5) << "attempt to create new pool without sufficient auid privileges!"
10381 << "message: " << *m << std::endl
10382 << "caps: " << session->caps << dendl;
10383 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
10384 return true;
10385 }
10386
10387 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
10388 if (pool >= 0) {
10389 _pool_op_reply(op, 0, osdmap.get_epoch());
10390 return true;
10391 }
10392
10393 return false;
10394 }
10395
// Apply a pool op (snap create/delete, unmanaged snap create/delete, auid
// change) to the pending map.  Pool create/delete are dispatched to their
// own prepare functions.  The remaining ops validate against the committed
// pool, then mutate a *projected* copy of the pool (committed state plus any
// already-pending update), stage it in pending_inc, and schedule a client
// reply for when the proposal commits.  Returns true if a proposal is
// pending, false if a reply was already sent.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: validate against the *committed* pool state and answer the
  // idempotent / invalid cases without proposing anything.
  switch (m->op) {
    case POOL_OP_CREATE_SNAP:
      // pool snaps are not allowed on tier (cache) pools
      if (pool->is_tier()) {
        ret = -EINVAL;
        _pool_op_reply(op, ret, osdmap.get_epoch());
        return false;
      }  // else, fall through
    case POOL_OP_DELETE_SNAP:
      if (!pool->is_unmanaged_snaps_mode()) {
        bool snap_exists = pool->snap_exists(m->name.c_str());
        // create of an existing snap / delete of a missing snap: no-op
        if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
          || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
          ret = 0;
        } else {
          break;
        }
      } else {
        // pool snaps cannot be used once unmanaged snaps are in effect
        ret = -EINVAL;
      }
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;

    case POOL_OP_DELETE_UNMANAGED_SNAP:
      // we won't allow removal of an unmanaged snapshot from a pool
      // not in unmanaged snaps mode.
      if (!pool->is_unmanaged_snaps_mode()) {
        _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
        return false;
      }
      /* fall-thru */
    case POOL_OP_CREATE_UNMANAGED_SNAP:
      // but we will allow creating an unmanaged snapshot on any pool
      // as long as it is not in 'pool' snaps mode.
      if (pool->is_pool_snaps_mode()) {
        _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
        return false;
      }
  }

  // projected pool info: prefer the already-pending update (if any) so we
  // build on uncommitted changes rather than clobbering them
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check against
  // the projected state, since a pending update may have switched modes
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: mutate the projected pool; set 'changed' only when the op
  // actually modified something, so no-ops don't bump the snap epoch.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // add_unmanaged_snap fills in the newly allocated snapid, which is
      // returned to the client in the reply payload
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool for the next map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
10539
10540 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
10541 {
10542 op->mark_osdmon_event(__func__);
10543 int err = prepare_new_pool(op);
10544 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
10545 return true;
10546 }
10547
10548 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
10549 ostream *ss)
10550 {
10551 const string& poolstr = osdmap.get_pool_name(pool_id);
10552
10553 // If the Pool is in use by CephFS, refuse to delete it
10554 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
10555 if (pending_fsmap.pool_in_use(pool_id)) {
10556 *ss << "pool '" << poolstr << "' is in use by CephFS";
10557 return -EBUSY;
10558 }
10559
10560 if (pool.tier_of >= 0) {
10561 *ss << "pool '" << poolstr << "' is a tier of '"
10562 << osdmap.get_pool_name(pool.tier_of) << "'";
10563 return -EBUSY;
10564 }
10565 if (!pool.tiers.empty()) {
10566 *ss << "pool '" << poolstr << "' has tiers";
10567 for(auto tier : pool.tiers) {
10568 *ss << " " << osdmap.get_pool_name(tier);
10569 }
10570 return -EBUSY;
10571 }
10572
10573 if (!g_conf->mon_allow_pool_delete) {
10574 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
10575 return -EPERM;
10576 }
10577
10578 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
10579 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
10580 return -EPERM;
10581 }
10582
10583 *ss << "pool '" << poolstr << "' removed";
10584 return 0;
10585 }
10586
10587 /**
10588 * Check if it is safe to add a tier to a base pool
10589 *
10590 * @return
10591 * True if the operation should proceed, false if we should abort here
10592 * (abort doesn't necessarily mean error, could be idempotency)
10593 */
10594 bool OSDMonitor::_check_become_tier(
10595 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
10596 const int64_t base_pool_id, const pg_pool_t *base_pool,
10597 int *err,
10598 ostream *ss) const
10599 {
10600 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
10601 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
10602
10603 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
10604 if (pending_fsmap.pool_in_use(tier_pool_id)) {
10605 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
10606 *err = -EBUSY;
10607 return false;
10608 }
10609
10610 if (base_pool->tiers.count(tier_pool_id)) {
10611 assert(tier_pool->tier_of == base_pool_id);
10612 *err = 0;
10613 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
10614 << base_pool_name << "'";
10615 return false;
10616 }
10617
10618 if (base_pool->is_tier()) {
10619 *ss << "pool '" << base_pool_name << "' is already a tier of '"
10620 << osdmap.get_pool_name(base_pool->tier_of) << "', "
10621 << "multiple tiers are not yet supported.";
10622 *err = -EINVAL;
10623 return false;
10624 }
10625
10626 if (tier_pool->has_tiers()) {
10627 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
10628 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
10629 it != tier_pool->tiers.end(); ++it)
10630 *ss << "'" << osdmap.get_pool_name(*it) << "',";
10631 *ss << " multiple tiers are not yet supported.";
10632 *err = -EINVAL;
10633 return false;
10634 }
10635
10636 if (tier_pool->is_tier()) {
10637 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
10638 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
10639 *err = -EINVAL;
10640 return false;
10641 }
10642
10643 *err = 0;
10644 return true;
10645 }
10646
10647
10648 /**
10649 * Check if it is safe to remove a tier from this base pool
10650 *
10651 * @return
10652 * True if the operation should proceed, false if we should abort here
10653 * (abort doesn't necessarily mean error, could be idempotency)
10654 */
10655 bool OSDMonitor::_check_remove_tier(
10656 const int64_t base_pool_id, const pg_pool_t *base_pool,
10657 const pg_pool_t *tier_pool,
10658 int *err, ostream *ss) const
10659 {
10660 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
10661
10662 // Apply CephFS-specific checks
10663 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
10664 if (pending_fsmap.pool_in_use(base_pool_id)) {
10665 if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
10666 // If the underlying pool is erasure coded, we can't permit the
10667 // removal of the replicated tier that CephFS relies on to access it
10668 *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
10669 *err = -EBUSY;
10670 return false;
10671 }
10672
10673 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
10674 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
10675 "tier is still in use as a writeback cache. Change the cache "
10676 "mode and flush the cache before removing it";
10677 *err = -EBUSY;
10678 return false;
10679 }
10680 }
10681
10682 *err = 0;
10683 return true;
10684 }
10685
10686 int OSDMonitor::_prepare_remove_pool(
10687 int64_t pool, ostream *ss, bool no_fake)
10688 {
10689 dout(10) << "_prepare_remove_pool " << pool << dendl;
10690 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10691 int r = _check_remove_pool(pool, *p, ss);
10692 if (r < 0)
10693 return r;
10694
10695 auto new_pool = pending_inc.new_pools.find(pool);
10696 if (new_pool != pending_inc.new_pools.end()) {
10697 // if there is a problem with the pending info, wait and retry
10698 // this op.
10699 const auto& p = new_pool->second;
10700 int r = _check_remove_pool(pool, p, ss);
10701 if (r < 0)
10702 return -EAGAIN;
10703 }
10704
10705 if (pending_inc.old_pools.count(pool)) {
10706 dout(10) << "_prepare_remove_pool " << pool << " already pending removal"
10707 << dendl;
10708 return 0;
10709 }
10710
10711 if (g_conf->mon_fake_pool_delete && !no_fake) {
10712 string old_name = osdmap.get_pool_name(pool);
10713 string new_name = old_name + "." + stringify(pool) + ".DELETED";
10714 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
10715 << old_name << " -> " << new_name << dendl;
10716 pending_inc.new_pool_names[pool] = new_name;
10717 return 0;
10718 }
10719
10720 // remove
10721 pending_inc.old_pools.insert(pool);
10722
10723 // remove any pg_temp mappings for this pool too
10724 for (auto p = osdmap.pg_temp->begin();
10725 p != osdmap.pg_temp->end();
10726 ++p) {
10727 if (p->first.pool() == (uint64_t)pool) {
10728 dout(10) << "_prepare_remove_pool " << pool << " removing obsolete pg_temp "
10729 << p->first << dendl;
10730 pending_inc.new_pg_temp[p->first].clear();
10731 }
10732 }
10733 for (auto p = osdmap.primary_temp->begin();
10734 p != osdmap.primary_temp->end();
10735 ++p) {
10736 if (p->first.pool() == (uint64_t)pool) {
10737 dout(10) << "_prepare_remove_pool " << pool
10738 << " removing obsolete primary_temp" << p->first << dendl;
10739 pending_inc.new_primary_temp[p->first] = -1;
10740 }
10741 }
10742 return 0;
10743 }
10744
10745 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
10746 {
10747 dout(10) << "_prepare_rename_pool " << pool << dendl;
10748 if (pending_inc.old_pools.count(pool)) {
10749 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
10750 return -ENOENT;
10751 }
10752 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
10753 p != pending_inc.new_pool_names.end();
10754 ++p) {
10755 if (p->second == newname && p->first != pool) {
10756 return -EEXIST;
10757 }
10758 }
10759
10760 pending_inc.new_pool_names[pool] = newname;
10761 return 0;
10762 }
10763
10764 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
10765 {
10766 op->mark_osdmon_event(__func__);
10767 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10768 ostringstream ss;
10769 int ret = _prepare_remove_pool(m->pool, &ss, false);
10770 if (ret == -EAGAIN) {
10771 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10772 return true;
10773 }
10774 if (ret < 0)
10775 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
10776 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
10777 pending_inc.epoch));
10778 return true;
10779 }
10780
10781 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
10782 int ret, epoch_t epoch, bufferlist *blp)
10783 {
10784 op->mark_osdmon_event(__func__);
10785 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10786 dout(20) << "_pool_op_reply " << ret << dendl;
10787 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
10788 ret, epoch, get_last_committed(), blp);
10789 mon->send_reply(op, reply);
10790 }