]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
update sources to v12.2.4
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <locale>
22 #include <sstream>
23
24 #include "mon/OSDMonitor.h"
25 #include "mon/Monitor.h"
26 #include "mon/MDSMonitor.h"
27 #include "mon/PGMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreated.h"
51 #include "messages/MOSDPGTemp.h"
52 #include "messages/MMonCommand.h"
53 #include "messages/MRemoveSnaps.h"
54 #include "messages/MOSDScrub.h"
55 #include "messages/MRoute.h"
56
57 #include "common/TextTable.h"
58 #include "common/Timer.h"
59 #include "common/ceph_argparse.h"
60 #include "common/perf_counters.h"
61 #include "common/strtol.h"
62
63 #include "common/config.h"
64 #include "common/errno.h"
65
66 #include "erasure-code/ErasureCodePlugin.h"
67 #include "compressor/Compressor.h"
68 #include "common/Checksummer.h"
69
70 #include "include/compat.h"
71 #include "include/assert.h"
72 #include "include/stringify.h"
73 #include "include/util.h"
74 #include "common/cmdparse.h"
75 #include "include/str_list.h"
76 #include "include/str_map.h"
77 #include "include/scope_guard.h"
78
79 #include "json_spirit/json_spirit_reader.h"
80
81 #include <boost/algorithm/string/predicate.hpp>
82
#define dout_subsys ceph_subsys_mon

// kv-store prefixes this service persists under (also reported by
// get_store_prefixes() so the mon store sync/trim machinery knows them)
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");

namespace {

// Limits on per-pool application metadata; presumably enforced by the
// "osd pool application ..." command handlers — confirm at the use sites.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;

} // anonymous namespace
94
95 void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
96 {
97 if (epoch_by_pg.size() <= ps) {
98 epoch_by_pg.resize(ps + 1, 0);
99 }
100 const auto old_lec = epoch_by_pg[ps];
101 if (old_lec >= last_epoch_clean) {
102 // stale lec
103 return;
104 }
105 epoch_by_pg[ps] = last_epoch_clean;
106 if (last_epoch_clean < floor) {
107 floor = last_epoch_clean;
108 } else if (last_epoch_clean > floor) {
109 if (old_lec == floor) {
110 // probably should increase floor?
111 auto new_floor = std::min_element(std::begin(epoch_by_pg),
112 std::end(epoch_by_pg));
113 floor = *new_floor;
114 }
115 }
116 if (ps != next_missing) {
117 return;
118 }
119 for (; next_missing < epoch_by_pg.size(); next_missing++) {
120 if (epoch_by_pg[next_missing] == 0) {
121 break;
122 }
123 }
124 }
125
// Forget all last-epoch-clean tracking for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
130
131 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
132 {
133 auto& lec = report_by_pool[pg.pool()];
134 return lec.report(pg.ps(), last_epoch_clean);
135 }
136
137 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
138 {
139 auto floor = latest.get_epoch();
140 for (auto& pool : latest.get_pools()) {
141 auto reported = report_by_pool.find(pool.first);
142 if (reported == report_by_pool.end()) {
143 return 0;
144 }
145 if (reported->second.next_missing < pool.second.get_pg_num()) {
146 return 0;
147 }
148 if (reported->second.floor < floor) {
149 floor = reported->second.floor;
150 }
151 }
152 return floor;
153 }
154
155
// Completion fired when an OSDMap mapping job finishes.  On success it
// refreshes the creating-pgs set and pokes pg-create subscribers; a
// negative r (job canceled) is ignored.
struct C_UpdateCreatingPGs : public Context {
  OSDMonitor *osdmon;
  utime_t start;   // when the job was kicked off, for the timing log below
  epoch_t epoch;   // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
172
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Build the "mon.<name>@<rank>(<state>).osd e<epoch> " prefix used by all
// dout() output in this file.
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
180
// Construct the OSD paxos service.  Both map caches are sized from
// mon_osd_cache_size; the parallel mapper shares the monitor's CPU
// thread pool.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf->mon_osd_cache_size),   // incremental map cache
   full_osd_cache(g_conf->mon_osd_cache_size),  // full map cache
   last_attempted_minwait_time(utime_t()),
   mapper(mn->cct, &mn->cpu_tp),
   op_tracker(cct, true, 1)
{}
194
195 bool OSDMonitor::_have_pending_crush()
196 {
197 return pending_inc.crush.length() > 0;
198 }
199
// Return the crush map of the last *committed* osdmap, never the
// pending one.
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
204
205 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
206 {
207 bufferlist bl;
208 if (pending_inc.crush.length())
209 bl = pending_inc.crush;
210 else
211 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
212
213 bufferlist::iterator p = bl.begin();
214 newcrush.decode(p);
215 }
216
// Build the very first osdmap (epoch 1) for a new cluster and stage it
// in pending_inc as a full map.  Uses a pre-seeded "mkfs/osdmap" blob
// from the store if present, otherwise builds a simple empty map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // operator supplied an initial map at mkfs time; adopt it but force
    // our cluster fsid
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  // new cluster should require latest by default
  if (g_conf->mon_debug_no_require_luminous) {
    // testing escape hatch: pretend to be a pre-luminous cluster
    newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
    derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
  } else {
    newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    newmap.flags |=
      CEPH_OSDMAP_RECOVERY_DELETES |
      CEPH_OSDMAP_PURGED_SNAPDIRS;
    // ratios may be configured as percentages (e.g. 95); normalize to
    // the [0,1] fractions the osdmap stores
    newmap.full_ratio = g_conf->mon_osd_full_ratio;
    if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
    newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
    newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
    if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
    int r = ceph_release_from_name(
      g_conf->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      // misconfiguration: release name did not parse
      assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  newmap.encode(pending_inc.fullmap,
                mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
267
268 void OSDMonitor::get_store_prefixes(std::set<string>& s)
269 {
270 s.insert(service_name);
271 s.insert(OSD_PG_CREATING_PREFIX);
272 s.insert(OSD_METADATA_PREFIX);
273 }
274
// Bring the in-memory osdmap up to date with the paxos-committed state:
// locate and load the newest stashed full map, then replay every
// committed incremental beyond it (persisting full maps as we go), and
// finally refresh all derived state (down_pending_out, subscriptions,
// messenger feature bits, ...).  need_bootstrap is part of the
// PaxosService interface and is not used here.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job was computed against the old map; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for a stored full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // load the newest full map if it is ahead of what we have in memory
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  // luminous quorums persist the creating-pgs set in the mon store
  if (mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    bufferlist bl;
    if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
      auto p = bl.begin();
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      creating_pgs.decode(p);
      dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
              << creating_pgs.last_scan_epoch
              << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
    } else {
      dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
              << dendl;
    }
  }

  // make sure we're using the right pg service.. remove me post-luminous!
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    dout(10) << __func__ << " pgservice is mgrstat" << dendl;
    mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
  } else {
    dout(10) << __func__ << " pgservice is pg" << dendl;
    mon->pgservice = mon->pgmon()->get_pg_stat_service();
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;
        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);
      }
    } else {
      // we built the full map ourselves; stash it
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is only relevant for epoch 1; drop it afterwards
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // make sure we're using the right pg service.. remove me post-luminous!
    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " pgservice is mgrstat" << dendl;
      mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
    } else {
      dout(10) << __func__ << " pgservice is pg" << dendl;
      mon->pgservice = mon->pgmon()->get_pg_stat_service();
    }

    // flush in chunks so a long replay does not build one huge transaction
    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    if (mon->monmap->get_required_features().contains_all(
          ceph::features::mon::FEATURE_LUMINOUS)) {
      for (const auto &osd_state : inc.new_state) {
        if (osd_state.second & CEPH_OSD_UP) {
          // could be marked up *or* down, but we're too lazy to check which
          last_osd_report.erase(osd_state.first);
        }
        if (osd_state.second & CEPH_OSD_EXISTS) {
          // could be created *or* destroyed, but we can safely drop it
          osd_epochs.erase(osd_state.first);
        }
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile down_pending_out with the new map: track when each
  // down+in osd went down so tick() can auto-out it later
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
508
509 void OSDMonitor::start_mapping()
510 {
511 // initiate mapping job
512 if (mapping_job) {
513 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
514 << dendl;
515 mapping_job->abort();
516 }
517 if (!osdmap.get_pools().empty()) {
518 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
519 mapping_job = mapping.start_update(osdmap, mapper,
520 g_conf->mon_osd_mapping_pgs_per_chunk);
521 dout(10) << __func__ << " started mapping job " << mapping_job.get()
522 << " at " << fin->start << dendl;
523 mapping_job->set_finish_event(fin);
524 } else {
525 dout(10) << __func__ << " no pools, no mapping job" << dendl;
526 mapping_job = nullptr;
527 }
528 }
529
530 void OSDMonitor::update_msgr_features()
531 {
532 set<int> types;
533 types.insert((int)entity_name_t::TYPE_OSD);
534 types.insert((int)entity_name_t::TYPE_CLIENT);
535 types.insert((int)entity_name_t::TYPE_MDS);
536 types.insert((int)entity_name_t::TYPE_MON);
537 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
538 uint64_t mask;
539 uint64_t features = osdmap.get_features(*q, &mask);
540 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
541 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
542 Messenger::Policy p = mon->messenger->get_policy(*q);
543 p.features_required = (p.features_required & ~mask) | features;
544 mon->messenger->set_policy(*q, p);
545 }
546 }
547 }
548
549 void OSDMonitor::on_active()
550 {
551 update_logger();
552
553 if (mon->is_leader()) {
554 mon->clog->debug() << "osdmap " << osdmap;
555 } else {
556 list<MonOpRequestRef> ls;
557 take_all_failures(ls);
558 while (!ls.empty()) {
559 MonOpRequestRef op = ls.front();
560 op->mark_osdmon_event(__func__);
561 dispatch(op);
562 ls.pop_front();
563 }
564 }
565 start_mapping();
566 }
567
// Called on election restart: drop cached osd report times, which are
// only meaningful for a continuous tenure.
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
572
// Shut the service down: abort any in-flight mapping job and drop
// queued failure reports/waiters.
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
587
// Refresh the cluster perf counters (osd counts and current epoch) from
// the in-memory osdmap.
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
597
// Start a fresh pending incremental for the next epoch.  Besides the
// plain reset, this performs several upgrade/maintenance fixups that
// must land in the very next map: clearing stale pg_temp/primary_temp,
// migrating full ratios out of PGMap, and rewriting legacy CRUSH
// "ruleset" ids into positional rule ids.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // clean up pg_temp, primary_temp
  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
  dout(10) << "create_pending did clean_temps" << dendl;

  // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
  // instead of osd_backfill_full_ratio config
  if (osdmap.backfillfull_ratio <= 0) {
    pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    // values > 1 are treated as percentages and normalized
    if (pending_inc.new_backfillfull_ratio > 1.0)
      pending_inc.new_backfillfull_ratio /= 100;
    dout(1) << __func__ << " setting backfillfull_ratio = "
            << pending_inc.new_backfillfull_ratio << dendl;
  }
  if (osdmap.get_epoch() > 0 &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // transition full ratios from PGMap to OSDMap (on upgrade)
    float full_ratio = mon->pgservice->get_full_ratio();
    float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
    if (osdmap.full_ratio != full_ratio) {
      dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
               << " -> " << full_ratio << " (from pgmap)" << dendl;
      pending_inc.new_full_ratio = full_ratio;
    }
    if (osdmap.nearfull_ratio != nearfull_ratio) {
      dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
               << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
      pending_inc.new_nearfull_ratio = nearfull_ratio;
    }
  } else {
    // safety check (this shouldn't really happen)
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        // load the original pool info first so we only change crush_rule
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
685
// Compute the creating-pgs set that should accompany the given pending
// incremental: snapshot the current set under lock, fold in newly
// created/deleted pools and pgs confirmed created since the last scan,
// then move queued pgs into the active creating set, capped by
// mon_osd_max_creating_pgs.  Returns the new set by value; the caller
// persists it.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // copy under lock; we mutate the private copy below
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // pre-luminous: pgmap may still know about creating pgs we don't
    if (osdmap.get_epoch() &&
        osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      auto added =
        mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
                                               osdmap.get_pools(),
                                               &pending_creatings);
      dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
    }
    // queue creations for both existing pools and pools new in this inc
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // process queue: promote queued pg ranges into the active creating set,
  // never exceeding 'max' in flight at once
  unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    int n = MIN(max - pending_creatings.pgs.size(),
                p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
                                                    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
780
// Pre-populate pg_temp entries in pending_inc for pgs whose mapping will
// change in the next epoch, so clients keep a working acting set while
// OSDs peer.  Depending on how widespread the change is, this either
// primes everything via a parallel job ("all") or walks only the pgs of
// the affected osds — in both cases bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    // a crush change can remap anything
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // the UP bit is being toggled on a currently-up osd, i.e. it is
    // being marked down in this pending map
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs from anywhere
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs on the first osd times number of osds;
    // above the configured ratio it is cheaper to just prime everything
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // build the map as it will look after pending_inc is applied
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg via the parallel mapper, time-boxed
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // walk only pgs touching the interesting osds, checking the clock
    // every 'chunk' pgs so we respect the time budget
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          // already handled via another osd
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
883
// For one pg, stage a pg_temp entry in pending_inc that pins the
// *current* acting set, so clients can keep doing I/O against it while
// the mapping changes in 'next'.  Skips pgs still being created, pgs
// that no longer exist, and cases where pinning could not help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  if (mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    // TODO: remove this creating_pgs direct access?
    if (creating_pgs.pgs.count(pgid)) {
      return;
    }
  } else {
    if (mon->pgservice->is_creating_pg(pgid)) {
      return;
    }
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the committed map's precomputed mapping)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  // NOTE(review): this early-out requires next_up != next_acting in
  // addition to an unchanged acting set — confirm intent; the comment
  // only mentions "no change".
  if (acting == next_acting && next_up != next_acting)
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry means "remove the pg_temp mapping"
    acting.clear();
    dout(20) << __func__ << "next_up === next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    Mutex::Locker l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
937
938 /**
939 * @note receiving a transaction in this function gives a fair amount of
940 * freedom to the service implementation if it does need it. It shouldn't.
941 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
  assert(r == 0);

  // prime pg_temp from the async mapping job, but only if it finished
  // and was computed against the current epoch; otherwise skip (and log).
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " is prior epoch "
	      << mapping.get_epoch() << dendl;
    } else {
      if (g_conf->mon_osd_prime_pg_temp) {
	maybe_prime_pg_temp();
      }
    }
  } else if (g_conf->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
	    << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates.  these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      ++p;
    }
  }

  bufferlist bl;

  // apply pending_inc to a scratch copy of the map so we can derive
  // luminous-only state (per-pool full flags, min_compat_client, ec
  // profile / pool application conversions) before encoding.
  {
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // remove any legacy osdmap nearfull/full flags
      {
	if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
	  dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
		   << dendl;
	  remove_flag(CEPH_OSDMAP_NEARFULL);
	  remove_flag(CEPH_OSDMAP_FULL);
	}
      }
      // collect which pools are currently affected by
      // the near/backfill/full osd(s),
      // and set per-pool near/backfill/full flag instead
      set<int64_t> full_pool_ids;
      set<int64_t> backfillfull_pool_ids;
      set<int64_t> nearfull_pool_ids;
      tmp.get_full_pools(g_ceph_context,
			 &full_pool_ids,
			 &backfillfull_pool_ids,
			 &nearfull_pool_ids);
      if (full_pool_ids.empty() ||
	  backfillfull_pool_ids.empty() ||
	  nearfull_pool_ids.empty()) {
	// normal case - no nearfull, backfillfull or full osds
	// try cancel any improper nearfull/backfillfull/full pool
	// flags first
	// (each clear below re-checks the matching *_pool_ids.empty(),
	// so entering on "any empty" is safe)
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
	      nearfull_pool_ids.empty()) {
	    dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		     << "'s nearfull flag" << dendl;
	    if (pending_inc.new_pools.count(p) == 0) {
	      // load original pool info first!
	      pending_inc.new_pools[p] = pool.second;
	    }
	    pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
	      backfillfull_pool_ids.empty()) {
	    dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		     << "'s backfillfull flag" << dendl;
	    if (pending_inc.new_pools.count(p) == 0) {
	      pending_inc.new_pools[p] = pool.second;
	    }
	    pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
	      full_pool_ids.empty()) {
	    if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	      // set by EQUOTA, skipping
	      continue;
	    }
	    dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		     << "'s full flag" << dendl;
	    if (pending_inc.new_pools.count(p) == 0) {
	      pending_inc.new_pools[p] = pool.second;
	    }
	    pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
	  }
	}
      }
      if (!full_pool_ids.empty()) {
	dout(10) << __func__ << " marking pool(s) " << full_pool_ids
		 << " as full" << dendl;
	for (auto &p: full_pool_ids) {
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
	    continue;
	  }
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = tmp.pools[p];
	  }
	  // FULL supersedes the weaker flags
	  pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
	// cancel FLAG_FULL for pools which are no longer full too
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (full_pool_ids.count(p)) {
	    // skip pools we have just marked as full above
	    continue;
	  }
	  if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
	      tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	    // don't touch if currently is not full
	    // or is running out of quota (and hence considered as full)
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s full flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
	}
      }
      if (!backfillfull_pool_ids.empty()) {
	for (auto &p: backfillfull_pool_ids) {
	  if (full_pool_ids.count(p)) {
	    // skip pools we have already considered as full above
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	    // make sure FLAG_FULL is truly set, so we are safe not
	    // to set a extra (redundant) FLAG_BACKFILLFULL flag
	    assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	    // don't bother if pool is already marked as backfillfull
	    continue;
	  }
	  dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		   << "'s as backfillfull" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = tmp.pools[p];
	  }
	  pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
	// cancel FLAG_BACKFILLFULL for pools
	// which are no longer backfillfull too
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	    // skip pools we have just marked as backfillfull/full above
	    continue;
	  }
	  if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	    // and don't touch if currently is not backfillfull
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s backfillfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	}
      }
      if (!nearfull_pool_ids.empty()) {
	for (auto &p: nearfull_pool_ids) {
	  if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	    // make sure FLAG_FULL is truly set, so we are safe not
	    // to set a extra (redundant) FLAG_NEARFULL flag
	    assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	    // don't bother if pool is already marked as nearfull
	    continue;
	  }
	  dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		   << "'s as nearfull" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = tmp.pools[p];
	  }
	  pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
	}
	// cancel FLAG_NEARFULL for pools
	// which are no longer nearfull too
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (full_pool_ids.count(p) ||
	      backfillfull_pool_ids.count(p) ||
	      nearfull_pool_ids.count(p)) {
	    // skip pools we have just marked as
	    // nearfull/backfillfull/full above
	    continue;
	  }
	  if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	    // and don't touch if currently is not nearfull
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s nearfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
      }

      // min_compat_client?  (0 == unset; pin it to what is currently required)
      if (tmp.require_min_compat_client == 0) {
	auto mv = tmp.get_min_compat_client();
	dout(1) << __func__ << " setting require_min_compat_client to currently "
		<< "required " << ceph_release_name(mv) << dendl;
	mon->clog->info() << "setting require_min_compat_client to currently "
			  << "required " << ceph_release_name(mv);
	pending_inc.new_require_min_compat_client = mv;
      }

      // one-time conversions applied while crossing the luminous boundary
      if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
	// convert ec profile ruleset-* -> crush-*
	for (auto& p : tmp.erasure_code_profiles) {
	  bool changed = false;
	  map<string,string> newprofile;
	  for (auto& q : p.second) {
	    if (q.first.find("ruleset-") == 0) {
	      string key = "crush-";
	      key += q.first.substr(8);
	      newprofile[key] = q.second;
	      changed = true;
	      dout(20) << " updating ec profile " << p.first
		       << " key " << q.first << " -> " << key << dendl;
	    } else {
	      newprofile[q.first] = q.second;
	    }
	  }
	  if (changed) {
	    dout(10) << " updated ec profile " << p.first << ": "
		     << newprofile << dendl;
	    pending_inc.new_erasure_code_profiles[p.first] = newprofile;
	  }
	}

	// auto-enable pool applications upon upgrade
	// NOTE: this can be removed post-Luminous assuming upgrades need to
	// proceed through Luminous
	for (auto &pool_pair : tmp.pools) {
	  int64_t pool_id = pool_pair.first;
	  pg_pool_t pg_pool = pool_pair.second;
	  if (pg_pool.is_tier()) {
	    continue;
	  }

	  std::string pool_name = tmp.get_pool_name(pool_id);
	  uint32_t match_count = 0;

	  // CephFS
	  FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
	  if (pending_fsmap.pool_in_use(pool_id)) {
	    dout(10) << __func__ << " auto-enabling CephFS on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_CEPHFS, {}});
	    ++match_count;
	  }

	  // RBD heuristics (default OpenStack pool names from docs and
	  // ceph-ansible)
	  if (boost::algorithm::contains(pool_name, "rbd") ||
	      pool_name == "images" || pool_name == "volumes" ||
	      pool_name == "backups" || pool_name == "vms") {
	    dout(10) << __func__ << " auto-enabling RBD on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RBD, {}});
	    ++match_count;
	  }

	  // RGW heuristics
	  if (boost::algorithm::contains(pool_name, ".rgw") ||
	      boost::algorithm::contains(pool_name, ".log") ||
	      boost::algorithm::contains(pool_name, ".intent-log") ||
	      boost::algorithm::contains(pool_name, ".usage") ||
	      boost::algorithm::contains(pool_name, ".users")) {
	    dout(10) << __func__ << " auto-enabling RGW on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RGW, {}});
	    ++match_count;
	  }

	  // OpenStack gnocchi (from ceph-ansible)
	  if (pool_name == "metrics" && match_count == 0) {
	    dout(10) << __func__ << " auto-enabling OpenStack Gnocchi on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert({"openstack_gnocchi", {}});
	    ++match_count;
	  }

	  // only tag the pool when exactly one heuristic matched; an
	  // ambiguous (multi-match) non-empty pool gets a clog notice instead
	  if (match_count == 1) {
	    pg_pool.last_change = pending_inc.epoch;
	    pending_inc.new_pools[pool_id] = pg_pool;
	  } else if (match_count > 1) {
	    auto pstat = mon->pgservice->get_pool_stat(pool_id);
	    if (pstat != nullptr && pstat->stats.sum.num_objects > 0) {
	      mon->clog->info() << "unable to auto-enable application for pool "
				<< "'" << pool_name << "'";
	    }
	  }
	}
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features = mon->get_quorum_con_features();

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features: do not advertise encodings newer
    // than what require_osd_release allows
    if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
	       << dendl;
      features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
    }
    if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
      dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
	       << "MSG_ADDR2" << dendl;
      features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
		    CEPH_FEATURE_MSG_ADDR2);
    }
    if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
      dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
      features &= ~CEPH_FEATURE_SERVER_JEWEL;
    }
    dout(10) << __func__ << " encoding full map with " << features << dendl;

    bufferlist fullbl;
    ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn.  note that old monitors will
    // overwrite this.  new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  assert(get_last_committed() + 1 == pending_inc.epoch);
  ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // and pg creating, also!
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    auto pending_creatings = update_pending_pgs(pending_inc);
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(7) << __func__ << " in the middle of upgrading, "
	      << " trimming pending creating_pgs using pgmap" << dendl;
      mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
    }
    bufferlist creatings_bl;
    ::encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1393
1394 void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
1395 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
1396 {
1397 auto p = creating_pgs->pgs.begin();
1398 while (p != creating_pgs->pgs.end()) {
1399 auto q = pg_stat.find(p->first);
1400 if (q != pg_stat.end() &&
1401 !(q->second.state & PG_STATE_CREATING)) {
1402 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1403 << dendl;
1404 p = creating_pgs->pgs.erase(p);
1405 } else {
1406 ++p;
1407 }
1408 }
1409 }
1410
1411 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1412 {
1413 bufferlist bl;
1414 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1415 if (r < 0)
1416 return r;
1417 try {
1418 bufferlist::iterator p = bl.begin();
1419 ::decode(m, p);
1420 }
1421 catch (buffer::error& e) {
1422 if (err)
1423 *err << "osd." << osd << " metadata is corrupt";
1424 return -EIO;
1425 }
1426 return 0;
1427 }
1428
1429 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
1430 {
1431 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1432 if (osdmap.is_up(osd)) {
1433 map<string,string> meta;
1434 load_metadata(osd, meta, nullptr);
1435 auto p = meta.find(field);
1436 if (p == meta.end()) {
1437 (*out)["unknown"]++;
1438 } else {
1439 (*out)[p->second]++;
1440 }
1441 }
1442 }
1443 }
1444
1445 void OSDMonitor::count_metadata(const string& field, Formatter *f)
1446 {
1447 map<string,int> by_val;
1448 count_metadata(field, &by_val);
1449 f->open_object_section(field.c_str());
1450 for (auto& p : by_val) {
1451 f->dump_int(p.first.c_str(), p.second);
1452 }
1453 f->close_section();
1454 }
1455
1456 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1457 {
1458 map<string, string> metadata;
1459 int r = load_metadata(osd, metadata, nullptr);
1460 if (r < 0)
1461 return r;
1462
1463 auto it = metadata.find("osd_objectstore");
1464 if (it == metadata.end())
1465 return -ENOENT;
1466 *type = it->second;
1467 return 0;
1468 }
1469
1470 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1471 const pg_pool_t &pool,
1472 ostream *err)
1473 {
1474 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1475 // since filestore osds could always join the pool later
1476 set<int> checked_osds;
1477 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1478 vector<int> up, acting;
1479 pg_t pgid(ps, pool_id, -1);
1480 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1481 for (int osd : up) {
1482 if (checked_osds.find(osd) != checked_osds.end())
1483 continue;
1484 string objectstore_type;
1485 int r = get_osd_objectstore_type(osd, &objectstore_type);
1486 // allow with missing metadata, e.g. due to an osd never booting yet
1487 if (r < 0 || objectstore_type == "bluestore") {
1488 checked_osds.insert(osd);
1489 continue;
1490 }
1491 *err << "osd." << osd << " uses " << objectstore_type;
1492 return false;
1493 }
1494 }
1495 return true;
1496 }
1497
1498 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1499 {
1500 map<string,string> m;
1501 if (int r = load_metadata(osd, m, err))
1502 return r;
1503 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1504 f->dump_string(p->first.c_str(), p->second);
1505 return 0;
1506 }
1507
1508 void OSDMonitor::print_nodes(Formatter *f)
1509 {
1510 // group OSDs by their hosts
1511 map<string, list<int> > osds; // hostname => osd
1512 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1513 map<string, string> m;
1514 if (load_metadata(osd, m, NULL)) {
1515 continue;
1516 }
1517 map<string, string>::iterator hostname = m.find("hostname");
1518 if (hostname == m.end()) {
1519 // not likely though
1520 continue;
1521 }
1522 osds[hostname->second].push_back(osd);
1523 }
1524
1525 dump_services(f, osds, "osd");
1526 }
1527
1528 void OSDMonitor::share_map_with_random_osd()
1529 {
1530 if (osdmap.get_num_up_osds() == 0) {
1531 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1532 return;
1533 }
1534
1535 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1536 if (!s) {
1537 dout(10) << __func__ << " no up osd on our session map" << dendl;
1538 return;
1539 }
1540
1541 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
1542 // whatev, they'll request more if they need it
1543 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
1544 s->con->send_message(m);
1545 // NOTE: do *not* record osd has up to this epoch (as we do
1546 // elsewhere) as they may still need to request older values.
1547 }
1548
1549 version_t OSDMonitor::get_trim_to()
1550 {
1551 if (mon->get_quorum().empty()) {
1552 dout(10) << __func__ << ": quorum not formed" << dendl;
1553 return 0;
1554 }
1555
1556 epoch_t floor;
1557 if (mon->monmap->get_required_features().contains_all(
1558 ceph::features::mon::FEATURE_LUMINOUS)) {
1559 {
1560 // TODO: Get this hidden in PGStatService
1561 std::lock_guard<std::mutex> l(creating_pgs_lock);
1562 if (!creating_pgs.pgs.empty()) {
1563 return 0;
1564 }
1565 }
1566 floor = get_min_last_epoch_clean();
1567 } else {
1568 if (!mon->pgservice->is_readable())
1569 return 0;
1570 if (mon->pgservice->have_creating_pgs()) {
1571 return 0;
1572 }
1573 floor = mon->pgservice->get_min_last_epoch_clean();
1574 }
1575 {
1576 dout(10) << " min_last_epoch_clean " << floor << dendl;
1577 if (g_conf->mon_osd_force_trim_to > 0 &&
1578 g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
1579 floor = g_conf->mon_osd_force_trim_to;
1580 dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
1581 }
1582 unsigned min = g_conf->mon_min_osdmap_epochs;
1583 if (floor + min > get_last_committed()) {
1584 if (min < get_last_committed())
1585 floor = get_last_committed() - min;
1586 else
1587 floor = 0;
1588 }
1589 if (floor > get_first_committed())
1590 return floor;
1591 }
1592 return 0;
1593 }
1594
1595 epoch_t OSDMonitor::get_min_last_epoch_clean() const
1596 {
1597 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1598 // also scan osd epochs
1599 // don't trim past the oldest reported osd epoch
1600 for (auto& osd_epoch : osd_epochs) {
1601 if (osd_epoch.second < floor) {
1602 floor = osd_epoch.second;
1603 }
1604 }
1605 return floor;
1606 }
1607
1608 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1609 version_t first)
1610 {
1611 dout(10) << __func__ << " including full map for e " << first << dendl;
1612 bufferlist bl;
1613 get_version_full(first, bl);
1614 put_version_full(tx, first, bl);
1615 }
1616
1617 // -------------
1618
1619 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
1620 {
1621 op->mark_osdmon_event(__func__);
1622 Message *m = op->get_req();
1623 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
1624
1625 switch (m->get_type()) {
1626 // READs
1627 case MSG_MON_COMMAND:
1628 return preprocess_command(op);
1629 case CEPH_MSG_MON_GET_OSDMAP:
1630 return preprocess_get_osdmap(op);
1631
1632 // damp updates
1633 case MSG_OSD_MARK_ME_DOWN:
1634 return preprocess_mark_me_down(op);
1635 case MSG_OSD_FULL:
1636 return preprocess_full(op);
1637 case MSG_OSD_FAILURE:
1638 return preprocess_failure(op);
1639 case MSG_OSD_BOOT:
1640 return preprocess_boot(op);
1641 case MSG_OSD_ALIVE:
1642 return preprocess_alive(op);
1643 case MSG_OSD_PG_CREATED:
1644 return preprocess_pg_created(op);
1645 case MSG_OSD_PGTEMP:
1646 return preprocess_pgtemp(op);
1647 case MSG_OSD_BEACON:
1648 return preprocess_beacon(op);
1649
1650 case CEPH_MSG_POOLOP:
1651 return preprocess_pool_op(op);
1652
1653 case MSG_REMOVE_SNAPS:
1654 return preprocess_remove_snaps(op);
1655
1656 default:
1657 ceph_abort();
1658 return true;
1659 }
1660 }
1661
1662 bool OSDMonitor::prepare_update(MonOpRequestRef op)
1663 {
1664 op->mark_osdmon_event(__func__);
1665 Message *m = op->get_req();
1666 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
1667
1668 switch (m->get_type()) {
1669 // damp updates
1670 case MSG_OSD_MARK_ME_DOWN:
1671 return prepare_mark_me_down(op);
1672 case MSG_OSD_FULL:
1673 return prepare_full(op);
1674 case MSG_OSD_FAILURE:
1675 return prepare_failure(op);
1676 case MSG_OSD_BOOT:
1677 return prepare_boot(op);
1678 case MSG_OSD_ALIVE:
1679 return prepare_alive(op);
1680 case MSG_OSD_PG_CREATED:
1681 return prepare_pg_created(op);
1682 case MSG_OSD_PGTEMP:
1683 return prepare_pgtemp(op);
1684 case MSG_OSD_BEACON:
1685 return prepare_beacon(op);
1686
1687 case MSG_MON_COMMAND:
1688 return prepare_command(op);
1689
1690 case CEPH_MSG_POOLOP:
1691 return prepare_pool_op(op);
1692
1693 case MSG_REMOVE_SNAPS:
1694 return prepare_remove_snaps(op);
1695
1696
1697 default:
1698 ceph_abort();
1699 }
1700
1701 return false;
1702 }
1703
1704 bool OSDMonitor::should_propose(double& delay)
1705 {
1706 dout(10) << "should_propose" << dendl;
1707
1708 // if full map, propose immediately! any subsequent changes will be clobbered.
1709 if (pending_inc.fullmap.length())
1710 return true;
1711
1712 // adjust osd weights?
1713 if (!osd_weight.empty() &&
1714 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
1715 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
1716 osdmap.adjust_osd_weights(osd_weight, pending_inc);
1717 delay = 0.0;
1718 osd_weight.clear();
1719 return true;
1720 }
1721
1722 // propose as fast as possible if updating up_thru or pg_temp
1723 // want to merge OSDMap changes as much as possible
1724 if ((pending_inc.new_primary_temp.size() == 1
1725 || pending_inc.new_up_thru.size() == 1)
1726 && pending_inc.new_state.size() < 2) {
1727 dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;
1728
1729 utime_t now = ceph_clock_now();
1730 if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
1731 && now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
1732 delay = g_conf->paxos_min_wait;
1733 last_attempted_minwait_time = now;
1734 return true;
1735 }
1736 }
1737
1738 return PaxosService::should_propose(delay);
1739 }
1740
1741
1742
1743 // ---------------------------
1744 // READs
1745
1746 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
1747 {
1748 op->mark_osdmon_event(__func__);
1749 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
1750 dout(10) << __func__ << " " << *m << dendl;
1751 MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
1752 epoch_t first = get_first_committed();
1753 epoch_t last = osdmap.get_epoch();
1754 int max = g_conf->osd_map_message_max;
1755 for (epoch_t e = MAX(first, m->get_full_first());
1756 e <= MIN(last, m->get_full_last()) && max > 0;
1757 ++e, --max) {
1758 int r = get_version_full(e, reply->maps[e]);
1759 assert(r >= 0);
1760 }
1761 for (epoch_t e = MAX(first, m->get_inc_first());
1762 e <= MIN(last, m->get_inc_last()) && max > 0;
1763 ++e, --max) {
1764 int r = get_version(e, reply->incremental_maps[e]);
1765 assert(r >= 0);
1766 }
1767 reply->oldest_map = first;
1768 reply->newest_map = last;
1769 mon->send_reply(op, reply);
1770 return true;
1771 }
1772
1773
1774 // ---------------------------
1775 // UPDATEs
1776
1777 // failure --
1778
1779 bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1780 // check permissions
1781 MonSession *session = m->get_session();
1782 if (!session)
1783 return true;
1784 if (!session->is_capable("osd", MON_CAP_X)) {
1785 dout(0) << "got MOSDFailure from entity with insufficient caps "
1786 << session->caps << dendl;
1787 return true;
1788 }
1789 if (fsid != mon->monmap->fsid) {
1790 dout(0) << "check_source: on fsid " << fsid
1791 << " != " << mon->monmap->fsid << dendl;
1792 return true;
1793 }
1794 return false;
1795 }
1796
1797
1798 bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
1799 {
1800 op->mark_osdmon_event(__func__);
1801 MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
1802 // who is target_osd
1803 int badboy = m->get_target().name.num();
1804
1805 // check permissions
1806 if (check_source(m, m->fsid))
1807 goto didit;
1808
1809 // first, verify the reporting host is valid
1810 if (m->get_orig_source().is_osd()) {
1811 int from = m->get_orig_source().num();
1812 if (!osdmap.exists(from) ||
1813 osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
1814 (osdmap.is_down(from) && m->if_osd_failed())) {
1815 dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
1816 send_incremental(op, m->get_epoch()+1);
1817 goto didit;
1818 }
1819 }
1820
1821
1822 // weird?
1823 if (osdmap.is_down(badboy)) {
1824 dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
1825 if (m->get_epoch() < osdmap.get_epoch())
1826 send_incremental(op, m->get_epoch()+1);
1827 goto didit;
1828 }
1829 if (osdmap.get_inst(badboy) != m->get_target()) {
1830 dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
1831 << ", from " << m->get_orig_source_inst() << dendl;
1832 if (m->get_epoch() < osdmap.get_epoch())
1833 send_incremental(op, m->get_epoch()+1);
1834 goto didit;
1835 }
1836
1837 // already reported?
1838 if (osdmap.is_down(badboy) ||
1839 osdmap.get_up_from(badboy) > m->get_epoch()) {
1840 dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
1841 if (m->get_epoch() < osdmap.get_epoch())
1842 send_incremental(op, m->get_epoch()+1);
1843 goto didit;
1844 }
1845
1846 if (!can_mark_down(badboy)) {
1847 dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
1848 goto didit;
1849 }
1850
1851 dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
1852 return false;
1853
1854 didit:
1855 return true;
1856 }
1857
1858 class C_AckMarkedDown : public C_MonOp {
1859 OSDMonitor *osdmon;
1860 public:
1861 C_AckMarkedDown(
1862 OSDMonitor *osdmon,
1863 MonOpRequestRef op)
1864 : C_MonOp(op), osdmon(osdmon) {}
1865
1866 void _finish(int) override {
1867 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1868 osdmon->mon->send_reply(
1869 op,
1870 new MOSDMarkMeDown(
1871 m->fsid,
1872 m->get_target(),
1873 m->get_epoch(),
1874 false)); // ACK itself does not request an ack
1875 }
1876 ~C_AckMarkedDown() override {
1877 }
1878 };
1879
1880 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
1881 {
1882 op->mark_osdmon_event(__func__);
1883 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1884 int requesting_down = m->get_target().name.num();
1885 int from = m->get_orig_source().num();
1886
1887 // check permissions
1888 if (check_source(m, m->fsid))
1889 goto reply;
1890
1891 // first, verify the reporting host is valid
1892 if (!m->get_orig_source().is_osd())
1893 goto reply;
1894
1895 if (!osdmap.exists(from) ||
1896 osdmap.is_down(from) ||
1897 osdmap.get_addr(from) != m->get_target().addr) {
1898 dout(5) << "preprocess_mark_me_down from dead osd."
1899 << from << ", ignoring" << dendl;
1900 send_incremental(op, m->get_epoch()+1);
1901 goto reply;
1902 }
1903
1904 // no down might be set
1905 if (!can_mark_down(requesting_down))
1906 goto reply;
1907
1908 dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
1909 return false;
1910
1911 reply:
1912 if (m->request_ack) {
1913 Context *c(new C_AckMarkedDown(this, op));
1914 c->complete(0);
1915 }
1916 return true;
1917 }
1918
1919 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
1920 {
1921 op->mark_osdmon_event(__func__);
1922 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1923 int target_osd = m->get_target().name.num();
1924
1925 assert(osdmap.is_up(target_osd));
1926 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
1927
1928 mon->clog->info() << "osd." << target_osd << " marked itself down";
1929 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1930 if (m->request_ack)
1931 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
1932 return true;
1933 }
1934
1935 bool OSDMonitor::can_mark_down(int i)
1936 {
1937 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
1938 dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
1939 << " down" << dendl;
1940 return false;
1941 }
1942
1943 if (osdmap.is_nodown(i)) {
1944 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
1945 << "will not mark it down" << dendl;
1946 return false;
1947 }
1948
1949 int num_osds = osdmap.get_num_osds();
1950 if (num_osds == 0) {
1951 dout(5) << __func__ << " no osds" << dendl;
1952 return false;
1953 }
1954 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
1955 float up_ratio = (float)up / (float)num_osds;
1956 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
1957 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
1958 << g_conf->mon_osd_min_up_ratio
1959 << ", will not mark osd." << i << " down" << dendl;
1960 return false;
1961 }
1962 return true;
1963 }
1964
1965 bool OSDMonitor::can_mark_up(int i)
1966 {
1967 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
1968 dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
1969 << " up" << dendl;
1970 return false;
1971 }
1972
1973 if (osdmap.is_noup(i)) {
1974 dout(5) << __func__ << " osd." << i << " is marked as noup, "
1975 << "will not mark it up" << dendl;
1976 return false;
1977 }
1978
1979 return true;
1980 }
1981
1982 /**
1983 * @note the parameter @p i apparently only exists here so we can output the
1984 * osd's id on messages.
1985 */
1986 bool OSDMonitor::can_mark_out(int i)
1987 {
1988 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
1989 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
1990 return false;
1991 }
1992
1993 if (osdmap.is_noout(i)) {
1994 dout(5) << __func__ << " osd." << i << " is marked as noout, "
1995 << "will not mark it out" << dendl;
1996 return false;
1997 }
1998
1999 int num_osds = osdmap.get_num_osds();
2000 if (num_osds == 0) {
2001 dout(5) << __func__ << " no osds" << dendl;
2002 return false;
2003 }
2004 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
2005 float in_ratio = (float)in / (float)num_osds;
2006 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
2007 if (i >= 0)
2008 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2009 << g_conf->mon_osd_min_in_ratio
2010 << ", will not mark osd." << i << " out" << dendl;
2011 else
2012 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2013 << g_conf->mon_osd_min_in_ratio
2014 << ", will not mark osds out" << dendl;
2015 return false;
2016 }
2017
2018 return true;
2019 }
2020
2021 bool OSDMonitor::can_mark_in(int i)
2022 {
2023 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
2024 dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
2025 << " in" << dendl;
2026 return false;
2027 }
2028
2029 if (osdmap.is_noin(i)) {
2030 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2031 << "will not mark it in" << dendl;
2032 return false;
2033 }
2034
2035 return true;
2036 }
2037
2038 bool OSDMonitor::check_failures(utime_t now)
2039 {
2040 bool found_failure = false;
2041 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2042 p != failure_info.end();
2043 ++p) {
2044 if (can_mark_down(p->first)) {
2045 found_failure |= check_failure(now, p->first, p->second);
2046 }
2047 }
2048 return found_failure;
2049 }
2050
// Decide whether osd.target_osd should be marked down based on the failure
// reports accumulated in @p fi.  The osd is failed once it has been
// reported down for longer than the (optionally laggy-adjusted) grace
// period AND enough distinct failure domains have reported it.
// Returns true when a down-mark for the osd is (or already was) queued in
// pending_inc.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  // Reporters are bucketed by their crush subtree (e.g. host or rack, per
  // mon_osd_reporter_subtree_level) so mon_osd_min_down_reporters counts
  // failure domains rather than individual osds.
  set<string> reporters_by_subtree;
  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    // decay_k is negative, so exp(t * decay_k) halves every `halflife`.
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
             << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf->mon_osd_adjust_heartbeat_grace) {
      // Extend the grace by each reporter's own laggy history, decayed by
      // how long ago that reporter was last down.
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
           << fi.reporters.size() << " reporters, "
           << grace << " grace (" << orig_grace << " + " << my_grace
           << " + " << peer_grace << "), max_failed_since " << max_failed_since
           << dendl;

  if (failed_for >= grace &&
      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
            << " down" << dendl;
    // CEPH_OSD_UP is XORed into the osd state, i.e. this clears the up bit.
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
                      << osdmap.crush->get_full_location_ordered_string(
                        target_osd)
                      << ") ("
                      << (int)reporters_by_subtree.size()
                      << " reporters from different "
                      << reporter_subtree_level << " after "
                      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
2138
2139 void OSDMonitor::force_failure(int target_osd, int by)
2140 {
2141 // already pending failure?
2142 if (pending_inc.new_state.count(target_osd) &&
2143 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2144 dout(10) << " already pending failure" << dendl;
2145 return;
2146 }
2147
2148 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2149 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2150
2151 mon->clog->info() << "osd." << target_osd << " failed ("
2152 << osdmap.crush->get_full_location_ordered_string(target_osd)
2153 << ") (connection refused reported by osd." << by << ")";
2154 return;
2155 }
2156
// Handle an MOSDFailure on the leader: either record/strengthen a failure
// report against the target osd (possibly marking it down right away), or
// cancel a previously filed report.  Returns true when a map update needs
// to be proposed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure " << m->get_target()
          << " from " << m->get_orig_source_inst()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  // preprocess_failure is expected to have filtered stale targets.
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  if (m->if_osd_failed()) {
    // calculate failure time
    // The reporter has seen the target down for m->failed_for seconds, so
    // back-date the failure from the message's receive stamp.
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: fail without waiting for more reporters.
      mon->clog->debug() << m->get_target() << " reported immediately failed by "
            << m->get_orig_source_inst();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << m->get_target() << " reported failed by "
		      << m->get_orig_source_inst();

    failure_info_t& fi = failure_info[target_osd];
    // add_report returns any earlier op from the same reporter; drop our
    // reply obligation for it since this op supersedes it.
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << m->get_target() << " failure report canceled by "
		       << m->get_orig_source_inst();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2219
2220 void OSDMonitor::process_failures()
2221 {
2222 map<int,failure_info_t>::iterator p = failure_info.begin();
2223 while (p != failure_info.end()) {
2224 if (osdmap.is_up(p->first)) {
2225 ++p;
2226 } else {
2227 dout(10) << "process_failures osd." << p->first << dendl;
2228 list<MonOpRequestRef> ls;
2229 p->second.take_report_messages(ls);
2230 failure_info.erase(p++);
2231
2232 while (!ls.empty()) {
2233 MonOpRequestRef o = ls.front();
2234 if (o) {
2235 o->mark_event(__func__);
2236 MOSDFailure *m = o->get_req<MOSDFailure>();
2237 send_latest(o, m->get_epoch());
2238 }
2239 ls.pop_front();
2240 }
2241 }
2242 }
2243 }
2244
2245 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2246 {
2247 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2248
2249 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2250 p != failure_info.end();
2251 ++p) {
2252 p->second.take_report_messages(ls);
2253 }
2254 failure_info.clear();
2255 }
2256
2257
2258 // boot --
2259
// Read-only triage of an MOSDBoot.  Returns true when the message is fully
// handled here (bad caps/fsid, missing features, duplicate boot, stale
// message, or noup) and false when prepare_boot() should propose a map
// change marking the osd up.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // The booting osd must belong to this cluster.
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // check if osd has required features to boot
  // Each gate below refuses osds that lack a feature the current osdmap
  // (or its flags) already requires.
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // require_osd_release gates: refuse osds older than the release floor
  // recorded in the osdmap.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_LUMINOUS"
		      << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
      !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_JEWEL"
		      << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
      !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_KRAKEN"
		      << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES) &&
      !(m->osd_features & CEPH_FEATURE_OSD_RECOVERY_DELETES)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
    goto ignore;
  }

  // If any pool uses GMT hitsets, the booting osd must support them too.
  if (any_of(osdmap.get_pools().begin(),
	     osdmap.get_pools().end(),
	     [](const std::pair<int64_t,pg_pool_t>& pool)
	     { return pool.second.use_gmt_hitset; })) {
    assert(osdmap.get_num_up_osds() == 0 ||
	   osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
	      << m->get_orig_source_inst()
	      << " doesn't announce support -- ignore" << dendl;
      goto ignore;
    }
  }

  // make sure upgrades stop at luminous
  if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    mon->clog->info() << "disallowing boot of post-luminous OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < luminous";
    goto ignore;
  }

  // make sure upgrades stop at jewel
  if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
      osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
    mon->clog->info() << "disallowing boot of post-jewel OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < jewel";
    goto ignore;
  }

  // make sure upgrades stop at hammer
  //  * HAMMER_0_94_4 is the required hammer feature
  //  * MON_METADATA is the first post-hammer feature
  if (osdmap.get_num_up_osds() > 0) {
    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
	!(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
      mon->clog->info() << "disallowing boot of post-hammer OSD "
			<< m->get_orig_source_inst()
			<< " because one or more up OSDs is pre-hammer v0.94.4";
      goto ignore;
    }
    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
			<< m->get_orig_source_inst()
			<< " because all up OSDs are post-hammer";
      goto ignore;
    }
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst() &&
      osdmap.get_cluster_addr(from) == m->cluster_addr) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
	    << " == " << osdmap.get_inst(from) << dendl;
    _booted(op, false);
    return true;
  }

  // An existing osd id with a different fsid means a different physical
  // osd is trying to reuse the id; refuse it.
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // A boot message predating the osd's last up_from is stale; just send
  // the newer maps so the osd can catch up and retry.
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2457
// Propose the map changes that mark a booting osd up: record its
// addresses, uuid, metadata, clean-interval, laggy statistics, and
// possibly an automatic in-mark.  If the osd's previous incarnation is
// still up, mark that incarnation down first and retry the boot after the
// proposal commits.  Always returns true (a proposal is pending).
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
	  << " cluster_addr " << m->cluster_addr
	  << " hb_back_addr " << m->hb_back_addr
	  << " hb_front_addr " << m->hb_front_addr
	  << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // new_state bits are XOR deltas, so fold any pending flip into the
  // committed state to get the effective old state.
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down "
	    << osdmap.get_inst(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
           osdmap.get_cluster_addr(from) != m->cluster_addr);
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot once the down-mark has committed.
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // Update the exponentially-decayed laggy statistics: a boot_epoch of 0
    // indicates a first boot (not laggy); otherwise fold the observed down
    // interval into laggy_interval/laggy_probability.
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf->mon_osd_laggy_max_interval &&
	    (interval > g_conf->mon_osd_laggy_max_interval)) {
          interval =  g_conf->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out.
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
2611
2612 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2613 {
2614 op->mark_osdmon_event(__func__);
2615 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2616 dout(7) << "_booted " << m->get_orig_source_inst()
2617 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2618
2619 if (logit) {
2620 mon->clog->info() << m->get_orig_source_inst() << " boot";
2621 }
2622
2623 send_latest(op, m->sb.current_epoch+1);
2624 }
2625
2626
2627 // -------------
2628 // full
2629
// Read-only triage of an MOSDFull (an osd reporting its full/backfillfull/
// nearfull status).  Returns true when the message is dropped or already
// satisfied; false when prepare_full() should update the osd's state bits.
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // only these three state bits are managed via MOSDFull.
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // accept only from the instance currently registered in the map (or its
  // most recent instance while it is down).
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
      (osdmap.is_up(from) &&
       osdmap.get_inst(from) != m->get_orig_source_inst())) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change if the committed bits already match the request.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2679
// Queue the state-bit flips that bring an osd's full/backfillfull/nearfull
// bits to what it reported.  new_state entries are XOR deltas against the
// committed state, so any pending delta is folded in first.  Always
// returns true and replies once the proposal commits.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current bits = committed state XOR pending delta.
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // clear any pending flips of the masked bits before re-deriving them.
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed bits and desired bits yields the delta to apply.
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2717
2718 // -------------
2719 // alive
2720
// Read-only triage of an MOSDAlive (an osd asking for its up_thru to be
// advanced).  Returns true when the message is dropped or the request is
// already satisfied; false when prepare_alive() should record the new
// up_thru value.
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // only the osd instance currently registered as up may advance up_thru.
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // already satisfied: just send the osd the maps it wants.
  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2757
2758 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2759 {
2760 op->mark_osdmon_event(__func__);
2761 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2762 int from = m->get_orig_source().num();
2763
2764 if (0) { // we probably don't care much about these
2765 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2766 }
2767
2768 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2769 << " from " << m->get_orig_source_inst() << dendl;
2770
2771 update_up_thru(from, m->version); // set to the latest map the OSD has
2772 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2773 return true;
2774 }
2775
2776 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2777 {
2778 op->mark_osdmon_event(__func__);
2779 dout(7) << "_reply_map " << e
2780 << " from " << op->get_req()->get_orig_source_inst()
2781 << dendl;
2782 send_latest(op, e);
2783 }
2784
2785 // pg_created
2786 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
2787 {
2788 op->mark_osdmon_event(__func__);
2789 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2790 dout(10) << __func__ << " " << *m << dendl;
2791 auto session = m->get_session();
2792 if (!session) {
2793 dout(10) << __func__ << ": no monitor session!" << dendl;
2794 return true;
2795 }
2796 if (!session->is_capable("osd", MON_CAP_X)) {
2797 derr << __func__ << " received from entity "
2798 << "with insufficient privileges " << session->caps << dendl;
2799 return true;
2800 }
2801 // always forward the "created!" to the leader
2802 return false;
2803 }
2804
2805 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
2806 {
2807 op->mark_osdmon_event(__func__);
2808 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2809 dout(10) << __func__ << " " << *m << dendl;
2810 auto src = m->get_orig_source();
2811 auto from = src.num();
2812 if (!src.is_osd() ||
2813 !mon->osdmon()->osdmap.is_up(from) ||
2814 m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
2815 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
2816 return false;
2817 }
2818 pending_created_pgs.push_back(m->pgid);
2819 return true;
2820 }
2821
2822 // -------------
2823 // pg_temp changes
2824
// Preprocess an MOSDPGTemp message from an OSD.
//
// Returns true when the message is fully handled here (dropped for caps /
// staleness, or answered with a map reply because it would cause no map
// change); returns false to fall through to prepare_pgtemp() for a map
// update.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;  // pgs we decided to skip (removed pool, stale primary)

  // check caps
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // only accept pg_temp from an osd that is up and whose address matches
  // the current map; anything else is stale
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // a forced pg_temp always goes through the prepare path
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;  // needs a map change: clear the existing mapping
    // change?
    //  NOTE: we assume that this will clear pg_primary, so consider
    //        an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
	 osdmap.primary_temp->count(p->first)))
      return false;  // needs a map change: set or update the mapping
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing in the message requires a map change; just reply with the map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
2915
2916 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
2917 {
2918 epoch_t old_up_thru = osdmap.get_up_thru(from);
2919 auto ut = pending_inc.new_up_thru.find(from);
2920 if (ut != pending_inc.new_up_thru.end()) {
2921 old_up_thru = ut->second;
2922 }
2923 if (up_thru > old_up_thru) {
2924 // set up_thru too, so the osd doesn't have to ask again
2925 pending_inc.new_up_thru[from] = up_thru;
2926 }
2927 }
2928
// Apply the pg_temp mappings from an MOSDPGTemp message to the pending
// incremental, skipping pools that are gone or pending removal.  Also bumps
// the sender's up_thru.  Always returns true: a proposal is made and the
// osd is replied to once it commits.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // pool deletion already queued in this epoch wins over the pg_temp
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
2964
2965
2966 // ---
2967
// Decide whether an MRemoveSnaps message requires a map change.  Returns
// true when the message should be ignored (missing session / insufficient
// caps) or every requested snap is already recorded as removed; returns
// false to fall through to prepare_remove_snaps().
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	g_ceph_context,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap beyond the pool's snap_seq, or not yet in removed_snaps,
      // means the map must be updated
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3008
// Record the requested snap removals in the pending incremental: for each
// snap not already removed (in the committed map or a pending pool update),
// add it to the pool's removed_snaps, bump snap_seq if needed, and stamp
// snap_epoch.  Always returns true (a proposal is needed).
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // only touch snaps not already marked removed anywhere
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	// get (or create) the pending copy of this pool and update it
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }
  return true;
}
3045
3046 // osd beacon
3047 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3048 {
3049 op->mark_osdmon_event(__func__);
3050 auto beacon = static_cast<MOSDBeacon*>(op->get_req());
3051 // check caps
3052 auto session = beacon->get_session();
3053 if (!session) {
3054 dout(10) << __func__ << " no monitor session!" << dendl;
3055 return true;
3056 }
3057 if (!session->is_capable("osd", MON_CAP_X)) {
3058 derr << __func__ << " received from entity "
3059 << "with insufficient privileges " << session->caps << dendl;
3060 return true;
3061 }
3062 // Always forward the beacon to the leader, even if they are the same as
3063 // the old one. The leader will mark as down osds that haven't sent
3064 // beacon for a few minutes.
3065 return false;
3066 }
3067
3068 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
3069 {
3070 op->mark_osdmon_event(__func__);
3071 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
3072 const auto src = beacon->get_orig_source();
3073 dout(10) << __func__ << " " << *beacon
3074 << " from " << src << dendl;
3075 int from = src.num();
3076
3077 if (!src.is_osd() ||
3078 !osdmap.is_up(from) ||
3079 beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
3080 dout(1) << " ignoring beacon from non-active osd." << dendl;
3081 return false;
3082 }
3083
3084 last_osd_report[from] = ceph_clock_now();
3085 osd_epochs[from] = beacon->version;
3086
3087 for (const auto& pg : beacon->pgs) {
3088 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
3089 }
3090 return false;
3091 }
3092
3093 // ---------------
3094 // map helpers
3095
3096 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3097 {
3098 op->mark_osdmon_event(__func__);
3099 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3100 << " start " << start << dendl;
3101 if (start == 0)
3102 send_full(op);
3103 else
3104 send_incremental(op, start);
3105 }
3106
3107
3108 MOSDMap *OSDMonitor::build_latest_full()
3109 {
3110 MOSDMap *r = new MOSDMap(mon->monmap->fsid);
3111 get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
3112 r->oldest_map = get_first_committed();
3113 r->newest_map = osdmap.get_epoch();
3114 return r;
3115 }
3116
3117 MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
3118 {
3119 dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
3120 MOSDMap *m = new MOSDMap(mon->monmap->fsid);
3121 m->oldest_map = get_first_committed();
3122 m->newest_map = osdmap.get_epoch();
3123
3124 for (epoch_t e = to; e >= from && e > 0; e--) {
3125 bufferlist bl;
3126 int err = get_version(e, bl);
3127 if (err == 0) {
3128 assert(bl.length());
3129 // if (get_version(e, bl) > 0) {
3130 dout(20) << "build_incremental inc " << e << " "
3131 << bl.length() << " bytes" << dendl;
3132 m->incremental_maps[e] = bl;
3133 } else {
3134 assert(err == -ENOENT);
3135 assert(!bl.length());
3136 get_version_full(e, bl);
3137 if (bl.length() > 0) {
3138 //else if (get_version("full", e, bl) > 0) {
3139 dout(20) << "build_incremental full " << e << " "
3140 << bl.length() << " bytes" << dendl;
3141 m->maps[e] = bl;
3142 } else {
3143 ceph_abort(); // we should have all maps.
3144 }
3145 }
3146 }
3147 return m;
3148 }
3149
3150 void OSDMonitor::send_full(MonOpRequestRef op)
3151 {
3152 op->mark_osdmon_event(__func__);
3153 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
3154 mon->send_reply(op, build_latest_full());
3155 }
3156
3157 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
3158 {
3159 op->mark_osdmon_event(__func__);
3160
3161 MonSession *s = op->get_session();
3162 assert(s);
3163
3164 if (s->proxy_con &&
3165 s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
3166 // oh, we can tell the other mon to do it
3167 dout(10) << __func__ << " asking proxying mon to send_incremental from "
3168 << first << dendl;
3169 MRoute *r = new MRoute(s->proxy_tid, NULL);
3170 r->send_osdmap_first = first;
3171 s->proxy_con->send_message(r);
3172 op->mark_event("reply: send routed send_osdmap_first reply");
3173 } else {
3174 // do it ourselves
3175 send_incremental(first, s, false, op);
3176 }
3177 }
3178
// Send osdmaps in [first, current] to a session: a base full map when
// 'first' predates our oldest committed epoch, then batches of
// incrementals.  When 'req' is set, exactly one reply message is sent (the
// requester will ask again for the rest); when 'onetime' is set, only one
// batch is sent.  session->osd_epoch tracks what the peer already has and
// is advanced as we send.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->inst << dendl;

  // skip epochs the session is known to already have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->inst << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // the requested range starts before our oldest committed map; start
    // with the full map at our oldest epoch instead
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;

    if (req) {
      // one reply only; the peer will re-request the rest
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch at most osd_map_message_max epochs per message
    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
		       osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
3237
3238 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
3239 {
3240 if (inc_osd_cache.lookup(ver, &bl)) {
3241 return 0;
3242 }
3243 int ret = PaxosService::get_version(ver, bl);
3244 if (!ret) {
3245 inc_osd_cache.add(ver, bl);
3246 }
3247 return ret;
3248 }
3249
3250 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3251 {
3252 if (full_osd_cache.lookup(ver, &bl)) {
3253 return 0;
3254 }
3255 int ret = PaxosService::get_version_full(ver, bl);
3256 if (!ret) {
3257 full_osd_cache.add(ver, bl);
3258 }
3259 return ret;
3260 }
3261
3262 epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
3263 {
3264 dout(10) << "blacklist " << a << " until " << until << dendl;
3265 pending_inc.new_blacklist[a] = until;
3266 return pending_inc.epoch;
3267 }
3268
3269
3270 void OSDMonitor::check_osdmap_subs()
3271 {
3272 dout(10) << __func__ << dendl;
3273 if (!osdmap.get_epoch()) {
3274 return;
3275 }
3276 auto osdmap_subs = mon->session_map.subs.find("osdmap");
3277 if (osdmap_subs == mon->session_map.subs.end()) {
3278 return;
3279 }
3280 auto p = osdmap_subs->second->begin();
3281 while (!p.end()) {
3282 auto sub = *p;
3283 ++p;
3284 check_osdmap_sub(sub);
3285 }
3286 }
3287
3288 void OSDMonitor::check_osdmap_sub(Subscription *sub)
3289 {
3290 dout(10) << __func__ << " " << sub << " next " << sub->next
3291 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
3292 if (sub->next <= osdmap.get_epoch()) {
3293 if (sub->next >= 1)
3294 send_incremental(sub->next, sub->session, sub->incremental_onetime);
3295 else
3296 sub->session->con->send_message(build_latest_full());
3297 if (sub->onetime)
3298 mon->session_map.remove_sub(sub);
3299 else
3300 sub->next = osdmap.get_epoch() + 1;
3301 }
3302 }
3303
3304 void OSDMonitor::check_pg_creates_subs()
3305 {
3306 if (!mon->monmap->get_required_features().contains_all(
3307 ceph::features::mon::FEATURE_LUMINOUS)) {
3308 // PGMonitor takes care of this in pre-luminous era.
3309 return;
3310 }
3311 if (!osdmap.get_num_up_osds()) {
3312 return;
3313 }
3314 assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
3315 mon->with_session_map([this](const MonSessionMap& session_map) {
3316 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
3317 if (pg_creates_subs == session_map.subs.end()) {
3318 return;
3319 }
3320 for (auto sub : *pg_creates_subs->second) {
3321 check_pg_creates_sub(sub);
3322 }
3323 });
3324 }
3325
3326 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
3327 {
3328 dout(20) << __func__ << " .. " << sub->session->inst << dendl;
3329 assert(sub->type == "osd_pg_creates");
3330 // only send these if the OSD is up. we will check_subs() when they do
3331 // come up so they will get the creates then.
3332 if (sub->session->inst.name.is_osd() &&
3333 mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
3334 sub->next = send_pg_creates(sub->session->inst.name.num(),
3335 sub->session->con.get(),
3336 sub->next);
3337 }
3338 }
3339
3340 void OSDMonitor::do_application_enable(int64_t pool_id,
3341 const std::string &app_name)
3342 {
3343 assert(paxos->is_plugged() && is_writeable());
3344
3345 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
3346 << dendl;
3347
3348 assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS ||
3349 pending_inc.new_require_osd_release >= CEPH_RELEASE_LUMINOUS);
3350
3351 auto pp = osdmap.get_pg_pool(pool_id);
3352 assert(pp != nullptr);
3353
3354 pg_pool_t p = *pp;
3355 if (pending_inc.new_pools.count(pool_id)) {
3356 p = pending_inc.new_pools[pool_id];
3357 }
3358
3359 p.application_metadata.insert({app_name, {}});
3360 p.last_change = pending_inc.epoch;
3361 pending_inc.new_pools[pool_id] = p;
3362 }
3363
// Scan 'pools' for pools created or changed since the last scan and queue
// pg creations for them in 'creating_pgs'.  Pools being removed and pools
// whose crush rule cannot be resolved are skipped.  Returns the number of
// pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    const pg_pool_t& pool = p.second;
    // skip pools whose crush rule cannot be resolved in the current map
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // skip pools unchanged since the last scan
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
				  created, modified)) {
      queued++;
    }
  }
  return queued;
}
3400
// Rebuild creating_pgs_by_osd_epoch (osd -> epoch -> pgs) from creating_pgs
// using the current pg-to-osd mapping.  For each creating pg we look up its
// acting primary; if the pg was previously queued for the same primary we
// keep the previously advertised epoch, otherwise we advertise the
// mapping's current epoch so the new target gets a fresh create message.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    auto mapped = pg.second.first;  // epoch to advertise in the create message
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(pgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before; keep the previously advertised epoch
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
          }
          break;
        } else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3442
// Send pending pg-create messages to the given osd for epochs >= 'next'.
// Returns the epoch the subscription is now current through (last epoch
// sent + 1), or 'next' unchanged when there was nothing to send or the
// creating-pgs mapping is stale.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *m = nullptr;
  epoch_t last = 0;
  // batch every pending create at or after 'next' into a single message
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      if (!m)
	m = new MOSDPGCreate(creating_pgs_epoch);
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg);
      assert(create != creating_pgs.pgs.end());
      m->mkpg.emplace(pg, pg_create_t{create->second.first, pg, 0});
      m->ctimes.emplace(pg, create->second.second);
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  if (!m) {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }
  con->send_message(m);
  // sub is current through last + 1
  return last + 1;
}
3490
3491 // TICK
3492
3493
// Periodic maintenance, driven from the monitor's tick.  On the leader:
// check osd beacon timeouts, set purged_snapdirs once all snapsets are
// converted, mark failed osds down, auto-mark long-down osds out, expire
// blacklist entries, and (pre-luminous only) keep the FULL/NEARFULL map
// flags in sync with the pg service.  Proposes a new epoch when anything
// changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // only the leader drives map changes
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    // mark down osds whose beacons have gone silent
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }
  if (!osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) &&
      osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->mgrstatmon()->is_readable() &&
      mon->mgrstatmon()->definitely_converted_snapsets()) {
    dout(1) << __func__ << " all snapsets converted, setting purged_snapdirs"
	    << dendl;
    add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS);
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (g_conf->mon_osd_down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
		       << " subtree for osd." << o << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// normal osds use the (possibly scaled) grace period; destroyed
	// osds use their own configured interval
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
      mon->pgservice->is_readable()) {
    // for pre-luminous compat only!
    if (mon->pgservice->have_full_osds()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (mon->pgservice->have_nearfull_osds()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    if (pending_inc.new_flags != -1 &&
	(pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
	(pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
	(pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
	      << " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3660
3661 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
3662 std::map<int,utime_t> &last_osd_report)
3663 {
3664 utime_t timeo(g_conf->mon_osd_report_timeout, 0);
3665 if (now - mon->get_leader_since() < timeo) {
3666 // We haven't been the leader for long enough to consider OSD timeouts
3667 return false;
3668 }
3669
3670 int max_osd = osdmap.get_max_osd();
3671 bool new_down = false;
3672
3673 for (int i=0; i < max_osd; ++i) {
3674 dout(30) << __func__ << ": checking up on osd " << i << dendl;
3675 if (!osdmap.exists(i)) {
3676 last_osd_report.erase(i); // if any
3677 continue;
3678 }
3679 if (!osdmap.is_up(i))
3680 continue;
3681 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
3682 if (t == last_osd_report.end()) {
3683 // it wasn't in the map; start the timer.
3684 last_osd_report[i] = now;
3685 } else if (can_mark_down(i)) {
3686 utime_t diff = now - t->second;
3687 if (diff > timeo) {
3688 mon->clog->info() << "osd." << i << " marked down after no beacon for "
3689 << diff << " seconds";
3690 derr << "no beacon from osd." << i << " since " << t->second
3691 << ", " << diff << " seconds ago. marking down" << dendl;
3692 pending_inc.new_state[i] = CEPH_OSD_UP;
3693 new_down = true;
3694 }
3695 }
3696 }
3697 return new_down;
3698 }
3699
3700 void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
3701 list<pair<health_status_t,string> > *detail,
3702 CephContext *cct) const
3703 {
3704 int num_osds = osdmap.get_num_osds();
3705
3706 if (num_osds == 0) {
3707 summary.push_back(make_pair(HEALTH_ERR, "no osds"));
3708 } else {
3709 int num_in_osds = 0;
3710 int num_down_in_osds = 0;
3711 set<int> osds;
3712 set<int> down_in_osds;
3713 set<int> up_in_osds;
3714 set<int> subtree_up;
3715 unordered_map<int, set<int> > subtree_type_down;
3716 unordered_map<int, int> num_osds_subtree;
3717 int max_type = osdmap.crush->get_max_type_id();
3718
3719 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3720 if (!osdmap.exists(i)) {
3721 if (osdmap.crush->item_exists(i)) {
3722 osds.insert(i);
3723 }
3724 continue;
3725 }
3726 if (osdmap.is_out(i))
3727 continue;
3728 ++num_in_osds;
3729 if (down_in_osds.count(i) || up_in_osds.count(i))
3730 continue;
3731 if (!osdmap.is_up(i)) {
3732 down_in_osds.insert(i);
3733 int parent_id = 0;
3734 int current = i;
3735 for (int type = 0; type <= max_type; type++) {
3736 if (!osdmap.crush->get_type_name(type))
3737 continue;
3738 int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
3739 if (r == -ENOENT)
3740 break;
3741 // break early if this parent is already marked as up
3742 if (subtree_up.count(parent_id))
3743 break;
3744 type = osdmap.crush->get_bucket_type(parent_id);
3745 if (!osdmap.subtree_type_is_down(
3746 g_ceph_context, parent_id, type,
3747 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
3748 break;
3749 current = parent_id;
3750 }
3751 }
3752 }
3753
3754 // calculate the number of down osds in each down subtree and
3755 // store it in num_osds_subtree
3756 for (int type = 1; type <= max_type; type++) {
3757 if (!osdmap.crush->get_type_name(type))
3758 continue;
3759 for (auto j = subtree_type_down[type].begin();
3760 j != subtree_type_down[type].end();
3761 ++j) {
3762 if (type == 1) {
3763 list<int> children;
3764 int num = osdmap.crush->get_children(*j, &children);
3765 num_osds_subtree[*j] = num;
3766 } else {
3767 list<int> children;
3768 int num = 0;
3769 int num_children = osdmap.crush->get_children(*j, &children);
3770 if (num_children == 0)
3771 continue;
3772 for (auto l = children.begin(); l != children.end(); ++l) {
3773 if (num_osds_subtree[*l] > 0) {
3774 num = num + num_osds_subtree[*l];
3775 }
3776 }
3777 num_osds_subtree[*j] = num;
3778 }
3779 }
3780 }
3781 num_down_in_osds = down_in_osds.size();
3782 assert(num_down_in_osds <= num_in_osds);
3783 if (num_down_in_osds > 0) {
3784 // summary of down subtree types and osds
3785 for (int type = max_type; type > 0; type--) {
3786 if (!osdmap.crush->get_type_name(type))
3787 continue;
3788 if (subtree_type_down[type].size() > 0) {
3789 ostringstream ss;
3790 ss << subtree_type_down[type].size() << " "
3791 << osdmap.crush->get_type_name(type);
3792 if (subtree_type_down[type].size() > 1) {
3793 ss << "s";
3794 }
3795 int sum_down_osds = 0;
3796 for (auto j = subtree_type_down[type].begin();
3797 j != subtree_type_down[type].end();
3798 ++j) {
3799 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
3800 }
3801 ss << " (" << sum_down_osds << " osds) down";
3802 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3803 }
3804 }
3805 ostringstream ss;
3806 ss << down_in_osds.size() << " osds down";
3807 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3808
3809 if (detail) {
3810 // details of down subtree types
3811 for (int type = max_type; type > 0; type--) {
3812 if (!osdmap.crush->get_type_name(type))
3813 continue;
3814 for (auto j = subtree_type_down[type].rbegin();
3815 j != subtree_type_down[type].rend();
3816 ++j) {
3817 ostringstream ss;
3818 ss << osdmap.crush->get_type_name(type);
3819 ss << " ";
3820 ss << osdmap.crush->get_item_name(*j);
3821 // at the top level, do not print location
3822 if (type != max_type) {
3823 ss << " (";
3824 ss << osdmap.crush->get_full_location_ordered_string(*j);
3825 ss << ")";
3826 }
3827 int num = num_osds_subtree[*j];
3828 ss << " (" << num << " osds)";
3829 ss << " is down";
3830 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3831 }
3832 }
3833 // details of down osds
3834 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
3835 ostringstream ss;
3836 ss << "osd." << *it << " (";
3837 ss << osdmap.crush->get_full_location_ordered_string(*it);
3838 ss << ") is down";
3839 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3840 }
3841 }
3842 }
3843
3844 if (!osds.empty()) {
3845 ostringstream ss;
3846 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
3847 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3848 if (detail) {
3849 ss << " (osds: " << osds << ")";
3850 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3851 }
3852 }
3853
3854 // note: we leave it to ceph-mgr to generate details health warnings
3855 // with actual osd utilizations
3856
3857 // warn about flags
3858 uint64_t warn_flags =
3859 CEPH_OSDMAP_FULL |
3860 CEPH_OSDMAP_PAUSERD |
3861 CEPH_OSDMAP_PAUSEWR |
3862 CEPH_OSDMAP_PAUSEREC |
3863 CEPH_OSDMAP_NOUP |
3864 CEPH_OSDMAP_NODOWN |
3865 CEPH_OSDMAP_NOIN |
3866 CEPH_OSDMAP_NOOUT |
3867 CEPH_OSDMAP_NOBACKFILL |
3868 CEPH_OSDMAP_NORECOVER |
3869 CEPH_OSDMAP_NOSCRUB |
3870 CEPH_OSDMAP_NODEEP_SCRUB |
3871 CEPH_OSDMAP_NOTIERAGENT |
3872 CEPH_OSDMAP_NOREBALANCE;
3873 if (osdmap.test_flag(warn_flags)) {
3874 ostringstream ss;
3875 ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
3876 << " flag(s) set";
3877 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3878 if (detail)
3879 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3880 }
3881
3882 // old crush tunables?
3883 if (g_conf->mon_warn_on_legacy_crush_tunables) {
3884 string min = osdmap.crush->get_min_required_version();
3885 if (min < g_conf->mon_crush_min_required_version) {
3886 ostringstream ss;
3887 ss << "crush map has legacy tunables (require " << min
3888 << ", min is " << g_conf->mon_crush_min_required_version << ")";
3889 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3890 if (detail) {
3891 ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3892 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3893 }
3894 }
3895 }
3896 if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
3897 if (osdmap.crush->get_straw_calc_version() == 0) {
3898 ostringstream ss;
3899 ss << "crush map has straw_calc_version=0";
3900 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3901 if (detail) {
3902 ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3903 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3904 }
3905 }
3906 }
3907
3908 // hit_set-less cache_mode?
3909 if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
3910 int problem_cache_pools = 0;
3911 for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
3912 p != osdmap.pools.end();
3913 ++p) {
3914 const pg_pool_t& info = p->second;
3915 if (info.cache_mode_requires_hit_set() &&
3916 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
3917 ++problem_cache_pools;
3918 if (detail) {
3919 ostringstream ss;
3920 ss << "pool '" << osdmap.get_pool_name(p->first)
3921 << "' with cache_mode " << info.get_cache_mode_name()
3922 << " needs hit_set_type to be set but it is not";
3923 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3924 }
3925 }
3926 }
3927 if (problem_cache_pools) {
3928 ostringstream ss;
3929 ss << problem_cache_pools << " cache pools are missing hit_sets";
3930 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3931 }
3932 }
3933
3934 // Not using 'sortbitwise' and should be?
3935 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
3936 (osdmap.get_up_osd_features() &
3937 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
3938 ostringstream ss;
3939 ss << "no legacy OSD present but 'sortbitwise' flag is not set";
3940 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3941 }
3942
3943 // Warn if 'mon_osd_down_out_interval' is set to zero.
3944 // Having this option set to zero on the leader acts much like the
3945 // 'noout' flag. It's hard to figure out what's going wrong with clusters
3946 // without the 'noout' flag set but acting like that just the same, so
3947 // we report a HEALTH_WARN in case this option is set to zero.
3948 // This is an ugly hack to get the warning out, but until we find a way
3949 // to spread global options throughout the mon cluster and have all mons
3950 // using a base set of the same options, we need to work around this sort
3951 // of things.
3952 // There's also the obvious drawback that if this is set on a single
3953 // monitor on a 3-monitor cluster, this warning will only be shown every
3954 // third monitor connection.
3955 if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
3956 g_conf->mon_osd_down_out_interval == 0) {
3957 ostringstream ss;
3958 ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
3959 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3960 if (detail) {
3961 ss << "; this has the same effect as the 'noout' flag";
3962 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3963 }
3964 }
3965
3966 // warn about upgrade flags that can be set but are not.
3967 if (g_conf->mon_debug_no_require_luminous) {
3968 // ignore these checks
3969 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
3970 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
3971 string msg = "all OSDs are running luminous or later but"
3972 " require_osd_release < luminous";
3973 summary.push_back(make_pair(HEALTH_WARN, msg));
3974 if (detail) {
3975 detail->push_back(make_pair(HEALTH_WARN, msg));
3976 }
3977 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
3978 osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
3979 string msg = "all OSDs are running kraken or later but"
3980 " require_osd_release < kraken";
3981 summary.push_back(make_pair(HEALTH_WARN, msg));
3982 if (detail) {
3983 detail->push_back(make_pair(HEALTH_WARN, msg));
3984 }
3985 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
3986 osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
3987 string msg = "all OSDs are running jewel or later but"
3988 " require_osd_release < jewel";
3989 summary.push_back(make_pair(HEALTH_WARN, msg));
3990 if (detail) {
3991 detail->push_back(make_pair(HEALTH_WARN, msg));
3992 }
3993 }
3994
3995 for (auto it : osdmap.get_pools()) {
3996 const pg_pool_t &pool = it.second;
3997 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
3998 const string& pool_name = osdmap.get_pool_name(it.first);
3999 stringstream ss;
4000 ss << "pool '" << pool_name << "' is full";
4001 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
4002 if (detail)
4003 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
4004 }
4005 }
4006 }
4007 }
4008
4009 void OSDMonitor::dump_info(Formatter *f)
4010 {
4011 f->open_object_section("osdmap");
4012 osdmap.dump(f);
4013 f->close_section();
4014
4015 f->open_array_section("osd_metadata");
4016 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4017 if (osdmap.exists(i)) {
4018 f->open_object_section("osd");
4019 f->dump_unsigned("id", i);
4020 dump_osd_metadata(i, f, NULL);
4021 f->close_section();
4022 }
4023 }
4024 f->close_section();
4025
4026 f->dump_unsigned("osdmap_first_committed", get_first_committed());
4027 f->dump_unsigned("osdmap_last_committed", get_last_committed());
4028
4029 f->open_object_section("crushmap");
4030 osdmap.crush->dump(f);
4031 f->close_section();
4032 }
4033
namespace {
  // Keys accepted by "osd pool get"; kept in sync with the ALL_CHOICES
  // map in preprocess_command().
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Return the set difference first \ second, i.e. every choice that is
  // present in 'first' but absent from 'second'.  Neither input is
  // modified.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto& choice : first) {
      if (second.count(choice) == 0)
	result.insert(choice);
    }
    return result;
  }
}
4065
4066
4067 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4068 {
4069 op->mark_osdmon_event(__func__);
4070 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4071 int r = 0;
4072 bufferlist rdata;
4073 stringstream ss, ds;
4074
4075 map<string, cmd_vartype> cmdmap;
4076 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4077 string rs = ss.str();
4078 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4079 return true;
4080 }
4081
4082 MonSession *session = m->get_session();
4083 if (!session) {
4084 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4085 return true;
4086 }
4087
4088 string prefix;
4089 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
4090
4091 string format;
4092 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
4093 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4094
4095 if (prefix == "osd stat") {
4096 osdmap.print_summary(f.get(), ds, "");
4097 if (f)
4098 f->flush(rdata);
4099 else
4100 rdata.append(ds);
4101 }
4102 else if (prefix == "osd perf" ||
4103 prefix == "osd blocked-by") {
4104 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4105 osdmap, f.get(), &ss, &rdata);
4106 }
4107 else if (prefix == "osd dump" ||
4108 prefix == "osd tree" ||
4109 prefix == "osd ls" ||
4110 prefix == "osd getmap" ||
4111 prefix == "osd getcrushmap" ||
4112 prefix == "osd ls-tree") {
4113 string val;
4114
4115 epoch_t epoch = 0;
4116 int64_t epochnum;
4117 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
4118 epoch = epochnum;
4119
4120 bufferlist osdmap_bl;
4121 int err = get_version_full(epoch, osdmap_bl);
4122 if (err == -ENOENT) {
4123 r = -ENOENT;
4124 ss << "there is no map for epoch " << epoch;
4125 goto reply;
4126 }
4127 assert(err == 0);
4128 assert(osdmap_bl.length());
4129
4130 OSDMap *p;
4131 if (epoch == osdmap.get_epoch()) {
4132 p = &osdmap;
4133 } else {
4134 p = new OSDMap;
4135 p->decode(osdmap_bl);
4136 }
4137
4138 auto sg = make_scope_guard([&] {
4139 if (p != &osdmap) {
4140 delete p;
4141 }
4142 });
4143
4144 if (prefix == "osd dump") {
4145 stringstream ds;
4146 if (f) {
4147 f->open_object_section("osdmap");
4148 p->dump(f.get());
4149 f->close_section();
4150 f->flush(ds);
4151 } else {
4152 p->print(ds);
4153 }
4154 rdata.append(ds);
4155 if (!f)
4156 ds << " ";
4157 } else if (prefix == "osd ls") {
4158 if (f) {
4159 f->open_array_section("osds");
4160 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4161 if (osdmap.exists(i)) {
4162 f->dump_int("osd", i);
4163 }
4164 }
4165 f->close_section();
4166 f->flush(ds);
4167 } else {
4168 bool first = true;
4169 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4170 if (osdmap.exists(i)) {
4171 if (!first)
4172 ds << "\n";
4173 first = false;
4174 ds << i;
4175 }
4176 }
4177 }
4178 rdata.append(ds);
4179 } else if (prefix == "osd tree") {
4180 vector<string> states;
4181 cmd_getval(g_ceph_context, cmdmap, "states", states);
4182 unsigned filter = 0;
4183 for (auto& s : states) {
4184 if (s == "up") {
4185 filter |= OSDMap::DUMP_UP;
4186 } else if (s == "down") {
4187 filter |= OSDMap::DUMP_DOWN;
4188 } else if (s == "in") {
4189 filter |= OSDMap::DUMP_IN;
4190 } else if (s == "out") {
4191 filter |= OSDMap::DUMP_OUT;
4192 } else if (s == "destroyed") {
4193 filter |= OSDMap::DUMP_DESTROYED;
4194 } else {
4195 ss << "unrecognized state '" << s << "'";
4196 r = -EINVAL;
4197 goto reply;
4198 }
4199 }
4200 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
4201 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4202 ss << "cannot specify both 'in' and 'out'";
4203 r = -EINVAL;
4204 goto reply;
4205 }
4206 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4207 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4208 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4209 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4210 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4211 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4212 ss << "can specify only one of 'up', 'down' and 'destroyed'";
4213 r = -EINVAL;
4214 goto reply;
4215 }
4216 if (f) {
4217 f->open_object_section("tree");
4218 p->print_tree(f.get(), NULL, filter);
4219 f->close_section();
4220 f->flush(ds);
4221 } else {
4222 p->print_tree(NULL, &ds, filter);
4223 }
4224 rdata.append(ds);
4225 } else if (prefix == "osd getmap") {
4226 rdata.append(osdmap_bl);
4227 ss << "got osdmap epoch " << p->get_epoch();
4228 } else if (prefix == "osd getcrushmap") {
4229 p->crush->encode(rdata, mon->get_quorum_con_features());
4230 ss << p->get_crush_version();
4231 } else if (prefix == "osd ls-tree") {
4232 string bucket_name;
4233 cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
4234 set<int> osds;
4235 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4236 if (r == -ENOENT) {
4237 ss << "\"" << bucket_name << "\" does not exist";
4238 goto reply;
4239 } else if (r < 0) {
4240 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4241 goto reply;
4242 }
4243
4244 if (f) {
4245 f->open_array_section("osds");
4246 for (auto &i : osds) {
4247 if (osdmap.exists(i)) {
4248 f->dump_int("osd", i);
4249 }
4250 }
4251 f->close_section();
4252 f->flush(ds);
4253 } else {
4254 bool first = true;
4255 for (auto &i : osds) {
4256 if (osdmap.exists(i)) {
4257 if (!first)
4258 ds << "\n";
4259 first = false;
4260 ds << i;
4261 }
4262 }
4263 }
4264
4265 rdata.append(ds);
4266 }
4267 } else if (prefix == "osd df") {
4268 string method;
4269 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
4270 print_osd_utilization(osdmap, mon->pgservice, ds,
4271 f.get(), method == "tree");
4272 rdata.append(ds);
4273 } else if (prefix == "osd getmaxosd") {
4274 if (f) {
4275 f->open_object_section("getmaxosd");
4276 f->dump_unsigned("epoch", osdmap.get_epoch());
4277 f->dump_int("max_osd", osdmap.get_max_osd());
4278 f->close_section();
4279 f->flush(rdata);
4280 } else {
4281 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4282 rdata.append(ds);
4283 }
4284 } else if (prefix == "osd utilization") {
4285 string out;
4286 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4287 if (f)
4288 f->flush(rdata);
4289 else
4290 rdata.append(out);
4291 r = 0;
4292 goto reply;
4293 } else if (prefix == "osd find") {
4294 int64_t osd;
4295 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4296 ss << "unable to parse osd id value '"
4297 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4298 r = -EINVAL;
4299 goto reply;
4300 }
4301 if (!osdmap.exists(osd)) {
4302 ss << "osd." << osd << " does not exist";
4303 r = -ENOENT;
4304 goto reply;
4305 }
4306 string format;
4307 cmd_getval(g_ceph_context, cmdmap, "format", format);
4308 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4309 f->open_object_section("osd_location");
4310 f->dump_int("osd", osd);
4311 f->dump_stream("ip") << osdmap.get_addr(osd);
4312 f->open_object_section("crush_location");
4313 map<string,string> loc = osdmap.crush->get_full_location(osd);
4314 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4315 f->dump_string(p->first.c_str(), p->second);
4316 f->close_section();
4317 f->close_section();
4318 f->flush(rdata);
4319 } else if (prefix == "osd metadata") {
4320 int64_t osd = -1;
4321 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
4322 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4323 ss << "unable to parse osd id value '"
4324 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4325 r = -EINVAL;
4326 goto reply;
4327 }
4328 if (osd >= 0 && !osdmap.exists(osd)) {
4329 ss << "osd." << osd << " does not exist";
4330 r = -ENOENT;
4331 goto reply;
4332 }
4333 string format;
4334 cmd_getval(g_ceph_context, cmdmap, "format", format);
4335 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4336 if (osd >= 0) {
4337 f->open_object_section("osd_metadata");
4338 f->dump_unsigned("id", osd);
4339 r = dump_osd_metadata(osd, f.get(), &ss);
4340 if (r < 0)
4341 goto reply;
4342 f->close_section();
4343 } else {
4344 r = 0;
4345 f->open_array_section("osd_metadata");
4346 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4347 if (osdmap.exists(i)) {
4348 f->open_object_section("osd");
4349 f->dump_unsigned("id", i);
4350 r = dump_osd_metadata(i, f.get(), NULL);
4351 if (r == -EINVAL || r == -ENOENT) {
4352 // Drop error, continue to get other daemons' metadata
4353 dout(4) << "No metadata for osd." << i << dendl;
4354 r = 0;
4355 } else if (r < 0) {
4356 // Unexpected error
4357 goto reply;
4358 }
4359 f->close_section();
4360 }
4361 }
4362 f->close_section();
4363 }
4364 f->flush(rdata);
4365 } else if (prefix == "osd versions") {
4366 if (!f)
4367 f.reset(Formatter::create("json-pretty"));
4368 count_metadata("ceph_version", f.get());
4369 f->flush(rdata);
4370 r = 0;
4371 } else if (prefix == "osd count-metadata") {
4372 if (!f)
4373 f.reset(Formatter::create("json-pretty"));
4374 string field;
4375 cmd_getval(g_ceph_context, cmdmap, "property", field);
4376 count_metadata(field, f.get());
4377 f->flush(rdata);
4378 r = 0;
4379 } else if (prefix == "osd map") {
4380 string poolstr, objstr, namespacestr;
4381 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4382 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4383 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4384
4385 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4386 if (pool < 0) {
4387 ss << "pool " << poolstr << " does not exist";
4388 r = -ENOENT;
4389 goto reply;
4390 }
4391 object_locator_t oloc(pool, namespacestr);
4392 object_t oid(objstr);
4393 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4394 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4395 vector<int> up, acting;
4396 int up_p, acting_p;
4397 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4398
4399 string fullobjname;
4400 if (!namespacestr.empty())
4401 fullobjname = namespacestr + string("/") + oid.name;
4402 else
4403 fullobjname = oid.name;
4404 if (f) {
4405 f->open_object_section("osd_map");
4406 f->dump_unsigned("epoch", osdmap.get_epoch());
4407 f->dump_string("pool", poolstr);
4408 f->dump_int("pool_id", pool);
4409 f->dump_stream("objname") << fullobjname;
4410 f->dump_stream("raw_pgid") << pgid;
4411 f->dump_stream("pgid") << mpgid;
4412 f->open_array_section("up");
4413 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4414 f->dump_int("osd", *p);
4415 f->close_section();
4416 f->dump_int("up_primary", up_p);
4417 f->open_array_section("acting");
4418 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4419 f->dump_int("osd", *p);
4420 f->close_section();
4421 f->dump_int("acting_primary", acting_p);
4422 f->close_section(); // osd_map
4423 f->flush(rdata);
4424 } else {
4425 ds << "osdmap e" << osdmap.get_epoch()
4426 << " pool '" << poolstr << "' (" << pool << ")"
4427 << " object '" << fullobjname << "' ->"
4428 << " pg " << pgid << " (" << mpgid << ")"
4429 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4430 << pg_vector_string(acting) << ", p" << acting_p << ")";
4431 rdata.append(ds);
4432 }
4433
4434 } else if (prefix == "pg map") {
4435 pg_t pgid;
4436 string pgidstr;
4437 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4438 if (!pgid.parse(pgidstr.c_str())) {
4439 ss << "invalid pgid '" << pgidstr << "'";
4440 r = -EINVAL;
4441 goto reply;
4442 }
4443 vector<int> up, acting;
4444 if (!osdmap.have_pg_pool(pgid.pool())) {
4445 ss << "pg '" << pgidstr << "' does not exist";
4446 r = -ENOENT;
4447 goto reply;
4448 }
4449 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4450 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4451 if (f) {
4452 f->open_object_section("pg_map");
4453 f->dump_unsigned("epoch", osdmap.get_epoch());
4454 f->dump_stream("raw_pgid") << pgid;
4455 f->dump_stream("pgid") << mpgid;
4456 f->open_array_section("up");
4457 for (auto osd : up) {
4458 f->dump_int("up_osd", osd);
4459 }
4460 f->close_section();
4461 f->open_array_section("acting");
4462 for (auto osd : acting) {
4463 f->dump_int("acting_osd", osd);
4464 }
4465 f->close_section();
4466 f->close_section();
4467 f->flush(rdata);
4468 } else {
4469 ds << "osdmap e" << osdmap.get_epoch()
4470 << " pg " << pgid << " (" << mpgid << ")"
4471 << " -> up " << up << " acting " << acting;
4472 rdata.append(ds);
4473 }
4474 goto reply;
4475
4476 } else if (prefix == "osd scrub" ||
4477 prefix == "osd deep-scrub" ||
4478 prefix == "osd repair") {
4479 string whostr;
4480 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4481 vector<string> pvec;
4482 get_str_vec(prefix, pvec);
4483
4484 if (whostr == "*" || whostr == "all" || whostr == "any") {
4485 ss << "osds ";
4486 int c = 0;
4487 for (int i = 0; i < osdmap.get_max_osd(); i++)
4488 if (osdmap.is_up(i)) {
4489 ss << (c++ ? "," : "") << i;
4490 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4491 pvec.back() == "repair",
4492 pvec.back() == "deep-scrub"),
4493 osdmap.get_inst(i));
4494 }
4495 r = 0;
4496 ss << " instructed to " << pvec.back();
4497 } else {
4498 long osd = parse_osd_id(whostr.c_str(), &ss);
4499 if (osd < 0) {
4500 r = -EINVAL;
4501 } else if (osdmap.is_up(osd)) {
4502 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4503 pvec.back() == "repair",
4504 pvec.back() == "deep-scrub"),
4505 osdmap.get_inst(osd));
4506 ss << "osd." << osd << " instructed to " << pvec.back();
4507 } else {
4508 ss << "osd." << osd << " is not up";
4509 r = -EAGAIN;
4510 }
4511 }
4512 } else if (prefix == "osd lspools") {
4513 int64_t auid;
4514 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4515 if (f)
4516 f->open_array_section("pools");
4517 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4518 p != osdmap.pools.end();
4519 ++p) {
4520 if (!auid || p->second.auid == (uint64_t)auid) {
4521 if (f) {
4522 f->open_object_section("pool");
4523 f->dump_int("poolnum", p->first);
4524 f->dump_string("poolname", osdmap.pool_name[p->first]);
4525 f->close_section();
4526 } else {
4527 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4528 }
4529 }
4530 }
4531 if (f) {
4532 f->close_section();
4533 f->flush(ds);
4534 }
4535 rdata.append(ds);
4536 } else if (prefix == "osd blacklist ls") {
4537 if (f)
4538 f->open_array_section("blacklist");
4539
4540 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4541 p != osdmap.blacklist.end();
4542 ++p) {
4543 if (f) {
4544 f->open_object_section("entry");
4545 f->dump_stream("addr") << p->first;
4546 f->dump_stream("until") << p->second;
4547 f->close_section();
4548 } else {
4549 stringstream ss;
4550 string s;
4551 ss << p->first << " " << p->second;
4552 getline(ss, s);
4553 s += "\n";
4554 rdata.append(s);
4555 }
4556 }
4557 if (f) {
4558 f->close_section();
4559 f->flush(rdata);
4560 }
4561 ss << "listed " << osdmap.blacklist.size() << " entries";
4562
4563 } else if (prefix == "osd pool ls") {
4564 string detail;
4565 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4566 if (!f && detail == "detail") {
4567 ostringstream ss;
4568 osdmap.print_pools(ss);
4569 rdata.append(ss.str());
4570 } else {
4571 if (f)
4572 f->open_array_section("pools");
4573 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4574 it != osdmap.get_pools().end();
4575 ++it) {
4576 if (f) {
4577 if (detail == "detail") {
4578 f->open_object_section("pool");
4579 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4580 it->second.dump(f.get());
4581 f->close_section();
4582 } else {
4583 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4584 }
4585 } else {
4586 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4587 }
4588 }
4589 if (f) {
4590 f->close_section();
4591 f->flush(rdata);
4592 }
4593 }
4594
4595 } else if (prefix == "osd crush get-tunable") {
4596 string tunable;
4597 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4598 ostringstream rss;
4599 if (f)
4600 f->open_object_section("tunable");
4601 if (tunable == "straw_calc_version") {
4602 if (f)
4603 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4604 else
4605 rss << osdmap.crush->get_straw_calc_version() << "\n";
4606 } else {
4607 r = -EINVAL;
4608 goto reply;
4609 }
4610 if (f) {
4611 f->close_section();
4612 f->flush(rdata);
4613 } else {
4614 rdata.append(rss.str());
4615 }
4616 r = 0;
4617
4618 } else if (prefix == "osd pool get") {
4619 string poolstr;
4620 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4621 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4622 if (pool < 0) {
4623 ss << "unrecognized pool '" << poolstr << "'";
4624 r = -ENOENT;
4625 goto reply;
4626 }
4627
4628 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4629 string var;
4630 cmd_getval(g_ceph_context, cmdmap, "var", var);
4631
4632 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4633 const choices_map_t ALL_CHOICES = {
4634 {"size", SIZE},
4635 {"min_size", MIN_SIZE},
4636 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4637 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
4638 {"crush_rule", CRUSH_RULE},
4639 {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
4640 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4641 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4642 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4643 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4644 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4645 {"use_gmt_hitset", USE_GMT_HITSET},
4646 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4647 {"target_max_bytes", TARGET_MAX_BYTES},
4648 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4649 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4650 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4651 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4652 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4653 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4654 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4655 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4656 {"fast_read", FAST_READ},
4657 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4658 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4659 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4660 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4661 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4662 {"recovery_priority", RECOVERY_PRIORITY},
4663 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4664 {"scrub_priority", SCRUB_PRIORITY},
4665 {"compression_mode", COMPRESSION_MODE},
4666 {"compression_algorithm", COMPRESSION_ALGORITHM},
4667 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4668 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4669 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4670 {"csum_type", CSUM_TYPE},
4671 {"csum_max_block", CSUM_MAX_BLOCK},
4672 {"csum_min_block", CSUM_MIN_BLOCK},
4673 };
4674
4675 typedef std::set<osd_pool_get_choices> choices_set_t;
4676
4677 const choices_set_t ONLY_TIER_CHOICES = {
4678 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4679 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4680 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4681 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4682 MIN_READ_RECENCY_FOR_PROMOTE,
4683 MIN_WRITE_RECENCY_FOR_PROMOTE,
4684 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4685 };
4686 const choices_set_t ONLY_ERASURE_CHOICES = {
4687 ERASURE_CODE_PROFILE
4688 };
4689
4690 choices_set_t selected_choices;
4691 if (var == "all") {
4692 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4693 it != ALL_CHOICES.end(); ++it) {
4694 selected_choices.insert(it->second);
4695 }
4696
4697 if(!p->is_tier()) {
4698 selected_choices = subtract_second_from_first(selected_choices,
4699 ONLY_TIER_CHOICES);
4700 }
4701
4702 if(!p->is_erasure()) {
4703 selected_choices = subtract_second_from_first(selected_choices,
4704 ONLY_ERASURE_CHOICES);
4705 }
4706 } else /* var != "all" */ {
4707 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4708 osd_pool_get_choices selected = found->second;
4709
4710 if (!p->is_tier() &&
4711 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4712 ss << "pool '" << poolstr
4713 << "' is not a tier pool: variable not applicable";
4714 r = -EACCES;
4715 goto reply;
4716 }
4717
4718 if (!p->is_erasure() &&
4719 ONLY_ERASURE_CHOICES.find(selected)
4720 != ONLY_ERASURE_CHOICES.end()) {
4721 ss << "pool '" << poolstr
4722 << "' is not a erasure pool: variable not applicable";
4723 r = -EACCES;
4724 goto reply;
4725 }
4726
4727 selected_choices.insert(selected);
4728 }
4729
4730 if (f) {
4731 for(choices_set_t::const_iterator it = selected_choices.begin();
4732 it != selected_choices.end(); ++it) {
4733 choices_map_t::const_iterator i;
4734 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4735 if (i->second == *it) {
4736 break;
4737 }
4738 }
4739 assert(i != ALL_CHOICES.end());
4740 bool pool_opt = pool_opts_t::is_opt_name(i->first);
4741 if (!pool_opt) {
4742 f->open_object_section("pool");
4743 f->dump_string("pool", poolstr);
4744 f->dump_int("pool_id", pool);
4745 }
4746 switch(*it) {
4747 case PG_NUM:
4748 f->dump_int("pg_num", p->get_pg_num());
4749 break;
4750 case PGP_NUM:
4751 f->dump_int("pgp_num", p->get_pgp_num());
4752 break;
4753 case AUID:
4754 f->dump_int("auid", p->get_auid());
4755 break;
4756 case SIZE:
4757 f->dump_int("size", p->get_size());
4758 break;
4759 case MIN_SIZE:
4760 f->dump_int("min_size", p->get_min_size());
4761 break;
4762 case CRASH_REPLAY_INTERVAL:
4763 f->dump_int("crash_replay_interval",
4764 p->get_crash_replay_interval());
4765 break;
4766 case CRUSH_RULE:
4767 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
4768 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
4769 p->get_crush_rule()));
4770 } else {
4771 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
4772 }
4773 break;
4774 case HASHPSPOOL:
4775 case NODELETE:
4776 case NOPGCHANGE:
4777 case NOSIZECHANGE:
4778 case WRITE_FADVISE_DONTNEED:
4779 case NOSCRUB:
4780 case NODEEP_SCRUB:
4781 f->dump_string(i->first.c_str(),
4782 p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4783 "true" : "false");
4784 break;
4785 case HIT_SET_PERIOD:
4786 f->dump_int("hit_set_period", p->hit_set_period);
4787 break;
4788 case HIT_SET_COUNT:
4789 f->dump_int("hit_set_count", p->hit_set_count);
4790 break;
4791 case HIT_SET_TYPE:
4792 f->dump_string("hit_set_type",
4793 HitSet::get_type_name(p->hit_set_params.get_type()));
4794 break;
4795 case HIT_SET_FPP:
4796 {
4797 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4798 BloomHitSet::Params *bloomp =
4799 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4800 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4801 } else if(var != "all") {
4802 f->close_section();
4803 ss << "hit set is not of type Bloom; " <<
4804 "invalid to get a false positive rate!";
4805 r = -EINVAL;
4806 goto reply;
4807 }
4808 }
4809 break;
4810 case USE_GMT_HITSET:
4811 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4812 break;
4813 case TARGET_MAX_OBJECTS:
4814 f->dump_unsigned("target_max_objects", p->target_max_objects);
4815 break;
4816 case TARGET_MAX_BYTES:
4817 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4818 break;
4819 case CACHE_TARGET_DIRTY_RATIO:
4820 f->dump_unsigned("cache_target_dirty_ratio_micro",
4821 p->cache_target_dirty_ratio_micro);
4822 f->dump_float("cache_target_dirty_ratio",
4823 ((float)p->cache_target_dirty_ratio_micro/1000000));
4824 break;
4825 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4826 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
4827 p->cache_target_dirty_high_ratio_micro);
4828 f->dump_float("cache_target_dirty_high_ratio",
4829 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
4830 break;
4831 case CACHE_TARGET_FULL_RATIO:
4832 f->dump_unsigned("cache_target_full_ratio_micro",
4833 p->cache_target_full_ratio_micro);
4834 f->dump_float("cache_target_full_ratio",
4835 ((float)p->cache_target_full_ratio_micro/1000000));
4836 break;
4837 case CACHE_MIN_FLUSH_AGE:
4838 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
4839 break;
4840 case CACHE_MIN_EVICT_AGE:
4841 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
4842 break;
4843 case ERASURE_CODE_PROFILE:
4844 f->dump_string("erasure_code_profile", p->erasure_code_profile);
4845 break;
4846 case MIN_READ_RECENCY_FOR_PROMOTE:
4847 f->dump_int("min_read_recency_for_promote",
4848 p->min_read_recency_for_promote);
4849 break;
4850 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4851 f->dump_int("min_write_recency_for_promote",
4852 p->min_write_recency_for_promote);
4853 break;
4854 case FAST_READ:
4855 f->dump_int("fast_read", p->fast_read);
4856 break;
4857 case HIT_SET_GRADE_DECAY_RATE:
4858 f->dump_int("hit_set_grade_decay_rate",
4859 p->hit_set_grade_decay_rate);
4860 break;
4861 case HIT_SET_SEARCH_LAST_N:
4862 f->dump_int("hit_set_search_last_n",
4863 p->hit_set_search_last_n);
4864 break;
4865 case SCRUB_MIN_INTERVAL:
4866 case SCRUB_MAX_INTERVAL:
4867 case DEEP_SCRUB_INTERVAL:
4868 case RECOVERY_PRIORITY:
4869 case RECOVERY_OP_PRIORITY:
4870 case SCRUB_PRIORITY:
4871 case COMPRESSION_MODE:
4872 case COMPRESSION_ALGORITHM:
4873 case COMPRESSION_REQUIRED_RATIO:
4874 case COMPRESSION_MAX_BLOB_SIZE:
4875 case COMPRESSION_MIN_BLOB_SIZE:
4876 case CSUM_TYPE:
4877 case CSUM_MAX_BLOCK:
4878 case CSUM_MIN_BLOCK:
4879 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4880 if (p->opts.is_set(key)) {
4881 f->open_object_section("pool");
4882 f->dump_string("pool", poolstr);
4883 f->dump_int("pool_id", pool);
4884 if(*it == CSUM_TYPE) {
4885 int val;
4886 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
4887 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
4888 } else {
4889 p->opts.dump(i->first, f.get());
4890 }
4891 f->close_section();
4892 f->flush(rdata);
4893 }
4894 break;
4895 }
4896 if (!pool_opt) {
4897 f->close_section();
4898 f->flush(rdata);
4899 }
4900 }
4901
4902 } else /* !f */ {
4903 for(choices_set_t::const_iterator it = selected_choices.begin();
4904 it != selected_choices.end(); ++it) {
4905 choices_map_t::const_iterator i;
4906 switch(*it) {
4907 case PG_NUM:
4908 ss << "pg_num: " << p->get_pg_num() << "\n";
4909 break;
4910 case PGP_NUM:
4911 ss << "pgp_num: " << p->get_pgp_num() << "\n";
4912 break;
4913 case AUID:
4914 ss << "auid: " << p->get_auid() << "\n";
4915 break;
4916 case SIZE:
4917 ss << "size: " << p->get_size() << "\n";
4918 break;
4919 case MIN_SIZE:
4920 ss << "min_size: " << p->get_min_size() << "\n";
4921 break;
4922 case CRASH_REPLAY_INTERVAL:
4923 ss << "crash_replay_interval: " <<
4924 p->get_crash_replay_interval() << "\n";
4925 break;
4926 case CRUSH_RULE:
4927 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
4928 ss << "crush_rule: " << osdmap.crush->get_rule_name(
4929 p->get_crush_rule()) << "\n";
4930 } else {
4931 ss << "crush_rule: " << p->get_crush_rule() << "\n";
4932 }
4933 break;
4934 case HIT_SET_PERIOD:
4935 ss << "hit_set_period: " << p->hit_set_period << "\n";
4936 break;
4937 case HIT_SET_COUNT:
4938 ss << "hit_set_count: " << p->hit_set_count << "\n";
4939 break;
4940 case HIT_SET_TYPE:
4941 ss << "hit_set_type: " <<
4942 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
4943 break;
4944 case HIT_SET_FPP:
4945 {
4946 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4947 BloomHitSet::Params *bloomp =
4948 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4949 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
4950 } else if(var != "all") {
4951 ss << "hit set is not of type Bloom; " <<
4952 "invalid to get a false positive rate!";
4953 r = -EINVAL;
4954 goto reply;
4955 }
4956 }
4957 break;
4958 case USE_GMT_HITSET:
4959 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
4960 break;
4961 case TARGET_MAX_OBJECTS:
4962 ss << "target_max_objects: " << p->target_max_objects << "\n";
4963 break;
4964 case TARGET_MAX_BYTES:
4965 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
4966 break;
4967 case CACHE_TARGET_DIRTY_RATIO:
4968 ss << "cache_target_dirty_ratio: "
4969 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
4970 break;
4971 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4972 ss << "cache_target_dirty_high_ratio: "
4973 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
4974 break;
4975 case CACHE_TARGET_FULL_RATIO:
4976 ss << "cache_target_full_ratio: "
4977 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
4978 break;
4979 case CACHE_MIN_FLUSH_AGE:
4980 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
4981 break;
4982 case CACHE_MIN_EVICT_AGE:
4983 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
4984 break;
4985 case ERASURE_CODE_PROFILE:
4986 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
4987 break;
4988 case MIN_READ_RECENCY_FOR_PROMOTE:
4989 ss << "min_read_recency_for_promote: " <<
4990 p->min_read_recency_for_promote << "\n";
4991 break;
4992 case HIT_SET_GRADE_DECAY_RATE:
4993 ss << "hit_set_grade_decay_rate: " <<
4994 p->hit_set_grade_decay_rate << "\n";
4995 break;
4996 case HIT_SET_SEARCH_LAST_N:
4997 ss << "hit_set_search_last_n: " <<
4998 p->hit_set_search_last_n << "\n";
4999 break;
5000 case HASHPSPOOL:
5001 case NODELETE:
5002 case NOPGCHANGE:
5003 case NOSIZECHANGE:
5004 case WRITE_FADVISE_DONTNEED:
5005 case NOSCRUB:
5006 case NODEEP_SCRUB:
5007 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5008 if (i->second == *it)
5009 break;
5010 }
5011 assert(i != ALL_CHOICES.end());
5012 ss << i->first << ": " <<
5013 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
5014 "true" : "false") << "\n";
5015 break;
5016 case MIN_WRITE_RECENCY_FOR_PROMOTE:
5017 ss << "min_write_recency_for_promote: " <<
5018 p->min_write_recency_for_promote << "\n";
5019 break;
5020 case FAST_READ:
5021 ss << "fast_read: " << p->fast_read << "\n";
5022 break;
5023 case SCRUB_MIN_INTERVAL:
5024 case SCRUB_MAX_INTERVAL:
5025 case DEEP_SCRUB_INTERVAL:
5026 case RECOVERY_PRIORITY:
5027 case RECOVERY_OP_PRIORITY:
5028 case SCRUB_PRIORITY:
5029 case COMPRESSION_MODE:
5030 case COMPRESSION_ALGORITHM:
5031 case COMPRESSION_REQUIRED_RATIO:
5032 case COMPRESSION_MAX_BLOB_SIZE:
5033 case COMPRESSION_MIN_BLOB_SIZE:
5034 case CSUM_TYPE:
5035 case CSUM_MAX_BLOCK:
5036 case CSUM_MIN_BLOCK:
5037 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5038 if (i->second == *it)
5039 break;
5040 }
5041 assert(i != ALL_CHOICES.end());
5042 {
5043 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
5044 if (p->opts.is_set(key)) {
5045 if(key == pool_opts_t::CSUM_TYPE) {
5046 int val;
5047 p->opts.get(key, &val);
5048 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
5049 } else {
5050 ss << i->first << ": " << p->opts.get(key) << "\n";
5051 }
5052 }
5053 }
5054 break;
5055 }
5056 rdata.append(ss.str());
5057 ss.str("");
5058 }
5059 }
5060 r = 0;
5061 } else if (prefix == "osd pool stats") {
5062 r = mon->pgservice->process_pg_command(prefix, cmdmap,
5063 osdmap, f.get(), &ss, &rdata);
5064 } else if (prefix == "osd pool get-quota") {
5065 string pool_name;
5066 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
5067
5068 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
5069 if (poolid < 0) {
5070 assert(poolid == -ENOENT);
5071 ss << "unrecognized pool '" << pool_name << "'";
5072 r = -ENOENT;
5073 goto reply;
5074 }
5075 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
5076
5077 if (f) {
5078 f->open_object_section("pool_quotas");
5079 f->dump_string("pool_name", pool_name);
5080 f->dump_unsigned("pool_id", poolid);
5081 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
5082 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
5083 f->close_section();
5084 f->flush(rdata);
5085 } else {
5086 stringstream rs;
5087 rs << "quotas for pool '" << pool_name << "':\n"
5088 << " max objects: ";
5089 if (p->quota_max_objects == 0)
5090 rs << "N/A";
5091 else
5092 rs << si_t(p->quota_max_objects) << " objects";
5093 rs << "\n"
5094 << " max bytes : ";
5095 if (p->quota_max_bytes == 0)
5096 rs << "N/A";
5097 else
5098 rs << si_t(p->quota_max_bytes) << "B";
5099 rdata.append(rs.str());
5100 }
5101 rdata.append("\n");
5102 r = 0;
5103 } else if (prefix == "osd crush rule list" ||
5104 prefix == "osd crush rule ls") {
5105 if (f) {
5106 f->open_array_section("rules");
5107 osdmap.crush->list_rules(f.get());
5108 f->close_section();
5109 f->flush(rdata);
5110 } else {
5111 ostringstream ss;
5112 osdmap.crush->list_rules(&ss);
5113 rdata.append(ss.str());
5114 }
5115 } else if (prefix == "osd crush rule ls-by-class") {
5116 string class_name;
5117 cmd_getval(g_ceph_context, cmdmap, "class", class_name);
5118 if (class_name.empty()) {
5119 ss << "no class specified";
5120 r = -EINVAL;
5121 goto reply;
5122 }
5123 set<int> rules;
5124 r = osdmap.crush->get_rules_by_class(class_name, &rules);
5125 if (r < 0) {
5126 ss << "failed to get rules by class '" << class_name << "'";
5127 goto reply;
5128 }
5129 if (f) {
5130 f->open_array_section("rules");
5131 for (auto &rule: rules) {
5132 f->dump_string("name", osdmap.crush->get_rule_name(rule));
5133 }
5134 f->close_section();
5135 f->flush(rdata);
5136 } else {
5137 ostringstream rs;
5138 for (auto &rule: rules) {
5139 rs << osdmap.crush->get_rule_name(rule) << "\n";
5140 }
5141 rdata.append(rs.str());
5142 }
5143 } else if (prefix == "osd crush rule dump") {
5144 string name;
5145 cmd_getval(g_ceph_context, cmdmap, "name", name);
5146 string format;
5147 cmd_getval(g_ceph_context, cmdmap, "format", format);
5148 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5149 if (name == "") {
5150 f->open_array_section("rules");
5151 osdmap.crush->dump_rules(f.get());
5152 f->close_section();
5153 } else {
5154 int ruleno = osdmap.crush->get_rule_id(name);
5155 if (ruleno < 0) {
5156 ss << "unknown crush rule '" << name << "'";
5157 r = ruleno;
5158 goto reply;
5159 }
5160 osdmap.crush->dump_rule(ruleno, f.get());
5161 }
5162 ostringstream rs;
5163 f->flush(rs);
5164 rs << "\n";
5165 rdata.append(rs.str());
5166 } else if (prefix == "osd crush dump") {
5167 string format;
5168 cmd_getval(g_ceph_context, cmdmap, "format", format);
5169 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5170 f->open_object_section("crush_map");
5171 osdmap.crush->dump(f.get());
5172 f->close_section();
5173 ostringstream rs;
5174 f->flush(rs);
5175 rs << "\n";
5176 rdata.append(rs.str());
5177 } else if (prefix == "osd crush show-tunables") {
5178 string format;
5179 cmd_getval(g_ceph_context, cmdmap, "format", format);
5180 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5181 f->open_object_section("crush_map_tunables");
5182 osdmap.crush->dump_tunables(f.get());
5183 f->close_section();
5184 ostringstream rs;
5185 f->flush(rs);
5186 rs << "\n";
5187 rdata.append(rs.str());
5188 } else if (prefix == "osd crush tree") {
5189 string shadow;
5190 cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
5191 bool show_shadow = shadow == "--show-shadow";
5192 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5193 if (f) {
5194 osdmap.crush->dump_tree(nullptr,
5195 f.get(),
5196 osdmap.get_pool_names(),
5197 show_shadow);
5198 f->flush(rdata);
5199 } else {
5200 ostringstream ss;
5201 osdmap.crush->dump_tree(&ss,
5202 nullptr,
5203 osdmap.get_pool_names(),
5204 show_shadow);
5205 rdata.append(ss.str());
5206 }
5207 } else if (prefix == "osd crush ls") {
5208 string name;
5209 if (!cmd_getval(g_ceph_context, cmdmap, "node", name)) {
5210 ss << "no node specified";
5211 r = -EINVAL;
5212 goto reply;
5213 }
5214 if (!osdmap.crush->name_exists(name)) {
5215 ss << "node '" << name << "' does not exist";
5216 r = -ENOENT;
5217 goto reply;
5218 }
5219 int id = osdmap.crush->get_item_id(name);
5220 list<int> result;
5221 if (id >= 0) {
5222 result.push_back(id);
5223 } else {
5224 int num = osdmap.crush->get_bucket_size(id);
5225 for (int i = 0; i < num; ++i) {
5226 result.push_back(osdmap.crush->get_bucket_item(id, i));
5227 }
5228 }
5229 if (f) {
5230 f->open_array_section("items");
5231 for (auto i : result) {
5232 f->dump_string("item", osdmap.crush->get_item_name(i));
5233 }
5234 f->close_section();
5235 f->flush(rdata);
5236 } else {
5237 ostringstream ss;
5238 for (auto i : result) {
5239 ss << osdmap.crush->get_item_name(i) << "\n";
5240 }
5241 rdata.append(ss.str());
5242 }
5243 r = 0;
5244 } else if (prefix == "osd crush class ls") {
5245 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5246 f->open_array_section("crush_classes");
5247 for (auto i : osdmap.crush->class_name)
5248 f->dump_string("class", i.second);
5249 f->close_section();
5250 f->flush(rdata);
5251 } else if (prefix == "osd crush class ls-osd") {
5252 string name;
5253 cmd_getval(g_ceph_context, cmdmap, "class", name);
5254 set<int> osds;
5255 osdmap.crush->get_devices_by_class(name, &osds);
5256 if (f) {
5257 f->open_array_section("osds");
5258 for (auto &osd: osds)
5259 f->dump_int("osd", osd);
5260 f->close_section();
5261 f->flush(rdata);
5262 } else {
5263 bool first = true;
5264 for (auto &osd : osds) {
5265 if (!first)
5266 ds << "\n";
5267 first = false;
5268 ds << osd;
5269 }
5270 rdata.append(ds);
5271 }
5272 } else if (prefix == "osd erasure-code-profile ls") {
5273 const auto &profiles = osdmap.get_erasure_code_profiles();
5274 if (f)
5275 f->open_array_section("erasure-code-profiles");
5276 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5277 if (f)
5278 f->dump_string("profile", i->first.c_str());
5279 else
5280 rdata.append(i->first + "\n");
5281 }
5282 if (f) {
5283 f->close_section();
5284 ostringstream rs;
5285 f->flush(rs);
5286 rs << "\n";
5287 rdata.append(rs.str());
5288 }
5289 } else if (prefix == "osd crush weight-set ls") {
5290 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5291 if (f) {
5292 f->open_array_section("weight_sets");
5293 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5294 f->dump_string("pool", "(compat)");
5295 }
5296 for (auto& i : osdmap.crush->choose_args) {
5297 if (i.first >= 0) {
5298 f->dump_string("pool", osdmap.get_pool_name(i.first));
5299 }
5300 }
5301 f->close_section();
5302 f->flush(rdata);
5303 } else {
5304 ostringstream rs;
5305 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5306 rs << "(compat)\n";
5307 }
5308 for (auto& i : osdmap.crush->choose_args) {
5309 if (i.first >= 0) {
5310 rs << osdmap.get_pool_name(i.first) << "\n";
5311 }
5312 }
5313 rdata.append(rs.str());
5314 }
5315 } else if (prefix == "osd crush weight-set dump") {
5316 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5317 "json-pretty"));
5318 osdmap.crush->dump_choose_args(f.get());
5319 f->flush(rdata);
5320 } else if (prefix == "osd erasure-code-profile get") {
5321 string name;
5322 cmd_getval(g_ceph_context, cmdmap, "name", name);
5323 if (!osdmap.has_erasure_code_profile(name)) {
5324 ss << "unknown erasure code profile '" << name << "'";
5325 r = -ENOENT;
5326 goto reply;
5327 }
5328 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5329 if (f)
5330 f->open_object_section("profile");
5331 for (map<string,string>::const_iterator i = profile.begin();
5332 i != profile.end();
5333 ++i) {
5334 if (f)
5335 f->dump_string(i->first.c_str(), i->second.c_str());
5336 else
5337 rdata.append(i->first + "=" + i->second + "\n");
5338 }
5339 if (f) {
5340 f->close_section();
5341 ostringstream rs;
5342 f->flush(rs);
5343 rs << "\n";
5344 rdata.append(rs.str());
5345 }
5346 } else if (prefix == "osd pool application get") {
5347 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5348 "json-pretty"));
5349 string pool_name;
5350 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
5351 string app;
5352 cmd_getval(g_ceph_context, cmdmap, "app", app);
5353 string key;
5354 cmd_getval(g_ceph_context, cmdmap, "key", key);
5355
5356 if (pool_name.empty()) {
5357 // all
5358 f->open_object_section("pools");
5359 for (const auto &pool : osdmap.pools) {
5360 std::string name("<unknown>");
5361 const auto &pni = osdmap.pool_name.find(pool.first);
5362 if (pni != osdmap.pool_name.end())
5363 name = pni->second;
5364 f->open_object_section(name.c_str());
5365 for (auto &app_pair : pool.second.application_metadata) {
5366 f->open_object_section(app_pair.first.c_str());
5367 for (auto &kv_pair : app_pair.second) {
5368 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5369 }
5370 f->close_section();
5371 }
5372 f->close_section(); // name
5373 }
5374 f->close_section(); // pools
5375 f->flush(rdata);
5376 } else {
5377 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
5378 if (pool < 0) {
5379 ss << "unrecognized pool '" << pool_name << "'";
5380 r = -ENOENT;
5381 goto reply;
5382 }
5383 auto p = osdmap.get_pg_pool(pool);
5384 // filter by pool
5385 if (app.empty()) {
5386 f->open_object_section(pool_name.c_str());
5387 for (auto &app_pair : p->application_metadata) {
5388 f->open_object_section(app_pair.first.c_str());
5389 for (auto &kv_pair : app_pair.second) {
5390 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5391 }
5392 f->close_section(); // application
5393 }
5394 f->close_section(); // pool_name
5395 f->flush(rdata);
5396 goto reply;
5397 }
5398
5399 auto app_it = p->application_metadata.find(app);
5400 if (app_it == p->application_metadata.end()) {
5401 ss << "pool '" << pool_name << "' has no application '" << app << "'";
5402 r = -ENOENT;
5403 goto reply;
5404 }
5405 // filter by pool + app
5406 if (key.empty()) {
5407 f->open_object_section(app_it->first.c_str());
5408 for (auto &kv_pair : app_it->second) {
5409 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5410 }
5411 f->close_section(); // application
5412 f->flush(rdata);
5413 goto reply;
5414 }
5415 // filter by pool + app + key
5416 auto key_it = app_it->second.find(key);
5417 if (key_it == app_it->second.end()) {
5418 ss << "application '" << app << "' on pool '" << pool_name
5419 << "' does not have key '" << key << "'";
5420 r = -ENOENT;
5421 goto reply;
5422 }
5423 ss << key_it->second << "\n";
5424 rdata.append(ss.str());
5425 ss.str("");
5426 }
5427 } else {
5428 // try prepare update
5429 return false;
5430 }
5431
5432 reply:
5433 string rs;
5434 getline(ss, rs);
5435 mon->reply_command(op, r, rs, rdata, get_last_committed());
5436 return true;
5437 }
5438
5439 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
5440 {
5441 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5442 osdmap.get_pg_pool(pool_id));
5443 assert(pool);
5444 pool->set_flag(flags);
5445 }
5446
5447 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
5448 {
5449 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5450 osdmap.get_pg_pool(pool_id));
5451 assert(pool);
5452 pool->unset_flag(flags);
5453 }
5454
5455 bool OSDMonitor::update_pools_status()
5456 {
5457 if (!mon->pgservice->is_readable())
5458 return false;
5459
5460 bool ret = false;
5461
5462 auto& pools = osdmap.get_pools();
5463 for (auto it = pools.begin(); it != pools.end(); ++it) {
5464 const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
5465 if (!pstat)
5466 continue;
5467 const object_stat_sum_t& sum = pstat->stats.sum;
5468 const pg_pool_t &pool = it->second;
5469 const string& pool_name = osdmap.get_pool_name(it->first);
5470
5471 bool pool_is_full =
5472 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
5473 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
5474
5475 if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
5476 if (pool_is_full)
5477 continue;
5478
5479 mon->clog->info() << "pool '" << pool_name
5480 << "' no longer out of quota; removing NO_QUOTA flag";
5481 // below we cancel FLAG_FULL too, we'll set it again in
5482 // OSDMonitor::encode_pending if it still fails the osd-full checking.
5483 clear_pool_flags(it->first,
5484 pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
5485 ret = true;
5486 } else {
5487 if (!pool_is_full)
5488 continue;
5489
5490 if (pool.quota_max_bytes > 0 &&
5491 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
5492 mon->clog->warn() << "pool '" << pool_name << "' is full"
5493 << " (reached quota's max_bytes: "
5494 << si_t(pool.quota_max_bytes) << ")";
5495 }
5496 if (pool.quota_max_objects > 0 &&
5497 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
5498 mon->clog->warn() << "pool '" << pool_name << "' is full"
5499 << " (reached quota's max_objects: "
5500 << pool.quota_max_objects << ")";
5501 }
5502 // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
5503 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
5504 // since FLAG_FULL should always take precedence
5505 set_pool_flags(it->first,
5506 pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
5507 clear_pool_flags(it->first,
5508 pg_pool_t::FLAG_NEARFULL |
5509 pg_pool_t::FLAG_BACKFILLFULL);
5510 ret = true;
5511 }
5512 }
5513 return ret;
5514 }
5515
5516 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5517 {
5518 op->mark_osdmon_event(__func__);
5519 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5520 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5521 MonSession *session = m->get_session();
5522 if (!session)
5523 return -EPERM;
5524 string erasure_code_profile;
5525 stringstream ss;
5526 string rule_name;
5527 if (m->auid)
5528 return prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
5529 0, 0,
5530 erasure_code_profile,
5531 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5532 else
5533 return prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
5534 0, 0,
5535 erasure_code_profile,
5536 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5537 }
5538
5539 int OSDMonitor::crush_rename_bucket(const string& srcname,
5540 const string& dstname,
5541 ostream *ss)
5542 {
5543 int ret;
5544 //
5545 // Avoid creating a pending crush if it does not already exists and
5546 // the rename would fail.
5547 //
5548 if (!_have_pending_crush()) {
5549 ret = _get_stable_crush().can_rename_bucket(srcname,
5550 dstname,
5551 ss);
5552 if (ret)
5553 return ret;
5554 }
5555
5556 CrushWrapper newcrush;
5557 _get_pending_crush(newcrush);
5558
5559 ret = newcrush.rename_bucket(srcname,
5560 dstname,
5561 ss);
5562 if (ret)
5563 return ret;
5564
5565 pending_inc.crush.clear();
5566 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5567 *ss << "renamed bucket " << srcname << " into " << dstname;
5568 return 0;
5569 }
5570
5571 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5572 {
5573 string replacement = "";
5574
5575 if (plugin == "jerasure_generic" ||
5576 plugin == "jerasure_sse3" ||
5577 plugin == "jerasure_sse4" ||
5578 plugin == "jerasure_neon") {
5579 replacement = "jerasure";
5580 } else if (plugin == "shec_generic" ||
5581 plugin == "shec_sse3" ||
5582 plugin == "shec_sse4" ||
5583 plugin == "shec_neon") {
5584 replacement = "shec";
5585 }
5586
5587 if (replacement != "") {
5588 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5589 << plugin << " that has been deprecated. Please use "
5590 << replacement << " instead." << dendl;
5591 }
5592 }
5593
5594 int OSDMonitor::normalize_profile(const string& profilename,
5595 ErasureCodeProfile &profile,
5596 bool force,
5597 ostream *ss)
5598 {
5599 ErasureCodeInterfaceRef erasure_code;
5600 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5601 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5602 check_legacy_ec_plugin(plugin->second, profilename);
5603 int err = instance.factory(plugin->second,
5604 g_conf->get_val<std::string>("erasure_code_dir"),
5605 profile, &erasure_code, ss);
5606 if (err) {
5607 return err;
5608 }
5609
5610 err = erasure_code->init(profile, ss);
5611 if (err) {
5612 return err;
5613 }
5614
5615 auto it = profile.find("stripe_unit");
5616 if (it != profile.end()) {
5617 string err_str;
5618 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5619 if (!err_str.empty()) {
5620 *ss << "could not parse stripe_unit '" << it->second
5621 << "': " << err_str << std::endl;
5622 return -EINVAL;
5623 }
5624 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5625 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5626 if (chunk_size != stripe_unit) {
5627 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5628 << "alignment. Would be padded to " << chunk_size
5629 << std::endl;
5630 return -EINVAL;
5631 }
5632 if ((stripe_unit % 4096) != 0 && !force) {
5633 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5634 << "use --force to override this check" << std::endl;
5635 return -EINVAL;
5636 }
5637 }
5638 return 0;
5639 }
5640
5641 int OSDMonitor::crush_rule_create_erasure(const string &name,
5642 const string &profile,
5643 int *rule,
5644 ostream *ss)
5645 {
5646 int ruleid = osdmap.crush->get_rule_id(name);
5647 if (ruleid != -ENOENT) {
5648 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
5649 return -EEXIST;
5650 }
5651
5652 CrushWrapper newcrush;
5653 _get_pending_crush(newcrush);
5654
5655 ruleid = newcrush.get_rule_id(name);
5656 if (ruleid != -ENOENT) {
5657 *rule = newcrush.get_rule_mask_ruleset(ruleid);
5658 return -EALREADY;
5659 } else {
5660 ErasureCodeInterfaceRef erasure_code;
5661 int err = get_erasure_code(profile, &erasure_code, ss);
5662 if (err) {
5663 *ss << "failed to load plugin using profile " << profile << std::endl;
5664 return err;
5665 }
5666
5667 err = erasure_code->create_rule(name, newcrush, ss);
5668 erasure_code.reset();
5669 if (err < 0)
5670 return err;
5671 *rule = err;
5672 pending_inc.crush.clear();
5673 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5674 return 0;
5675 }
5676 }
5677
5678 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5679 ErasureCodeInterfaceRef *erasure_code,
5680 ostream *ss) const
5681 {
5682 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5683 return -EAGAIN;
5684 ErasureCodeProfile profile =
5685 osdmap.get_erasure_code_profile(erasure_code_profile);
5686 ErasureCodeProfile::const_iterator plugin =
5687 profile.find("plugin");
5688 if (plugin == profile.end()) {
5689 *ss << "cannot determine the erasure code plugin"
5690 << " because there is no 'plugin' entry in the erasure_code_profile "
5691 << profile << std::endl;
5692 return -EINVAL;
5693 }
5694 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5695 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5696 return instance.factory(plugin->second,
5697 g_conf->get_val<std::string>("erasure_code_dir"),
5698 profile, erasure_code, ss);
5699 }
5700
5701 int OSDMonitor::check_cluster_features(uint64_t features,
5702 stringstream &ss)
5703 {
5704 stringstream unsupported_ss;
5705 int unsupported_count = 0;
5706 if ((mon->get_quorum_con_features() & features) != features) {
5707 unsupported_ss << "the monitor cluster";
5708 ++unsupported_count;
5709 }
5710
5711 set<int32_t> up_osds;
5712 osdmap.get_up_osds(up_osds);
5713 for (set<int32_t>::iterator it = up_osds.begin();
5714 it != up_osds.end(); ++it) {
5715 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5716 if ((xi.features & features) != features) {
5717 if (unsupported_count > 0)
5718 unsupported_ss << ", ";
5719 unsupported_ss << "osd." << *it;
5720 unsupported_count ++;
5721 }
5722 }
5723
5724 if (unsupported_count > 0) {
5725 ss << "features " << features << " unsupported by: "
5726 << unsupported_ss.str();
5727 return -ENOTSUP;
5728 }
5729
5730 // check pending osd state, too!
5731 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5732 pending_inc.new_xinfo.begin();
5733 p != pending_inc.new_xinfo.end(); ++p) {
5734 const osd_xinfo_t &xi = p->second;
5735 if ((xi.features & features) != features) {
5736 dout(10) << __func__ << " pending osd." << p->first
5737 << " features are insufficient; retry" << dendl;
5738 return -EAGAIN;
5739 }
5740 }
5741
5742 return 0;
5743 }
5744
5745 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
5746 stringstream& ss)
5747 {
5748 OSDMap::Incremental new_pending = pending_inc;
5749 ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
5750 OSDMap newmap;
5751 newmap.deepish_copy_from(osdmap);
5752 newmap.apply_incremental(new_pending);
5753
5754 // client compat
5755 if (newmap.require_min_compat_client > 0) {
5756 auto mv = newmap.get_min_compat_client();
5757 if (mv > newmap.require_min_compat_client) {
5758 ss << "new crush map requires client version " << ceph_release_name(mv)
5759 << " but require_min_compat_client is "
5760 << ceph_release_name(newmap.require_min_compat_client);
5761 return false;
5762 }
5763 }
5764
5765 // osd compat
5766 uint64_t features =
5767 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
5768 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
5769 stringstream features_ss;
5770 int r = check_cluster_features(features, features_ss);
5771 if (r) {
5772 ss << "Could not change CRUSH: " << features_ss.str();
5773 return false;
5774 }
5775
5776 return true;
5777 }
5778
5779 bool OSDMonitor::erasure_code_profile_in_use(
5780 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5781 const string &profile,
5782 ostream *ss)
5783 {
5784 bool found = false;
5785 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5786 p != pools.end();
5787 ++p) {
5788 if (p->second.erasure_code_profile == profile) {
5789 *ss << osdmap.pool_name[p->first] << " ";
5790 found = true;
5791 }
5792 }
5793 if (found) {
5794 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5795 }
5796 return found;
5797 }
5798
// Build an erasure-code profile map from the configured default profile
// plus user-supplied "key=value" (or bare "key") entries.
//
// @param erasure_code_profile user-supplied profile entries
// @param erasure_code_profile_map out: merged profile map
// @param ss human-readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // Seed the result with the configured default profile (JSON string).
  int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
		           *ss,
		           erasure_code_profile_map);
  if (r)
    return r;
  // The default profile is expected to always name a plugin.
  assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // Bare key with no '=': record it with an empty value.
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      // Since luminous, "ruleset-*" properties were renamed to "crush-*".
      // Depending on config, either transparently rewrite the key (with a
      // cluster-log warning) or reject the request outright.
      if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
	  key.find("ruleset-") == 0) {
	if (g_conf->get_val<bool>("mon_fixup_legacy_erasure_code_profiles")) {
	  mon->clog->warn() << "erasure code profile property '" << key
			    << "' is no longer supported; try "
			    << "'crush-" << key.substr(8) << "' instead";
	  key = string("crush-") + key.substr(8);
	} else {
	  *ss << "property '" << key << "' is no longer supported; try "
	      << "'crush-" << key.substr(8) << "' instead";
	  return -EINVAL;
	}
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // If the user selected a different plugin than the default, the default
  // profile's settings do not apply: keep only the user-supplied entries.
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
5845
5846 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
5847 const string &erasure_code_profile,
5848 unsigned *size, unsigned *min_size,
5849 ostream *ss)
5850 {
5851 int err = 0;
5852 switch (pool_type) {
5853 case pg_pool_t::TYPE_REPLICATED:
5854 *size = g_conf->osd_pool_default_size;
5855 *min_size = g_conf->get_osd_pool_default_min_size();
5856 break;
5857 case pg_pool_t::TYPE_ERASURE:
5858 {
5859 ErasureCodeInterfaceRef erasure_code;
5860 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5861 if (err == 0) {
5862 *size = erasure_code->get_chunk_count();
5863 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
5864 }
5865 }
5866 break;
5867 default:
5868 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
5869 err = -EINVAL;
5870 break;
5871 }
5872 return err;
5873 }
5874
5875 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
5876 const string &erasure_code_profile,
5877 uint32_t *stripe_width,
5878 ostream *ss)
5879 {
5880 int err = 0;
5881 switch (pool_type) {
5882 case pg_pool_t::TYPE_REPLICATED:
5883 // ignored
5884 break;
5885 case pg_pool_t::TYPE_ERASURE:
5886 {
5887 ErasureCodeProfile profile =
5888 osdmap.get_erasure_code_profile(erasure_code_profile);
5889 ErasureCodeInterfaceRef erasure_code;
5890 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5891 if (err)
5892 break;
5893 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5894 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
5895 auto it = profile.find("stripe_unit");
5896 if (it != profile.end()) {
5897 string err_str;
5898 stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5899 assert(err_str.empty());
5900 }
5901 *stripe_width = data_chunks *
5902 erasure_code->get_chunk_size(stripe_unit * data_chunks);
5903 }
5904 break;
5905 default:
5906 *ss << "prepare_pool_stripe_width: "
5907 << pool_type << " is not a known pool type";
5908 err = -EINVAL;
5909 break;
5910 }
5911 return err;
5912 }
5913
// Resolve the CRUSH rule to use for a new pool.
//
// If *crush_rule is already >= 0 the caller picked an explicit rule and we
// only validate that it exists.  Otherwise the rule is looked up (or, for
// erasure pools, created) by name.
//
// @param pool_type pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile profile used when creating an erasure rule
// @param rule_name rule to look up/create; "" means use the default
// @param crush_rule in/out: resolved rule id
// @param ss human-readable error message, if any
// @return 0 on success; -EAGAIN if the rule must first be proposed and
//         committed; other negative errno on failure
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
                << "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// Both "just created" (0) and "creation already pending" (-EALREADY)
	// mean the rule is not committed yet: report -EAGAIN so the caller
	// retries after the proposal lands.  Only -EEXIST (rule already
	// committed) lets us proceed immediately.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // Caller supplied an explicit rule id; just verify it exists.
    // NOTE(review): ruleset_exists() is given a rule id here; rule ids and
    // ruleset ids are kept equal in this release, but confirm if that
    // invariant ever changes.
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
5975
5976 int OSDMonitor::get_crush_rule(const string &rule_name,
5977 int *crush_rule,
5978 ostream *ss)
5979 {
5980 int ret;
5981 ret = osdmap.crush->get_rule_id(rule_name);
5982 if (ret != -ENOENT) {
5983 // found it, use it
5984 *crush_rule = ret;
5985 } else {
5986 CrushWrapper newcrush;
5987 _get_pending_crush(newcrush);
5988
5989 ret = newcrush.get_rule_id(rule_name);
5990 if (ret != -ENOENT) {
5991 // found it, wait for it to be proposed
5992 dout(20) << __func__ << ": rule " << rule_name
5993 << " try again" << dendl;
5994 return -EAGAIN;
5995 } else {
5996 // Cannot find it , return error
5997 *ss << "specified rule " << rule_name << " doesn't exist";
5998 return ret;
5999 }
6000 }
6001 return 0;
6002 }
6003
6004 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6005 {
6006 auto max_pgs_per_osd = g_conf->get_val<uint64_t>("mon_max_pg_per_osd");
6007 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6008 auto max_pgs = max_pgs_per_osd * num_osds;
6009 uint64_t projected = 0;
6010 if (pool < 0) {
6011 projected += pg_num * size;
6012 }
6013 for (const auto& i : osdmap.get_pools()) {
6014 if (i.first == pool) {
6015 projected += pg_num * size;
6016 } else {
6017 projected += i.second.get_pg_num() * i.second.get_size();
6018 }
6019 }
6020 if (projected > max_pgs) {
6021 if (pool >= 0) {
6022 *ss << "pool id " << pool;
6023 }
6024 *ss << " pg_num " << pg_num << " size " << size
6025 << " would mean " << projected
6026 << " total pgs, which exceeds max " << max_pgs
6027 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6028 << " * num_in_osds " << num_osds << ")";
6029 return -ERANGE;
6030 }
6031 return 0;
6032 }
6033
6034 /**
6035 * @param name The name of the new pool
6036 * @param auid The auid of the pool owner. Can be -1
6037 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
6039 * @param pg_num The pg_num to use. If set to 0, will use the system default
6040 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6041 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE or TYPE_REPLICATED
6043 * @param expected_num_objects expected number of objects on the pool
6044 * @param fast_read fast read type.
6045 * @param ss human readable error message, if any.
6046 *
6047 * @return 0 on success, negative errno on failure.
6048 */
6049 int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
6050 int crush_rule,
6051 const string &crush_rule_name,
6052 unsigned pg_num, unsigned pgp_num,
6053 const string &erasure_code_profile,
6054 const unsigned pool_type,
6055 const uint64_t expected_num_objects,
6056 FastReadType fast_read,
6057 ostream *ss)
6058 {
6059 if (name.length() == 0)
6060 return -EINVAL;
6061 if (pg_num == 0)
6062 pg_num = g_conf->osd_pool_default_pg_num;
6063 if (pgp_num == 0)
6064 pgp_num = g_conf->osd_pool_default_pgp_num;
6065 if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
6066 *ss << "'pg_num' must be greater than 0 and less than or equal to "
6067 << g_conf->mon_max_pool_pg_num
6068 << " (you may adjust 'mon max pool pg num' for higher values)";
6069 return -ERANGE;
6070 }
6071 if (pgp_num > pg_num) {
6072 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6073 << ", which in this case is " << pg_num;
6074 return -ERANGE;
6075 }
6076 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
6077 *ss << "'fast_read' can only apply to erasure coding pool";
6078 return -EINVAL;
6079 }
6080 int r;
6081 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
6082 crush_rule_name, &crush_rule, ss);
6083 if (r) {
6084 dout(10) << " prepare_pool_crush_rule returns " << r << dendl;
6085 return r;
6086 }
6087 if (g_conf->mon_osd_crush_smoke_test) {
6088 CrushWrapper newcrush;
6089 _get_pending_crush(newcrush);
6090 ostringstream err;
6091 CrushTester tester(newcrush, err);
6092 tester.set_min_x(0);
6093 tester.set_max_x(50);
6094 tester.set_rule(crush_rule);
6095 auto start = ceph::coarse_mono_clock::now();
6096 r = tester.test_with_fork(g_conf->mon_lease);
6097 auto duration = ceph::coarse_mono_clock::now() - start;
6098 if (r < 0) {
6099 dout(10) << " tester.test_with_fork returns " << r
6100 << ": " << err.str() << dendl;
6101 *ss << "crush test failed with " << r << ": " << err.str();
6102 return r;
6103 }
6104 dout(10) << __func__ << " crush smoke test duration: "
6105 << duration << dendl;
6106 }
6107 unsigned size, min_size;
6108 r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
6109 if (r) {
6110 dout(10) << " prepare_pool_size returns " << r << dendl;
6111 return r;
6112 }
6113 r = check_pg_num(-1, pg_num, size, ss);
6114 if (r) {
6115 dout(10) << " prepare_pool_size returns " << r << dendl;
6116 return r;
6117 }
6118
6119 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
6120 return -EINVAL;
6121 }
6122
6123 uint32_t stripe_width = 0;
6124 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
6125 if (r) {
6126 dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
6127 return r;
6128 }
6129
6130 bool fread = false;
6131 if (pool_type == pg_pool_t::TYPE_ERASURE) {
6132 switch (fast_read) {
6133 case FAST_READ_OFF:
6134 fread = false;
6135 break;
6136 case FAST_READ_ON:
6137 fread = true;
6138 break;
6139 case FAST_READ_DEFAULT:
6140 fread = g_conf->mon_osd_pool_ec_fast_read;
6141 break;
6142 default:
6143 *ss << "invalid fast_read setting: " << fast_read;
6144 return -EINVAL;
6145 }
6146 }
6147
6148 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
6149 p != pending_inc.new_pool_names.end();
6150 ++p) {
6151 if (p->second == name)
6152 return 0;
6153 }
6154
6155 if (-1 == pending_inc.new_pool_max)
6156 pending_inc.new_pool_max = osdmap.pool_max;
6157 int64_t pool = ++pending_inc.new_pool_max;
6158 pg_pool_t empty;
6159 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
6160 pi->type = pool_type;
6161 pi->fast_read = fread;
6162 pi->flags = g_conf->osd_pool_default_flags;
6163 if (g_conf->osd_pool_default_flag_hashpspool)
6164 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
6165 if (g_conf->osd_pool_default_flag_nodelete)
6166 pi->set_flag(pg_pool_t::FLAG_NODELETE);
6167 if (g_conf->osd_pool_default_flag_nopgchange)
6168 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
6169 if (g_conf->osd_pool_default_flag_nosizechange)
6170 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
6171 if (g_conf->osd_pool_use_gmt_hitset &&
6172 (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
6173 pi->use_gmt_hitset = true;
6174 else
6175 pi->use_gmt_hitset = false;
6176
6177 pi->size = size;
6178 pi->min_size = min_size;
6179 pi->crush_rule = crush_rule;
6180 pi->expected_num_objects = expected_num_objects;
6181 pi->object_hash = CEPH_STR_HASH_RJENKINS;
6182 pi->set_pg_num(pg_num);
6183 pi->set_pgp_num(pgp_num);
6184 pi->last_change = pending_inc.epoch;
6185 pi->auid = auid;
6186 pi->erasure_code_profile = erasure_code_profile;
6187 pi->stripe_width = stripe_width;
6188 pi->cache_target_dirty_ratio_micro =
6189 g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
6190 pi->cache_target_dirty_high_ratio_micro =
6191 g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
6192 pi->cache_target_full_ratio_micro =
6193 g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
6194 pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
6195 pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
6196 pending_inc.new_pool_names[pool] = name;
6197 return 0;
6198 }
6199
6200 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
6201 {
6202 op->mark_osdmon_event(__func__);
6203 ostringstream ss;
6204 if (pending_inc.new_flags < 0)
6205 pending_inc.new_flags = osdmap.get_flags();
6206 pending_inc.new_flags |= flag;
6207 ss << OSDMap::get_flag_string(flag) << " is set";
6208 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6209 get_last_committed() + 1));
6210 return true;
6211 }
6212
6213 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
6214 {
6215 op->mark_osdmon_event(__func__);
6216 ostringstream ss;
6217 if (pending_inc.new_flags < 0)
6218 pending_inc.new_flags = osdmap.get_flags();
6219 pending_inc.new_flags &= ~flag;
6220 ss << OSDMap::get_flag_string(flag) << " is unset";
6221 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6222 get_last_committed() + 1));
6223 return true;
6224 }
6225
// Handle "osd pool set <pool> <var> <val>": validate the requested change
// against a working copy of the pool (committed state overlaid with any
// pending change) and, on success, stage the updated pool in pending_inc.
//
// @param cmdmap parsed command arguments (pool, var, val, optional force)
// @param ss human-readable outcome message
// @return 0 on success, negative errno on rejection
int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(g_ceph_context, cmdmap, "var", var);

  // Work on a copy; overlay any change already pending for this pool so
  // successive "pool set" commands in one epoch compose correctly.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
    // wasn't a string; maybe an older mon forwarded json with an int?
    if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
      return -EINVAL; // no value!
  } else {
    // we got a string. see if it contains an int.
    n = strict_strtoll(val.c_str(), 10, &interr);
    // or a float
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // These variables only make sense on a cache-tier pool.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  // Per-variable validation and assignment.
  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // keep min_size <= size
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For erasure pools min_size must cover at least the data chunks.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "auid") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.auid = n;
  } else if (var == "crash_replay_interval") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.crash_replay_interval = n;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    // pg_num can only grow; same value is an idempotent success.
    if (n <= (int)p.get_pg_num()) {
      ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
      if (n < (int)p.get_pg_num())
	return -EEXIST;
      return 0;
    }
    if (n > (unsigned)g_conf->mon_max_pool_pg_num) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf->mon_max_pool_pg_num
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    int r = check_pg_num(pool, n, p.get_size(), &ss);
    if (r) {
      return r;
    }
    string force;
    cmd_getval(g_ceph_context,cmdmap, "force", force);
    if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
	force != "--yes-i-really-mean-it") {
      ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
      return -EPERM;
    }
    // Cap how many new PGs a single split may create per OSD.
    int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
    int64_t new_pgs = n - p.get_pg_num();
    if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
      ss << "specified pg_num " << n << " is too large (creating "
	 << new_pgs << " new PGs on ~" << expected_osds
	 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
	 << ')';
      return -E2BIG;
    }
    p.set_pg_num(n);
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // Simple boolean pool flags.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // hashpspool changes remap every PG, so require explicit confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    string force;
    cmd_getval(g_ceph_context, cmdmap, "force", force);
    if (force != "--yes-i-really-mean-it") {
      ss << "are you SURE? this will remap all placement groups in this pool,"
	 " this triggers large data movement,"
	 " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    if (val == "true" || (interr.empty() && n == 1)) {
      string force;
      cmd_getval(g_ceph_context, cmdmap, "force", force);
      if (!osdmap.get_num_up_osds() && force != "--yes-i-really-mean-it") {
	ss << "Not advisable to continue since no OSDs are up. Pass "
	   << "--yes-i-really-mean-it if you really wish to continue.";
	return -EPERM;
      }
      if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)
	  && force != "--yes-i-really-mean-it") {
	ss << "not all OSDs support GMT hit set.";
	return -EINVAL;
      }
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // one-way switch: cannot be disabled once enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // ratios are stored in micro units (uf = llrint(f * 1e6))
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic per-pool option (compression, checksum, etc.).  Validate the
    // option-specific constraints first, then store/unset by declared type.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    }

    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // Stage the modified pool in the pending incremental.
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
6729
/**
 * Implement the `osd pool application <enable|disable|set|rm>` family of
 * commands by editing a pool's application metadata.
 *
 * The updated pool is staged in pending_inc.new_pools; the caller is
 * responsible for proposing the pending map.
 *
 * @param prefix command prefix; its trailing word selects the operation
 * @param cmdmap parsed command arguments ("pool", "app", "force", "key",
 *               "value")
 * @param ss     filled with a status or error message for the client
 * @return 0 on success or idempotent no-op, negative errno otherwise
 */
int OSDMonitor::prepare_command_pool_application(const string &prefix,
                                                 map<string,cmd_vartype> &cmdmap,
                                                 stringstream& ss)
{
  string pool_name;
  cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    return -ENOENT;
  }

  // work on a copy of the pool, preferring any not-yet-committed version.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool)) {
    p = pending_inc.new_pools[pool];
  }

  string app;
  cmd_getval(g_ceph_context, cmdmap, "app", app);
  bool app_exists = (p.application_metadata.count(app) > 0);

  if (boost::algorithm::ends_with(prefix, "enable")) {
    if (app.empty()) {
      ss << "application name must be provided";
      return -EINVAL;
    }

    if (p.is_tier()) {
      ss << "application must be enabled on base tier";
      return -EINVAL;
    }

    string force;
    cmd_getval(g_ceph_context, cmdmap, "force", force);

    // enabling a second application on a pool is almost always a mistake;
    // require explicit confirmation.
    if (!app_exists && !p.application_metadata.empty() &&
        force != "--yes-i-really-mean-it") {
      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
         << "application; pass --yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
      ss << "too many enabled applications on pool '" << pool_name << "'; "
         << "max " << MAX_POOL_APPLICATIONS;
      return -EINVAL;
    }

    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "application name '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (!app_exists) {
      p.application_metadata[app] = {};
    }
    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "disable")) {
    string force;
    cmd_getval(g_ceph_context, cmdmap, "force", force);

    if (force != "--yes-i-really-mean-it") {
      ss << "Are you SURE? Disabling an application within a pool might result "
         << "in loss of application functionality; pass "
         << "--yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return 0; // idempotent
    }

    p.application_metadata.erase(app);
    ss << "disable application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "set")) {
    if (p.is_tier()) {
      ss << "application metadata must be set on base tier";
      return -EINVAL;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    string key;
    cmd_getval(g_ceph_context, cmdmap, "key", key);

    if (key.empty()) {
      ss << "key must be provided";
      return -EINVAL;
    }

    auto &app_keys = p.application_metadata[app];
    if (app_keys.count(key) == 0 &&
        app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
      ss << "too many keys set for application '" << app << "' on pool '"
         << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
      return -EINVAL;
    }

    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
      // bugfix: report the offending key; this message previously printed
      // the application name instead of the key.
      ss << "key '" << key << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    string value;
    cmd_getval(g_ceph_context, cmdmap, "value", value);
    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "value '" << value << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    p.application_metadata[app][key] = value;
    ss << "set application '" << app << "' key '" << key << "' to '"
       << value << "' on pool '" << pool_name << "'";
  } else if (boost::algorithm::ends_with(prefix, "rm")) {
    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    string key;
    cmd_getval(g_ceph_context, cmdmap, "key", key);
    auto it = p.application_metadata[app].find(key);
    if (it == p.application_metadata[app].end()) {
      ss << "application '" << app << "' on pool '" << pool_name
         << "' does not have key '" << key << "'";
      return 0; // idempotent
    }

    p.application_metadata[app].erase(it);
    ss << "removed application '" << app << "' key '" << key << "' on pool '"
       << pool_name << "'";
  } else {
    assert(false);
  }

  // stage the modified pool for the next map epoch.
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
6881
6882 int OSDMonitor::_prepare_command_osd_crush_remove(
6883 CrushWrapper &newcrush,
6884 int32_t id,
6885 int32_t ancestor,
6886 bool has_ancestor,
6887 bool unlink_only)
6888 {
6889 int err = 0;
6890
6891 if (has_ancestor) {
6892 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
6893 unlink_only);
6894 } else {
6895 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
6896 }
6897 return err;
6898 }
6899
6900 void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
6901 {
6902 pending_inc.crush.clear();
6903 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6904 }
6905
6906 int OSDMonitor::prepare_command_osd_crush_remove(
6907 CrushWrapper &newcrush,
6908 int32_t id,
6909 int32_t ancestor,
6910 bool has_ancestor,
6911 bool unlink_only)
6912 {
6913 int err = _prepare_command_osd_crush_remove(
6914 newcrush, id, ancestor,
6915 has_ancestor, unlink_only);
6916
6917 if (err < 0)
6918 return err;
6919
6920 assert(err == 0);
6921 do_osd_crush_remove(newcrush);
6922
6923 return 0;
6924 }
6925
6926 int OSDMonitor::prepare_command_osd_remove(int32_t id)
6927 {
6928 if (osdmap.is_up(id)) {
6929 return -EBUSY;
6930 }
6931
6932 pending_inc.new_state[id] = osdmap.get_state(id);
6933 pending_inc.new_uuid[id] = uuid_d();
6934 pending_metadata_rm.insert(id);
6935 pending_metadata.erase(id);
6936
6937 return 0;
6938 }
6939
6940 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
6941 {
6942 assert(existing_id);
6943 *existing_id = -1;
6944
6945 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
6946 if (!osdmap.exists(i) &&
6947 pending_inc.new_up_client.count(i) == 0 &&
6948 (pending_inc.new_state.count(i) == 0 ||
6949 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
6950 *existing_id = i;
6951 return -1;
6952 }
6953 }
6954
6955 if (pending_inc.new_max_osd < 0) {
6956 return osdmap.get_max_osd();
6957 }
6958 return pending_inc.new_max_osd;
6959 }
6960
/**
 * Stage creation of an osd in the pending incremental.
 *
 * Resolves the id to use (from `uuid`, the explicit `id`, or a freshly
 * allocated one), optionally records a crush device class, raises max_osd
 * when needed, and marks the osd EXISTS|NEW in pending state.
 *
 * @param id           explicit osd id, or negative to pick one
 * @param uuid         osd uuid; zero means "no uuid" (legacy `osd create`)
 * @param device_class crush device class to assign, or empty for none
 * @param new_id       out: the id the osd will be created with
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // a known uuid wins: reuse its id (it must agree with `id` if given).
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycling a free slot below max_osd; exactly one of the two outputs
    // of _allocate_osd_id is valid at a time.
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage a crush update assigning the requested device class.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // ensure max_osd covers the chosen id even on the goto-out fast paths.
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
7049
/**
 * Validate whether an osd with the given id/uuid can be created.
 *
 * Return convention (note the sign!):
 *   0        creation may proceed (or nothing to validate)
 *   EEXIST   (positive) osd already exists with a matching id/uuid;
 *            *existing_id is set — the operation is idempotent
 *   -EAGAIN  an osd with this uuid/id is pending but not yet committed
 *   -EEXIST  uuid already bound to a *different* id
 *   -EINVAL  id already in use and does not match the uuid
 *
 * @param id               explicit osd id, or negative if none
 * @param uuid             osd uuid; zero means "no uuid"
 * @param check_osd_exists whether an existing (non-destroyed) id is an error
 * @param existing_id      out: id of the already-existing osd on EEXIST
 * @param ss               error message for the client
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
7120
7121 int OSDMonitor::prepare_command_osd_create(
7122 const int32_t id,
7123 const uuid_d& uuid,
7124 int32_t* existing_id,
7125 stringstream& ss)
7126 {
7127 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
7128 assert(existing_id);
7129 if (osdmap.is_destroyed(id)) {
7130 ss << "ceph osd create has been deprecated. Please use ceph osd new "
7131 "instead.";
7132 return -EINVAL;
7133 }
7134
7135 if (uuid.is_zero()) {
7136 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
7137 }
7138
7139 return validate_osd_create(id, uuid, true, existing_id, ss);
7140 }
7141
/**
 * Handle `osd new`: create a brand new osd, or recreate a previously
 * destroyed one, coordinating the osdmon, authmon (cephx secrets) and the
 * config-key service (dm-crypt lockbox).
 *
 * @param op     the originating monitor operation (paxos must be plugged)
 * @param cmdmap parsed command args: "uuid" (required), "id" (optional)
 * @param params optional secrets/metadata: cephx_secret,
 *               cephx_lockbox_secret, dmcrypt_key, crush_device_class
 * @param ss     status / error message for the client
 * @param f      optional formatter for structured output
 * @return 0 on creation, positive EEXIST on an idempotent replay,
 *         negative errno on error
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const map<string,cmd_vartype>& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // skip the exists-check when recreating a destroyed osd: its id is
  // expected to be present in the map.
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    //       `osd create`, and we must honor it. So this means checking if
    //       the `id` is destroyed, and if so assume the destroy; otherwise,
    //       check if it `exists` - in which case we complain about not being
    //       `destroyed`. In the end, if nothing fails, we must allow the
    //       creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    assert(id >= 0);
    assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key come as a pair: both or neither.
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  assert(!has_secrets || !cephx_secret.empty());
  assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    assert(!cephx_secret.empty());
    assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    assert(0 == err);

    if (has_lockbox) {
      assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    assert(id >= 0);
    assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): new_state bits appear to be applied as a toggle/XOR
    // against the current state (the existing comment below relies on
    // that to *clear* UP); confirm against Incremental application code.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    assert(new_id >= 0);
    assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
7403
7404 bool OSDMonitor::prepare_command(MonOpRequestRef op)
7405 {
7406 op->mark_osdmon_event(__func__);
7407 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7408 stringstream ss;
7409 map<string, cmd_vartype> cmdmap;
7410 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
7411 string rs = ss.str();
7412 mon->reply_command(op, -EINVAL, rs, get_last_committed());
7413 return true;
7414 }
7415
7416 MonSession *session = m->get_session();
7417 if (!session) {
7418 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
7419 return true;
7420 }
7421
7422 return prepare_command_impl(op, cmdmap);
7423 }
7424
7425 static int parse_reweights(CephContext *cct,
7426 const map<string,cmd_vartype> &cmdmap,
7427 const OSDMap& osdmap,
7428 map<int32_t, uint32_t>* weights)
7429 {
7430 string weights_str;
7431 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
7432 return -EINVAL;
7433 }
7434 std::replace(begin(weights_str), end(weights_str), '\'', '"');
7435 json_spirit::mValue json_value;
7436 if (!json_spirit::read(weights_str, json_value)) {
7437 return -EINVAL;
7438 }
7439 if (json_value.type() != json_spirit::obj_type) {
7440 return -EINVAL;
7441 }
7442 const auto obj = json_value.get_obj();
7443 try {
7444 for (auto& osd_weight : obj) {
7445 auto osd_id = std::stoi(osd_weight.first);
7446 if (!osdmap.exists(osd_id)) {
7447 return -ENOENT;
7448 }
7449 if (osd_weight.second.type() != json_spirit::str_type) {
7450 return -EINVAL;
7451 }
7452 auto weight = std::stoul(osd_weight.second.get_str());
7453 weights->insert({osd_id, weight});
7454 }
7455 } catch (const std::logic_error& e) {
7456 return -EINVAL;
7457 }
7458 return 0;
7459 }
7460
/**
 * Mark osd `id` as destroyed: revoke its auth entities, wipe its
 * config-key (dm-crypt) data, and stage DESTROYED state plus a blank uuid
 * in the pending incremental.
 *
 * The caller is responsible for proposing the pending map (paxos must be
 * plugged); this function only stages changes.
 *
 * @return 0 on success or idempotent replay, -ENOENT if the osd does not
 *         exist, other negative errno from auth validation
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // -ENOENT from either service means that part was already cleaned up
  // (e.g. a replay); treat it as idempotent rather than an error.
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  assert(err == 0);
  return 0;
}
7532
/**
 * Purge osd `id` entirely: remove it from crush, destroy its auth/config
 * state, and remove it from the osdmap. All changes are staged; the caller
 * proposes the pending map (paxos must be plugged).
 *
 * @return 0 on success, -ENOENT if there was nothing to purge,
 *         other negative errno from the destroy/crush steps
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal on a scratch copy; -ENOENT just
  // means the osd is already gone from crush.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: destroy (auth + config-key + DESTROYED state).
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: remove from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  assert(0 == err);

  // step 4: commit the crush removal prepared in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
7601
7602 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
7603 map<string,cmd_vartype> &cmdmap)
7604 {
7605 op->mark_osdmon_event(__func__);
7606 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7607 bool ret = false;
7608 stringstream ss;
7609 string rs;
7610 bufferlist rdata;
7611 int err = 0;
7612
7613 string format;
7614 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
7615 boost::scoped_ptr<Formatter> f(Formatter::create(format));
7616
7617 string prefix;
7618 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
7619
7620 int64_t osdid;
7621 string name;
7622 bool osdid_present = false;
7623 if (prefix != "osd pg-temp" &&
7624 prefix != "osd pg-upmap" &&
7625 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
7626 osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
7627 }
7628 if (osdid_present) {
7629 ostringstream oss;
7630 oss << "osd." << osdid;
7631 name = oss.str();
7632 }
7633
7634 // Even if there's a pending state with changes that could affect
7635 // a command, considering that said state isn't yet committed, we
7636 // just don't care about those changes if the command currently being
7637 // handled acts as a no-op against the current committed state.
7638 // In a nutshell, we assume this command happens *before*.
7639 //
7640 // Let me make this clearer:
7641 //
7642 // - If we have only one client, and that client issues some
7643 // operation that would conflict with this operation but is
7644 // still on the pending state, then we would be sure that said
7645 // operation wouldn't have returned yet, so the client wouldn't
7646 // issue this operation (unless the client didn't wait for the
7647 // operation to finish, and that would be the client's own fault).
7648 //
7649 // - If we have more than one client, each client will observe
7650 // whatever is the state at the moment of the commit. So, if we
7651 // have two clients, one issuing an unlink and another issuing a
7652 // link, and if the link happens while the unlink is still on the
7653 // pending state, from the link's point-of-view this is a no-op.
7654 // If different clients are issuing conflicting operations and
7655 // they care about that, then the clients should make sure they
7656 // enforce some kind of concurrency mechanism -- from our
7657 // perspective that's what Douglas Adams would call an SEP.
7658 //
7659 // This should be used as a general guideline for most commands handled
7660 // in this function. Adapt as you see fit, but please bear in mind that
7661 // this is the expected behavior.
7662
7663
7664 if (prefix == "osd setcrushmap" ||
7665 (prefix == "osd crush set" && !osdid_present)) {
7666 if (pending_inc.crush.length()) {
7667 dout(10) << __func__ << " waiting for pending crush update " << dendl;
7668 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
7669 return true;
7670 }
7671 dout(10) << "prepare_command setting new crush map" << dendl;
7672 bufferlist data(m->get_data());
7673 CrushWrapper crush;
7674 try {
7675 bufferlist::iterator bl(data.begin());
7676 crush.decode(bl);
7677 }
7678 catch (const std::exception &e) {
7679 err = -EINVAL;
7680 ss << "Failed to parse crushmap: " << e.what();
7681 goto reply;
7682 }
7683
7684 int64_t prior_version = 0;
7685 if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
7686 if (prior_version == osdmap.get_crush_version() - 1) {
7687 // see if we are a resend of the last update. this is imperfect
7688 // (multiple racing updaters may not both get reliable success)
7689 // but we expect crush updaters (via this interface) to be rare-ish.
7690 bufferlist current, proposed;
7691 osdmap.crush->encode(current, mon->get_quorum_con_features());
7692 crush.encode(proposed, mon->get_quorum_con_features());
7693 if (current.contents_equal(proposed)) {
7694 dout(10) << __func__
7695 << " proposed matches current and version equals previous"
7696 << dendl;
7697 err = 0;
7698 ss << osdmap.get_crush_version();
7699 goto reply;
7700 }
7701 }
7702 if (prior_version != osdmap.get_crush_version()) {
7703 err = -EPERM;
7704 ss << "prior_version " << prior_version << " != crush version "
7705 << osdmap.get_crush_version();
7706 goto reply;
7707 }
7708 }
7709
7710 if (crush.has_legacy_rule_ids()) {
7711 err = -EINVAL;
7712 ss << "crush maps with ruleset != ruleid are no longer allowed";
7713 goto reply;
7714 }
7715 if (!validate_crush_against_features(&crush, ss)) {
7716 err = -EINVAL;
7717 goto reply;
7718 }
7719
7720 err = osdmap.validate_crush_rules(&crush, &ss);
7721 if (err < 0) {
7722 goto reply;
7723 }
7724
7725 if (g_conf->mon_osd_crush_smoke_test) {
7726 // sanity check: test some inputs to make sure this map isn't
7727 // totally broken
7728 dout(10) << " testing map" << dendl;
7729 stringstream ess;
7730 CrushTester tester(crush, ess);
7731 tester.set_min_x(0);
7732 tester.set_max_x(50);
7733 auto start = ceph::coarse_mono_clock::now();
7734 int r = tester.test_with_fork(g_conf->mon_lease);
7735 auto duration = ceph::coarse_mono_clock::now() - start;
7736 if (r < 0) {
7737 dout(10) << " tester.test_with_fork returns " << r
7738 << ": " << ess.str() << dendl;
7739 ss << "crush smoke test failed with " << r << ": " << ess.str();
7740 err = r;
7741 goto reply;
7742 }
7743 dout(10) << __func__ << " crush somke test duration: "
7744 << duration << ", result: " << ess.str() << dendl;
7745 }
7746
7747 pending_inc.crush = data;
7748 ss << osdmap.get_crush_version() + 1;
7749 goto update;
7750
7751 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
7752 CrushWrapper newcrush;
7753 _get_pending_crush(newcrush);
7754 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
7755 int bid = -1 - b;
7756 if (newcrush.bucket_exists(bid) &&
7757 newcrush.get_bucket_alg(bid)) {
7758 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
7759 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
7760 }
7761 }
7762 if (!validate_crush_against_features(&newcrush, ss)) {
7763 err = -EINVAL;
7764 goto reply;
7765 }
7766 pending_inc.crush.clear();
7767 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7768 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7769 get_last_committed() + 1));
7770 return true;
7771 } else if (prefix == "osd crush set-device-class") {
7772 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7773 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7774 << "luminous' before using crush device classes";
7775 err = -EPERM;
7776 goto reply;
7777 }
7778
7779 string device_class;
7780 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7781 err = -EINVAL; // no value!
7782 goto reply;
7783 }
7784
7785 bool stop = false;
7786 vector<string> idvec;
7787 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7788 CrushWrapper newcrush;
7789 _get_pending_crush(newcrush);
7790 set<int> updated;
7791 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7792 set<int> osds;
7793 // wildcard?
7794 if (j == 0 &&
7795 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7796 osdmap.get_all_osds(osds);
7797 stop = true;
7798 } else {
7799 // try traditional single osd way
7800 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7801 if (osd < 0) {
7802 // ss has reason for failure
7803 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7804 err = -EINVAL;
7805 continue;
7806 }
7807 osds.insert(osd);
7808 }
7809
7810 for (auto &osd : osds) {
7811 if (!osdmap.exists(osd)) {
7812 ss << "osd." << osd << " does not exist. ";
7813 continue;
7814 }
7815
7816 ostringstream oss;
7817 oss << "osd." << osd;
7818 string name = oss.str();
7819
7820 if (newcrush.get_max_devices() < osd + 1) {
7821 newcrush.set_max_devices(osd + 1);
7822 }
7823 string action;
7824 if (newcrush.item_exists(osd)) {
7825 action = "updating";
7826 } else {
7827 action = "creating";
7828 newcrush.set_item_name(osd, name);
7829 }
7830
7831 dout(5) << action << " crush item id " << osd << " name '" << name
7832 << "' device_class '" << device_class << "'"
7833 << dendl;
7834 err = newcrush.update_device_class(osd, device_class, name, &ss);
7835 if (err < 0) {
7836 goto reply;
7837 }
7838 if (err == 0 && !_have_pending_crush()) {
7839 if (!stop) {
7840 // for single osd only, wildcard makes too much noise
7841 ss << "set-device-class item id " << osd << " name '" << name
7842 << "' device_class '" << device_class << "': no change";
7843 }
7844 } else {
7845 updated.insert(osd);
7846 }
7847 }
7848 }
7849
7850 if (!updated.empty()) {
7851 pending_inc.crush.clear();
7852 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7853 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
7854 getline(ss, rs);
7855 wait_for_finished_proposal(op,
7856 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7857 return true;
7858 }
7859
7860 } else if (prefix == "osd crush rm-device-class") {
7861 bool stop = false;
7862 vector<string> idvec;
7863 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7864 CrushWrapper newcrush;
7865 _get_pending_crush(newcrush);
7866 set<int> updated;
7867
7868 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7869 set<int> osds;
7870
7871 // wildcard?
7872 if (j == 0 &&
7873 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7874 osdmap.get_all_osds(osds);
7875 stop = true;
7876 } else {
7877 // try traditional single osd way
7878 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7879 if (osd < 0) {
7880 // ss has reason for failure
7881 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7882 err = -EINVAL;
7883 goto reply;
7884 }
7885 osds.insert(osd);
7886 }
7887
7888 for (auto &osd : osds) {
7889 if (!osdmap.exists(osd)) {
7890 ss << "osd." << osd << " does not exist. ";
7891 continue;
7892 }
7893
7894 auto class_name = newcrush.get_item_class(osd);
7895 if (!class_name) {
7896 ss << "osd." << osd << " belongs to no class, ";
7897 continue;
7898 }
      // Note that we deliberately skip the class_is_in_use check here:
      // if the device was misclassified, the user must still be able to
      // forcibly remove its class and reset it.
7902
7903 err = newcrush.remove_device_class(g_ceph_context, osd, &ss);
7904 if (err < 0) {
7905 // ss has reason for failure
7906 goto reply;
7907 }
7908 updated.insert(osd);
7909 }
7910 }
7911
7912 if (!updated.empty()) {
7913 pending_inc.crush.clear();
7914 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7915 ss << "done removing class of osd(s): " << updated;
7916 getline(ss, rs);
7917 wait_for_finished_proposal(op,
7918 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7919 return true;
7920 }
7921 } else if (prefix == "osd crush class rename") {
7922 string srcname, dstname;
7923 if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
7924 err = -EINVAL;
7925 goto reply;
7926 }
7927 if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
7928 err = -EINVAL;
7929 goto reply;
7930 }
7931
7932 CrushWrapper newcrush;
7933 _get_pending_crush(newcrush);
7934 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
7935 // suppose this is a replay and return success
7936 // so command is idempotent
7937 ss << "already renamed to '" << dstname << "'";
7938 err = 0;
7939 goto reply;
7940 }
7941
7942 err = newcrush.rename_class(srcname, dstname);
7943 if (err < 0) {
7944 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
7945 << cpp_strerror(err);
7946 goto reply;
7947 }
7948
7949 pending_inc.crush.clear();
7950 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7951 ss << "rename class '" << srcname << "' to '" << dstname << "'";
7952 goto update;
7953 } else if (prefix == "osd crush add-bucket") {
    // osd crush add-bucket <name> <type>
7955 string name, typestr;
7956 cmd_getval(g_ceph_context, cmdmap, "name", name);
7957 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
7958
7959 if (!_have_pending_crush() &&
7960 _get_stable_crush().name_exists(name)) {
7961 ss << "bucket '" << name << "' already exists";
7962 goto reply;
7963 }
7964
7965 CrushWrapper newcrush;
7966 _get_pending_crush(newcrush);
7967
7968 if (newcrush.name_exists(name)) {
7969 ss << "bucket '" << name << "' already exists";
7970 goto update;
7971 }
7972 int type = newcrush.get_type_id(typestr);
7973 if (type < 0) {
7974 ss << "type '" << typestr << "' does not exist";
7975 err = -EINVAL;
7976 goto reply;
7977 }
7978 if (type == 0) {
7979 ss << "type '" << typestr << "' is for devices, not buckets";
7980 err = -EINVAL;
7981 goto reply;
7982 }
7983 int bucketno;
7984 err = newcrush.add_bucket(0, 0,
7985 CRUSH_HASH_DEFAULT, type, 0, NULL,
7986 NULL, &bucketno);
7987 if (err < 0) {
7988 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
7989 goto reply;
7990 }
7991 err = newcrush.set_item_name(bucketno, name);
7992 if (err < 0) {
7993 ss << "error setting bucket name to '" << name << "'";
7994 goto reply;
7995 }
7996
7997 pending_inc.crush.clear();
7998 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7999 ss << "added bucket " << name << " type " << typestr
8000 << " to crush map";
8001 goto update;
8002 } else if (prefix == "osd crush rename-bucket") {
8003 string srcname, dstname;
8004 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
8005 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
8006
8007 err = crush_rename_bucket(srcname, dstname, &ss);
8008 if (err == -EALREADY) // equivalent to success for idempotency
8009 err = 0;
8010 if (err)
8011 goto reply;
8012 else
8013 goto update;
8014 } else if (prefix == "osd crush weight-set create" ||
8015 prefix == "osd crush weight-set create-compat") {
8016 CrushWrapper newcrush;
8017 _get_pending_crush(newcrush);
8018 int64_t pool;
8019 int positions;
8020 if (newcrush.has_non_straw2_buckets()) {
8021 ss << "crush map contains one or more bucket(s) that are not straw2";
8022 err = -EPERM;
8023 goto reply;
8024 }
8025 if (prefix == "osd crush weight-set create") {
8026 if (osdmap.require_min_compat_client > 0 &&
8027 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8028 ss << "require_min_compat_client "
8029 << ceph_release_name(osdmap.require_min_compat_client)
8030 << " < luminous, which is required for per-pool weight-sets. "
8031 << "Try 'ceph osd set-require-min-compat-client luminous' "
8032 << "before using the new interface";
8033 err = -EPERM;
8034 goto reply;
8035 }
8036 string poolname, mode;
8037 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8038 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8039 if (pool < 0) {
8040 ss << "pool '" << poolname << "' not found";
8041 err = -ENOENT;
8042 goto reply;
8043 }
8044 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8045 if (mode != "flat" && mode != "positional") {
8046 ss << "unrecognized weight-set mode '" << mode << "'";
8047 err = -EINVAL;
8048 goto reply;
8049 }
8050 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
8051 } else {
8052 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8053 positions = 1;
8054 }
8055 newcrush.create_choose_args(pool, positions);
8056 pending_inc.crush.clear();
8057 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8058 goto update;
8059
8060 } else if (prefix == "osd crush weight-set rm" ||
8061 prefix == "osd crush weight-set rm-compat") {
8062 CrushWrapper newcrush;
8063 _get_pending_crush(newcrush);
8064 int64_t pool;
8065 if (prefix == "osd crush weight-set rm") {
8066 string poolname;
8067 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8068 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8069 if (pool < 0) {
8070 ss << "pool '" << poolname << "' not found";
8071 err = -ENOENT;
8072 goto reply;
8073 }
8074 } else {
8075 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8076 }
8077 newcrush.rm_choose_args(pool);
8078 pending_inc.crush.clear();
8079 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8080 goto update;
8081
8082 } else if (prefix == "osd crush weight-set reweight" ||
8083 prefix == "osd crush weight-set reweight-compat") {
8084 string poolname, item;
8085 vector<double> weight;
8086 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8087 cmd_getval(g_ceph_context, cmdmap, "item", item);
8088 cmd_getval(g_ceph_context, cmdmap, "weight", weight);
8089 CrushWrapper newcrush;
8090 _get_pending_crush(newcrush);
8091 int64_t pool;
8092 if (prefix == "osd crush weight-set reweight") {
8093 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8094 if (pool < 0) {
8095 ss << "pool '" << poolname << "' not found";
8096 err = -ENOENT;
8097 goto reply;
8098 }
8099 if (!newcrush.have_choose_args(pool)) {
8100 ss << "no weight-set for pool '" << poolname << "'";
8101 err = -ENOENT;
8102 goto reply;
8103 }
8104 auto arg_map = newcrush.choose_args_get(pool);
8105 int positions = newcrush.get_choose_args_positions(arg_map);
8106 if (weight.size() != (size_t)positions) {
8107 ss << "must specify exact " << positions << " weight values";
8108 err = -EINVAL;
8109 goto reply;
8110 }
8111 } else {
8112 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8113 if (!newcrush.have_choose_args(pool)) {
8114 ss << "no backward-compatible weight-set";
8115 err = -ENOENT;
8116 goto reply;
8117 }
8118 }
8119 if (!newcrush.name_exists(item)) {
8120 ss << "item '" << item << "' does not exist";
8121 err = -ENOENT;
8122 goto reply;
8123 }
8124 err = newcrush.choose_args_adjust_item_weightf(
8125 g_ceph_context,
8126 newcrush.choose_args_get(pool),
8127 newcrush.get_item_id(item),
8128 weight,
8129 &ss);
8130 if (err < 0) {
8131 goto reply;
8132 }
8133 err = 0;
8134 pending_inc.crush.clear();
8135 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8136 goto update;
8137 } else if (osdid_present &&
8138 (prefix == "osd crush set" || prefix == "osd crush add")) {
8139 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
8140 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
8141 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
8142
8143 if (!osdmap.exists(osdid)) {
8144 err = -ENOENT;
8145 ss << name << " does not exist. Create it before updating the crush map";
8146 goto reply;
8147 }
8148
8149 double weight;
8150 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
8151 ss << "unable to parse weight value '"
8152 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8153 err = -EINVAL;
8154 goto reply;
8155 }
8156
8157 string args;
8158 vector<string> argvec;
8159 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8160 map<string,string> loc;
8161 CrushWrapper::parse_loc_map(argvec, &loc);
8162
8163 if (prefix == "osd crush set"
8164 && !_get_stable_crush().item_exists(osdid)) {
8165 err = -ENOENT;
8166 ss << "unable to set item id " << osdid << " name '" << name
8167 << "' weight " << weight << " at location " << loc
8168 << ": does not exist";
8169 goto reply;
8170 }
8171
8172 dout(5) << "adding/updating crush item id " << osdid << " name '"
8173 << name << "' weight " << weight << " at location "
8174 << loc << dendl;
8175 CrushWrapper newcrush;
8176 _get_pending_crush(newcrush);
8177
8178 string action;
8179 if (prefix == "osd crush set" ||
8180 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
8181 action = "set";
8182 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
8183 } else {
8184 action = "add";
8185 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
8186 if (err == 0)
8187 err = 1;
8188 }
8189
8190 if (err < 0)
8191 goto reply;
8192
8193 if (err == 0 && !_have_pending_crush()) {
8194 ss << action << " item id " << osdid << " name '" << name << "' weight "
8195 << weight << " at location " << loc << ": no change";
8196 goto reply;
8197 }
8198
8199 pending_inc.crush.clear();
8200 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8201 ss << action << " item id " << osdid << " name '" << name << "' weight "
8202 << weight << " at location " << loc << " to crush map";
8203 getline(ss, rs);
8204 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8205 get_last_committed() + 1));
8206 return true;
8207
8208 } else if (prefix == "osd crush create-or-move") {
8209 do {
8210 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
8211 if (!osdmap.exists(osdid)) {
8212 err = -ENOENT;
8213 ss << name << " does not exist. create it before updating the crush map";
8214 goto reply;
8215 }
8216
8217 double weight;
8218 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
8219 ss << "unable to parse weight value '"
8220 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8221 err = -EINVAL;
8222 goto reply;
8223 }
8224
8225 string args;
8226 vector<string> argvec;
8227 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8228 map<string,string> loc;
8229 CrushWrapper::parse_loc_map(argvec, &loc);
8230
8231 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
8232 << " at location " << loc << dendl;
8233
8234 CrushWrapper newcrush;
8235 _get_pending_crush(newcrush);
8236
8237 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
8238 if (err == 0) {
8239 ss << "create-or-move updated item name '" << name << "' weight " << weight
8240 << " at location " << loc << " to crush map";
8241 break;
8242 }
8243 if (err > 0) {
8244 pending_inc.crush.clear();
8245 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8246 ss << "create-or-move updating item name '" << name << "' weight " << weight
8247 << " at location " << loc << " to crush map";
8248 getline(ss, rs);
8249 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8250 get_last_committed() + 1));
8251 return true;
8252 }
8253 } while (false);
8254
8255 } else if (prefix == "osd crush move") {
8256 do {
8257 // osd crush move <name> <loc1> [<loc2> ...]
8258
8259 string args;
8260 vector<string> argvec;
8261 cmd_getval(g_ceph_context, cmdmap, "name", name);
8262 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8263 map<string,string> loc;
8264 CrushWrapper::parse_loc_map(argvec, &loc);
8265
8266 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
8267 CrushWrapper newcrush;
8268 _get_pending_crush(newcrush);
8269
8270 if (!newcrush.name_exists(name)) {
8271 err = -ENOENT;
8272 ss << "item " << name << " does not exist";
8273 break;
8274 }
8275 int id = newcrush.get_item_id(name);
8276
8277 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8278 if (id >= 0) {
8279 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
8280 } else {
8281 err = newcrush.move_bucket(g_ceph_context, id, loc);
8282 }
8283 if (err >= 0) {
8284 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8285 pending_inc.crush.clear();
8286 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8287 getline(ss, rs);
8288 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8289 get_last_committed() + 1));
8290 return true;
8291 }
8292 } else {
8293 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8294 err = 0;
8295 }
8296 } while (false);
8297 } else if (prefix == "osd crush swap-bucket") {
8298 string source, dest, force;
8299 cmd_getval(g_ceph_context, cmdmap, "source", source);
8300 cmd_getval(g_ceph_context, cmdmap, "dest", dest);
8301 cmd_getval(g_ceph_context, cmdmap, "force", force);
8302 CrushWrapper newcrush;
8303 _get_pending_crush(newcrush);
8304 if (!newcrush.name_exists(source)) {
8305 ss << "source item " << source << " does not exist";
8306 err = -ENOENT;
8307 goto reply;
8308 }
8309 if (!newcrush.name_exists(dest)) {
8310 ss << "dest item " << dest << " does not exist";
8311 err = -ENOENT;
8312 goto reply;
8313 }
8314 int sid = newcrush.get_item_id(source);
8315 int did = newcrush.get_item_id(dest);
8316 int sparent;
8317 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
8318 force != "--yes-i-really-mean-it") {
8319 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8320 err = -EPERM;
8321 goto reply;
8322 }
8323 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
8324 force != "--yes-i-really-mean-it") {
8325 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
8326 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
8327 << "; pass --yes-i-really-mean-it to proceed anyway";
8328 err = -EPERM;
8329 goto reply;
8330 }
8331 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
8332 if (r < 0) {
8333 ss << "failed to swap bucket contents: " << cpp_strerror(r);
8334 err = r;
8335 goto reply;
8336 }
8337 ss << "swapped bucket of " << source << " to " << dest;
8338 pending_inc.crush.clear();
8339 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8340 wait_for_finished_proposal(op,
8341 new Monitor::C_Command(mon, op, err, ss.str(),
8342 get_last_committed() + 1));
8343 return true;
8344 } else if (prefix == "osd crush link") {
8345 // osd crush link <name> <loc1> [<loc2> ...]
8346 string name;
8347 cmd_getval(g_ceph_context, cmdmap, "name", name);
8348 vector<string> argvec;
8349 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8350 map<string,string> loc;
8351 CrushWrapper::parse_loc_map(argvec, &loc);
8352
8353 // Need an explicit check for name_exists because get_item_id returns
8354 // 0 on unfound.
8355 int id = osdmap.crush->get_item_id(name);
8356 if (!osdmap.crush->name_exists(name)) {
8357 err = -ENOENT;
8358 ss << "item " << name << " does not exist";
8359 goto reply;
8360 } else {
8361 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
8362 }
8363 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
8364 ss << "no need to move item id " << id << " name '" << name
8365 << "' to location " << loc << " in crush map";
8366 err = 0;
8367 goto reply;
8368 }
8369
8370 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
8371 CrushWrapper newcrush;
8372 _get_pending_crush(newcrush);
8373
8374 if (!newcrush.name_exists(name)) {
8375 err = -ENOENT;
8376 ss << "item " << name << " does not exist";
8377 goto reply;
8378 } else {
8379 int id = newcrush.get_item_id(name);
8380 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8381 err = newcrush.link_bucket(g_ceph_context, id, loc);
8382 if (err >= 0) {
8383 ss << "linked item id " << id << " name '" << name
8384 << "' to location " << loc << " in crush map";
8385 pending_inc.crush.clear();
8386 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8387 } else {
8388 ss << "cannot link item id " << id << " name '" << name
8389 << "' to location " << loc;
8390 goto reply;
8391 }
8392 } else {
8393 ss << "no need to move item id " << id << " name '" << name
8394 << "' to location " << loc << " in crush map";
8395 err = 0;
8396 }
8397 }
8398 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
8399 get_last_committed() + 1));
8400 return true;
8401 } else if (prefix == "osd crush rm" ||
8402 prefix == "osd crush remove" ||
8403 prefix == "osd crush unlink") {
8404 do {
8405 // osd crush rm <id> [ancestor]
8406 CrushWrapper newcrush;
8407 _get_pending_crush(newcrush);
8408
8409 string name;
8410 cmd_getval(g_ceph_context, cmdmap, "name", name);
8411
8412 if (!osdmap.crush->name_exists(name)) {
8413 err = 0;
8414 ss << "device '" << name << "' does not appear in the crush map";
8415 break;
8416 }
8417 if (!newcrush.name_exists(name)) {
8418 err = 0;
8419 ss << "device '" << name << "' does not appear in the crush map";
8420 getline(ss, rs);
8421 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8422 get_last_committed() + 1));
8423 return true;
8424 }
8425 int id = newcrush.get_item_id(name);
8426 int ancestor = 0;
8427
8428 bool unlink_only = prefix == "osd crush unlink";
8429 string ancestor_str;
8430 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
8431 if (!newcrush.name_exists(ancestor_str)) {
8432 err = -ENOENT;
8433 ss << "ancestor item '" << ancestor_str
8434 << "' does not appear in the crush map";
8435 break;
8436 }
8437 ancestor = newcrush.get_item_id(ancestor_str);
8438 }
8439
8440 err = prepare_command_osd_crush_remove(
8441 newcrush,
8442 id, ancestor,
8443 (ancestor < 0), unlink_only);
8444
8445 if (err == -ENOENT) {
8446 ss << "item " << id << " does not appear in that position";
8447 err = 0;
8448 break;
8449 }
8450 if (err == 0) {
8451 ss << "removed item id " << id << " name '" << name << "' from crush map";
8452 getline(ss, rs);
8453 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8454 get_last_committed() + 1));
8455 return true;
8456 }
8457 } while (false);
8458
8459 } else if (prefix == "osd crush reweight-all") {
8460 CrushWrapper newcrush;
8461 _get_pending_crush(newcrush);
8462
8463 newcrush.reweight(g_ceph_context);
8464 pending_inc.crush.clear();
8465 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8466 ss << "reweighted crush hierarchy";
8467 getline(ss, rs);
8468 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8469 get_last_committed() + 1));
8470 return true;
8471 } else if (prefix == "osd crush reweight") {
8472 // osd crush reweight <name> <weight>
8473 CrushWrapper newcrush;
8474 _get_pending_crush(newcrush);
8475
8476 string name;
8477 cmd_getval(g_ceph_context, cmdmap, "name", name);
8478 if (!newcrush.name_exists(name)) {
8479 err = -ENOENT;
8480 ss << "device '" << name << "' does not appear in the crush map";
8481 goto reply;
8482 }
8483
8484 int id = newcrush.get_item_id(name);
8485 if (id < 0) {
8486 ss << "device '" << name << "' is not a leaf in the crush map";
8487 err = -EINVAL;
8488 goto reply;
8489 }
8490 double w;
8491 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8492 ss << "unable to parse weight value '"
8493 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8494 err = -EINVAL;
8495 goto reply;
8496 }
8497
8498 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
8499 if (err < 0)
8500 goto reply;
8501 pending_inc.crush.clear();
8502 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8503 ss << "reweighted item id " << id << " name '" << name << "' to " << w
8504 << " in crush map";
8505 getline(ss, rs);
8506 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8507 get_last_committed() + 1));
8508 return true;
8509 } else if (prefix == "osd crush reweight-subtree") {
8510 // osd crush reweight <name> <weight>
8511 CrushWrapper newcrush;
8512 _get_pending_crush(newcrush);
8513
8514 string name;
8515 cmd_getval(g_ceph_context, cmdmap, "name", name);
8516 if (!newcrush.name_exists(name)) {
8517 err = -ENOENT;
8518 ss << "device '" << name << "' does not appear in the crush map";
8519 goto reply;
8520 }
8521
8522 int id = newcrush.get_item_id(name);
8523 if (id >= 0) {
8524 ss << "device '" << name << "' is not a subtree in the crush map";
8525 err = -EINVAL;
8526 goto reply;
8527 }
8528 double w;
8529 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8530 ss << "unable to parse weight value '"
8531 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8532 err = -EINVAL;
8533 goto reply;
8534 }
8535
8536 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
8537 if (err < 0)
8538 goto reply;
8539 pending_inc.crush.clear();
8540 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8541 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
8542 << " in crush map";
8543 getline(ss, rs);
8544 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8545 get_last_committed() + 1));
8546 return true;
8547 } else if (prefix == "osd crush tunables") {
8548 CrushWrapper newcrush;
8549 _get_pending_crush(newcrush);
8550
8551 err = 0;
8552 string profile;
8553 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8554 if (profile == "legacy" || profile == "argonaut") {
8555 newcrush.set_tunables_legacy();
8556 } else if (profile == "bobtail") {
8557 newcrush.set_tunables_bobtail();
8558 } else if (profile == "firefly") {
8559 newcrush.set_tunables_firefly();
8560 } else if (profile == "hammer") {
8561 newcrush.set_tunables_hammer();
8562 } else if (profile == "jewel") {
8563 newcrush.set_tunables_jewel();
8564 } else if (profile == "optimal") {
8565 newcrush.set_tunables_optimal();
8566 } else if (profile == "default") {
8567 newcrush.set_tunables_default();
8568 } else {
8569 ss << "unrecognized profile '" << profile << "'";
8570 err = -EINVAL;
8571 goto reply;
8572 }
8573
8574 if (!validate_crush_against_features(&newcrush, ss)) {
8575 err = -EINVAL;
8576 goto reply;
8577 }
8578
8579 pending_inc.crush.clear();
8580 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8581 ss << "adjusted tunables profile to " << profile;
8582 getline(ss, rs);
8583 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8584 get_last_committed() + 1));
8585 return true;
8586 } else if (prefix == "osd crush set-tunable") {
8587 CrushWrapper newcrush;
8588 _get_pending_crush(newcrush);
8589
8590 err = 0;
8591 string tunable;
8592 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
8593
8594 int64_t value = -1;
8595 if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
8596 err = -EINVAL;
8597 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
8598 goto reply;
8599 }
8600
8601 if (tunable == "straw_calc_version") {
8602 if (value != 0 && value != 1) {
8603 ss << "value must be 0 or 1; got " << value;
8604 err = -EINVAL;
8605 goto reply;
8606 }
8607 newcrush.set_straw_calc_version(value);
8608 } else {
8609 ss << "unrecognized tunable '" << tunable << "'";
8610 err = -EINVAL;
8611 goto reply;
8612 }
8613
8614 if (!validate_crush_against_features(&newcrush, ss)) {
8615 err = -EINVAL;
8616 goto reply;
8617 }
8618
8619 pending_inc.crush.clear();
8620 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8621 ss << "adjusted tunable " << tunable << " to " << value;
8622 getline(ss, rs);
8623 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8624 get_last_committed() + 1));
8625 return true;
8626
8627 } else if (prefix == "osd crush rule create-simple") {
8628 string name, root, type, mode;
8629 cmd_getval(g_ceph_context, cmdmap, "name", name);
8630 cmd_getval(g_ceph_context, cmdmap, "root", root);
8631 cmd_getval(g_ceph_context, cmdmap, "type", type);
8632 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8633 if (mode == "")
8634 mode = "firstn";
8635
8636 if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated with a ruleid and the rule it
      // contains. From the user's point of view, the rule name is the
      // more meaningful identifier.
8639 ss << "rule " << name << " already exists";
8640 err = 0;
8641 goto reply;
8642 }
8643
8644 CrushWrapper newcrush;
8645 _get_pending_crush(newcrush);
8646
8647 if (newcrush.rule_exists(name)) {
      // The name is uniquely associated with a ruleid and the rule it
      // contains. From the user's point of view, the rule name is the
      // more meaningful identifier.
8650 ss << "rule " << name << " already exists";
8651 err = 0;
8652 } else {
8653 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
8654 pg_pool_t::TYPE_REPLICATED, &ss);
8655 if (ruleno < 0) {
8656 err = ruleno;
8657 goto reply;
8658 }
8659
8660 pending_inc.crush.clear();
8661 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8662 }
8663 getline(ss, rs);
8664 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8665 get_last_committed() + 1));
8666 return true;
8667
8668 } else if (prefix == "osd crush rule create-replicated") {
8669 string name, root, type, device_class;
8670 cmd_getval(g_ceph_context, cmdmap, "name", name);
8671 cmd_getval(g_ceph_context, cmdmap, "root", root);
8672 cmd_getval(g_ceph_context, cmdmap, "type", type);
8673 cmd_getval(g_ceph_context, cmdmap, "class", device_class);
8674
8675 if (!device_class.empty()) {
8676 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8677 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8678 << "luminous' before using crush device classes";
8679 err = -EPERM;
8680 goto reply;
8681 }
8682 }
8683
8684 if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated with a ruleid and the rule it
      // contains. From the user's point of view, the rule name is the
      // more meaningful identifier.
8687 ss << "rule " << name << " already exists";
8688 err = 0;
8689 goto reply;
8690 }
8691
8692 CrushWrapper newcrush;
8693 _get_pending_crush(newcrush);
8694
8695 if (newcrush.rule_exists(name)) {
      // The name is uniquely associated with a ruleid and the rule it
      // contains. From the user's point of view, the rule name is the
      // more meaningful identifier.
8698 ss << "rule " << name << " already exists";
8699 err = 0;
8700 } else {
8701 int ruleno = newcrush.add_simple_rule(
8702 name, root, type, device_class,
8703 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
8704 if (ruleno < 0) {
8705 err = ruleno;
8706 goto reply;
8707 }
8708
8709 pending_inc.crush.clear();
8710 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8711 }
8712 getline(ss, rs);
8713 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8714 get_last_committed() + 1));
8715 return true;
8716
8717 } else if (prefix == "osd erasure-code-profile rm") {
8718 string name;
8719 cmd_getval(g_ceph_context, cmdmap, "name", name);
8720
8721 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
8722 goto wait;
8723
8724 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
8725 err = -EBUSY;
8726 goto reply;
8727 }
8728
8729 if (osdmap.has_erasure_code_profile(name) ||
8730 pending_inc.new_erasure_code_profiles.count(name)) {
8731 if (osdmap.has_erasure_code_profile(name)) {
8732 pending_inc.old_erasure_code_profiles.push_back(name);
8733 } else {
8734 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
8735 pending_inc.new_erasure_code_profiles.erase(name);
8736 }
8737
8738 getline(ss, rs);
8739 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8740 get_last_committed() + 1));
8741 return true;
8742 } else {
8743 ss << "erasure-code-profile " << name << " does not exist";
8744 err = 0;
8745 goto reply;
8746 }
8747
8748 } else if (prefix == "osd erasure-code-profile set") {
8749 string name;
8750 cmd_getval(g_ceph_context, cmdmap, "name", name);
8751 vector<string> profile;
8752 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8753 bool force;
8754 if (profile.size() > 0 && profile.back() == "--force") {
8755 profile.pop_back();
8756 force = true;
8757 } else {
8758 force = false;
8759 }
8760 map<string,string> profile_map;
8761 err = parse_erasure_code_profile(profile, &profile_map, &ss);
8762 if (err)
8763 goto reply;
8764 if (profile_map.find("plugin") == profile_map.end()) {
8765 ss << "erasure-code-profile " << profile_map
8766 << " must contain a plugin entry" << std::endl;
8767 err = -EINVAL;
8768 goto reply;
8769 }
8770 string plugin = profile_map["plugin"];
8771
8772 if (pending_inc.has_erasure_code_profile(name)) {
8773 dout(20) << "erasure code profile " << name << " try again" << dendl;
8774 goto wait;
8775 } else {
8776 if (plugin == "isa" || plugin == "lrc") {
8777 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
8778 if (err == -EAGAIN)
8779 goto wait;
8780 if (err)
8781 goto reply;
8782 } else if (plugin == "shec") {
8783 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
8784 if (err == -EAGAIN)
8785 goto wait;
8786 if (err)
8787 goto reply;
8788 }
8789 err = normalize_profile(name, profile_map, force, &ss);
8790 if (err)
8791 goto reply;
8792
8793 if (osdmap.has_erasure_code_profile(name)) {
8794 ErasureCodeProfile existing_profile_map =
8795 osdmap.get_erasure_code_profile(name);
8796 err = normalize_profile(name, existing_profile_map, force, &ss);
8797 if (err)
8798 goto reply;
8799
8800 if (existing_profile_map == profile_map) {
8801 err = 0;
8802 goto reply;
8803 }
8804 if (!force) {
8805 err = -EPERM;
8806 ss << "will not override erasure code profile " << name
8807 << " because the existing profile "
8808 << existing_profile_map
8809 << " is different from the proposed profile "
8810 << profile_map;
8811 goto reply;
8812 }
8813 }
8814
8815 dout(20) << "erasure code profile set " << name << "="
8816 << profile_map << dendl;
8817 pending_inc.set_erasure_code_profile(name, profile_map);
8818 }
8819
8820 getline(ss, rs);
8821 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8822 get_last_committed() + 1));
8823 return true;
8824
8825 } else if (prefix == "osd crush rule create-erasure") {
8826 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
8827 if (err == -EAGAIN)
8828 goto wait;
8829 if (err)
8830 goto reply;
8831 string name, poolstr;
8832 cmd_getval(g_ceph_context, cmdmap, "name", name);
8833 string profile;
8834 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8835 if (profile == "")
8836 profile = "default";
8837 if (profile == "default") {
8838 if (!osdmap.has_erasure_code_profile(profile)) {
8839 if (pending_inc.has_erasure_code_profile(profile)) {
8840 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
8841 goto wait;
8842 }
8843
8844 map<string,string> profile_map;
8845 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
8846 profile_map,
8847 &ss);
8848 if (err)
8849 goto reply;
8850 err = normalize_profile(name, profile_map, true, &ss);
8851 if (err)
8852 goto reply;
8853 dout(20) << "erasure code profile set " << profile << "="
8854 << profile_map << dendl;
8855 pending_inc.set_erasure_code_profile(profile, profile_map);
8856 goto wait;
8857 }
8858 }
8859
8860 int rule;
8861 err = crush_rule_create_erasure(name, profile, &rule, &ss);
8862 if (err < 0) {
8863 switch(err) {
8864 case -EEXIST: // return immediately
8865 ss << "rule " << name << " already exists";
8866 err = 0;
8867 goto reply;
8868 break;
8869 case -EALREADY: // wait for pending to be proposed
8870 ss << "rule " << name << " already exists";
8871 err = 0;
8872 break;
8873 default: // non recoverable error
8874 goto reply;
8875 break;
8876 }
8877 } else {
8878 ss << "created rule " << name << " at " << rule;
8879 }
8880
8881 getline(ss, rs);
8882 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8883 get_last_committed() + 1));
8884 return true;
8885
8886 } else if (prefix == "osd crush rule rm") {
8887 string name;
8888 cmd_getval(g_ceph_context, cmdmap, "name", name);
8889
8890 if (!osdmap.crush->rule_exists(name)) {
8891 ss << "rule " << name << " does not exist";
8892 err = 0;
8893 goto reply;
8894 }
8895
8896 CrushWrapper newcrush;
8897 _get_pending_crush(newcrush);
8898
8899 if (!newcrush.rule_exists(name)) {
8900 ss << "rule " << name << " does not exist";
8901 err = 0;
8902 } else {
8903 int ruleno = newcrush.get_rule_id(name);
8904 assert(ruleno >= 0);
8905
8906 // make sure it is not in use.
8907 // FIXME: this is ok in some situations, but let's not bother with that
8908 // complexity now.
8909 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
8910 if (osdmap.crush_rule_in_use(ruleset)) {
8911 ss << "crush ruleset " << name << " " << ruleset << " is in use";
8912 err = -EBUSY;
8913 goto reply;
8914 }
8915
8916 err = newcrush.remove_rule(ruleno);
8917 if (err < 0) {
8918 goto reply;
8919 }
8920
8921 pending_inc.crush.clear();
8922 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8923 }
8924 getline(ss, rs);
8925 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8926 get_last_committed() + 1));
8927 return true;
8928
8929 } else if (prefix == "osd crush rule rename") {
8930 string srcname;
8931 string dstname;
8932 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
8933 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
8934 if (srcname.empty() || dstname.empty()) {
8935 ss << "must specify both source rule name and destination rule name";
8936 err = -EINVAL;
8937 goto reply;
8938 }
8939 if (srcname == dstname) {
8940 ss << "destination rule name is equal to source rule name";
8941 err = 0;
8942 goto reply;
8943 }
8944
8945 CrushWrapper newcrush;
8946 _get_pending_crush(newcrush);
8947 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
8948 // srcname does not exist and dstname already exists
8949 // suppose this is a replay and return success
8950 // (so this command is idempotent)
8951 ss << "already renamed to '" << dstname << "'";
8952 err = 0;
8953 goto reply;
8954 }
8955
8956 err = newcrush.rename_rule(srcname, dstname, &ss);
8957 if (err < 0) {
8958 // ss has reason for failure
8959 goto reply;
8960 }
8961 pending_inc.crush.clear();
8962 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8963 getline(ss, rs);
8964 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8965 get_last_committed() + 1));
8966 return true;
8967
8968 } else if (prefix == "osd setmaxosd") {
8969 int64_t newmax;
8970 if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
8971 ss << "unable to parse 'newmax' value '"
8972 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
8973 err = -EINVAL;
8974 goto reply;
8975 }
8976
8977 if (newmax > g_conf->mon_max_osd) {
8978 err = -ERANGE;
8979 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
8980 << g_conf->mon_max_osd << ")";
8981 goto reply;
8982 }
8983
8984 // Don't allow shrinking OSD number as this will cause data loss
8985 // and may cause kernel crashes.
8986 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
8987 if (newmax < osdmap.get_max_osd()) {
8988 // Check if the OSDs exist between current max and new value.
8989 // If there are any OSDs exist, then don't allow shrinking number
8990 // of OSDs.
8991 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
8992 if (osdmap.exists(i)) {
8993 err = -EBUSY;
8994 ss << "cannot shrink max_osd to " << newmax
8995 << " because osd." << i << " (and possibly others) still in use";
8996 goto reply;
8997 }
8998 }
8999 }
9000
9001 pending_inc.new_max_osd = newmax;
9002 ss << "set new max_osd = " << pending_inc.new_max_osd;
9003 getline(ss, rs);
9004 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9005 get_last_committed() + 1));
9006 return true;
9007
9008 } else if (prefix == "osd set-full-ratio" ||
9009 prefix == "osd set-backfillfull-ratio" ||
9010 prefix == "osd set-nearfull-ratio") {
9011 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
9012 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9013 << "luminous' before using the new interface";
9014 err = -EPERM;
9015 goto reply;
9016 }
9017 double n;
9018 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
9019 ss << "unable to parse 'ratio' value '"
9020 << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
9021 err = -EINVAL;
9022 goto reply;
9023 }
9024 if (prefix == "osd set-full-ratio")
9025 pending_inc.new_full_ratio = n;
9026 else if (prefix == "osd set-backfillfull-ratio")
9027 pending_inc.new_backfillfull_ratio = n;
9028 else if (prefix == "osd set-nearfull-ratio")
9029 pending_inc.new_nearfull_ratio = n;
9030 ss << prefix << " " << n;
9031 getline(ss, rs);
9032 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9033 get_last_committed() + 1));
9034 return true;
9035 } else if (prefix == "osd set-require-min-compat-client") {
9036 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
9037 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9038 << "luminous' before using the new interface";
9039 err = -EPERM;
9040 goto reply;
9041 }
9042 string v;
9043 cmd_getval(g_ceph_context, cmdmap, "version", v);
9044 int vno = ceph_release_from_name(v.c_str());
9045 if (vno <= 0) {
9046 ss << "version " << v << " is not recognized";
9047 err = -EINVAL;
9048 goto reply;
9049 }
9050 OSDMap newmap;
9051 newmap.deepish_copy_from(osdmap);
9052 newmap.apply_incremental(pending_inc);
9053 newmap.require_min_compat_client = vno;
9054 auto mvno = newmap.get_min_compat_client();
9055 if (vno < mvno) {
9056 ss << "osdmap current utilizes features that require "
9057 << ceph_release_name(mvno)
9058 << "; cannot set require_min_compat_client below that to "
9059 << ceph_release_name(vno);
9060 err = -EPERM;
9061 goto reply;
9062 }
9063 string sure;
9064 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9065 if (sure != "--yes-i-really-mean-it") {
9066 FeatureMap m;
9067 mon->get_combined_feature_map(&m);
9068 uint64_t features = ceph_release_features(vno);
9069 bool first = true;
9070 bool ok = true;
9071 for (int type : {
9072 CEPH_ENTITY_TYPE_CLIENT,
9073 CEPH_ENTITY_TYPE_MDS,
9074 CEPH_ENTITY_TYPE_MGR }) {
9075 auto p = m.m.find(type);
9076 if (p == m.m.end()) {
9077 continue;
9078 }
9079 for (auto& q : p->second) {
9080 uint64_t missing = ~q.first & features;
9081 if (missing) {
9082 if (first) {
9083 ss << "cannot set require_min_compat_client to " << v << ": ";
9084 } else {
9085 ss << "; ";
9086 }
9087 first = false;
9088 ss << q.second << " connected " << ceph_entity_type_name(type)
9089 << "(s) look like " << ceph_release_name(
9090 ceph_release_from_features(q.first))
9091 << " (missing 0x" << std::hex << missing << std::dec << ")";
9092 ok = false;
9093 }
9094 }
9095 }
9096 if (!ok) {
9097 ss << "; add --yes-i-really-mean-it to do it anyway";
9098 err = -EPERM;
9099 goto reply;
9100 }
9101 }
9102 ss << "set require_min_compat_client to " << ceph_release_name(vno);
9103 pending_inc.new_require_min_compat_client = vno;
9104 getline(ss, rs);
9105 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9106 get_last_committed() + 1));
9107 return true;
9108 } else if (prefix == "osd pause") {
9109 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9110
9111 } else if (prefix == "osd unpause") {
9112 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9113
9114 } else if (prefix == "osd set") {
9115 string sure;
9116 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9117 string key;
9118 cmd_getval(g_ceph_context, cmdmap, "key", key);
9119 if (key == "full")
9120 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
9121 else if (key == "pause")
9122 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9123 else if (key == "noup")
9124 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
9125 else if (key == "nodown")
9126 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
9127 else if (key == "noout")
9128 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
9129 else if (key == "noin")
9130 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
9131 else if (key == "nobackfill")
9132 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
9133 else if (key == "norebalance")
9134 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
9135 else if (key == "norecover")
9136 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
9137 else if (key == "noscrub")
9138 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
9139 else if (key == "nodeep-scrub")
9140 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9141 else if (key == "notieragent")
9142 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
9143 else if (key == "sortbitwise") {
9144 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9145 ss << "Not advisable to continue since no OSDs are up. Pass "
9146 << "--yes-i-really-mean-it if you really wish to continue.";
9147 err = -EPERM;
9148 goto reply;
9149 }
9150 if ((osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)
9151 || sure == "--yes-i-really-mean-it") {
9152 return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
9153 } else {
9154 ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
9155 err = -EPERM;
9156 goto reply;
9157 }
9158 } else if (key == "recovery_deletes") {
9159 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9160 ss << "Not advisable to continue since no OSDs are up. Pass "
9161 << "--yes-i-really-mean-it if you really wish to continue.";
9162 err = -EPERM;
9163 goto reply;
9164 }
9165 if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)
9166 || sure == "--yes-i-really-mean-it") {
9167 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9168 } else {
9169 ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
9170 err = -EPERM;
9171 goto reply;
9172 }
9173 } else if (key == "require_jewel_osds") {
9174 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9175 ss << "Not advisable to continue since no OSDs are up. Pass "
9176 << "--yes-i-really-mean-it if you really wish to continue.";
9177 err = -EPERM;
9178 goto reply;
9179 }
9180 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9181 ss << "the sortbitwise flag must be set before require_jewel_osds";
9182 err = -EPERM;
9183 goto reply;
9184 } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
9185 ss << "require_osd_release is already >= jewel";
9186 err = 0;
9187 goto reply;
9188 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)
9189 || sure == "--yes-i-really-mean-it") {
9190 return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
9191 } else {
9192 ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
9193 err = -EPERM;
9194 }
9195 } else if (key == "require_kraken_osds") {
9196 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9197 ss << "Not advisable to continue since no OSDs are up. Pass "
9198 << "--yes-i-really-mean-it if you really wish to continue.";
9199 err = -EPERM;
9200 goto reply;
9201 }
9202 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9203 ss << "the sortbitwise flag must be set before require_kraken_osds";
9204 err = -EPERM;
9205 goto reply;
9206 } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
9207 ss << "require_osd_release is already >= kraken";
9208 err = 0;
9209 goto reply;
9210 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)
9211 || sure == "--yes-i-really-mean-it") {
9212 bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
9213 // ensure JEWEL is also set
9214 pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
9215 return r;
9216 } else {
9217 ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
9218 err = -EPERM;
9219 }
9220 } else {
9221 ss << "unrecognized flag '" << key << "'";
9222 err = -EINVAL;
9223 }
9224
9225 } else if (prefix == "osd unset") {
9226 string key;
9227 cmd_getval(g_ceph_context, cmdmap, "key", key);
9228 if (key == "full")
9229 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
9230 else if (key == "pause")
9231 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9232 else if (key == "noup")
9233 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
9234 else if (key == "nodown")
9235 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
9236 else if (key == "noout")
9237 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
9238 else if (key == "noin")
9239 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
9240 else if (key == "nobackfill")
9241 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
9242 else if (key == "norebalance")
9243 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
9244 else if (key == "norecover")
9245 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
9246 else if (key == "noscrub")
9247 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
9248 else if (key == "nodeep-scrub")
9249 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9250 else if (key == "notieragent")
9251 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
9252 else {
9253 ss << "unrecognized flag '" << key << "'";
9254 err = -EINVAL;
9255 }
9256
9257 } else if (prefix == "osd require-osd-release") {
9258 string release;
9259 cmd_getval(g_ceph_context, cmdmap, "release", release);
9260 string sure;
9261 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9262 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9263 ss << "the sortbitwise flag must be set first";
9264 err = -EPERM;
9265 goto reply;
9266 }
9267 int rel = ceph_release_from_name(release.c_str());
9268 if (rel <= 0) {
9269 ss << "unrecognized release " << release;
9270 err = -EINVAL;
9271 goto reply;
9272 }
9273 if (rel < CEPH_RELEASE_LUMINOUS) {
9274 ss << "use this command only for luminous and later";
9275 err = -EINVAL;
9276 goto reply;
9277 }
9278 if (rel == osdmap.require_osd_release) {
9279 // idempotent
9280 err = 0;
9281 goto reply;
9282 }
9283 if (rel == CEPH_RELEASE_LUMINOUS) {
9284 if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
9285 ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
9286 err = -EPERM;
9287 goto reply;
9288 }
9289 } else {
9290 ss << "not supported for this release yet";
9291 err = -EPERM;
9292 goto reply;
9293 }
9294 if (rel < osdmap.require_osd_release) {
9295 ss << "require_osd_release cannot be lowered once it has been set";
9296 err = -EPERM;
9297 goto reply;
9298 }
9299 pending_inc.new_require_osd_release = rel;
9300 if (rel >= CEPH_RELEASE_LUMINOUS &&
9301 !osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
9302 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9303 }
9304 goto update;
9305 } else if (prefix == "osd cluster_snap") {
9306 // ** DISABLE THIS FOR NOW **
9307 ss << "cluster snapshot currently disabled (broken implementation)";
9308 // ** DISABLE THIS FOR NOW **
9309
9310 } else if (prefix == "osd down" ||
9311 prefix == "osd out" ||
9312 prefix == "osd in" ||
9313 prefix == "osd rm") {
9314
9315 bool any = false;
9316 bool stop = false;
9317 bool verbose = true;
9318
9319 vector<string> idvec;
9320 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9321 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9322 set<int> osds;
9323
9324 // wildcard?
9325 if (j == 0 &&
9326 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9327 if (prefix == "osd in") {
9328 // touch out osds only
9329 osdmap.get_out_osds(osds);
9330 } else {
9331 osdmap.get_all_osds(osds);
9332 }
9333 stop = true;
9334 verbose = false; // so the output is less noisy.
9335 } else {
9336 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9337 if (osd < 0) {
9338 ss << "invalid osd id" << osd;
9339 err = -EINVAL;
9340 continue;
9341 } else if (!osdmap.exists(osd)) {
9342 ss << "osd." << osd << " does not exist. ";
9343 continue;
9344 }
9345
9346 osds.insert(osd);
9347 }
9348
9349 for (auto &osd : osds) {
9350 if (prefix == "osd down") {
9351 if (osdmap.is_down(osd)) {
9352 if (verbose)
9353 ss << "osd." << osd << " is already down. ";
9354 } else {
9355 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
9356 ss << "marked down osd." << osd << ". ";
9357 any = true;
9358 }
9359 } else if (prefix == "osd out") {
9360 if (osdmap.is_out(osd)) {
9361 if (verbose)
9362 ss << "osd." << osd << " is already out. ";
9363 } else {
9364 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
9365 if (osdmap.osd_weight[osd]) {
9366 if (pending_inc.new_xinfo.count(osd) == 0) {
9367 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9368 }
9369 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
9370 }
9371 ss << "marked out osd." << osd << ". ";
9372 std::ostringstream msg;
9373 msg << "Client " << op->get_session()->entity_name
9374 << " marked osd." << osd << " out";
9375 if (osdmap.is_up(osd)) {
9376 msg << ", while it was still marked up";
9377 } else {
9378 auto period = ceph_clock_now() - down_pending_out[osd];
9379 msg << ", after it was down for " << int(period.sec())
9380 << " seconds";
9381 }
9382
9383 mon->clog->info() << msg.str();
9384 any = true;
9385 }
9386 } else if (prefix == "osd in") {
9387 if (osdmap.is_in(osd)) {
9388 if (verbose)
9389 ss << "osd." << osd << " is already in. ";
9390 } else {
9391 if (osdmap.osd_xinfo[osd].old_weight > 0) {
9392 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
9393 if (pending_inc.new_xinfo.count(osd) == 0) {
9394 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9395 }
9396 pending_inc.new_xinfo[osd].old_weight = 0;
9397 } else {
9398 pending_inc.new_weight[osd] = CEPH_OSD_IN;
9399 }
9400 ss << "marked in osd." << osd << ". ";
9401 any = true;
9402 }
9403 } else if (prefix == "osd rm") {
9404 err = prepare_command_osd_remove(osd);
9405
9406 if (err == -EBUSY) {
9407 if (any)
9408 ss << ", ";
9409 ss << "osd." << osd << " is still up; must be down before removal. ";
9410 } else {
9411 assert(err == 0);
9412 if (any) {
9413 ss << ", osd." << osd;
9414 } else {
9415 ss << "removed osd." << osd;
9416 }
9417 any = true;
9418 }
9419 }
9420 }
9421 }
9422 if (any) {
9423 getline(ss, rs);
9424 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9425 get_last_committed() + 1));
9426 return true;
9427 }
9428 } else if (prefix == "osd add-noup" ||
9429 prefix == "osd add-nodown" ||
9430 prefix == "osd add-noin" ||
9431 prefix == "osd add-noout") {
9432
9433 enum {
9434 OP_NOUP,
9435 OP_NODOWN,
9436 OP_NOIN,
9437 OP_NOOUT,
9438 } option;
9439
9440 if (prefix == "osd add-noup") {
9441 option = OP_NOUP;
9442 } else if (prefix == "osd add-nodown") {
9443 option = OP_NODOWN;
9444 } else if (prefix == "osd add-noin") {
9445 option = OP_NOIN;
9446 } else {
9447 option = OP_NOOUT;
9448 }
9449
9450 bool any = false;
9451 bool stop = false;
9452
9453 vector<string> idvec;
9454 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9455 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9456
9457 set<int> osds;
9458
9459 // wildcard?
9460 if (j == 0 &&
9461 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9462 osdmap.get_all_osds(osds);
9463 stop = true;
9464 } else {
9465 // try traditional single osd way
9466
9467 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9468 if (osd < 0) {
9469 // ss has reason for failure
9470 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9471 err = -EINVAL;
9472 continue;
9473 }
9474
9475 osds.insert(osd);
9476 }
9477
9478 for (auto &osd : osds) {
9479
9480 if (!osdmap.exists(osd)) {
9481 ss << "osd." << osd << " does not exist. ";
9482 continue;
9483 }
9484
9485 switch (option) {
9486 case OP_NOUP:
9487 if (osdmap.is_up(osd)) {
9488 ss << "osd." << osd << " is already up. ";
9489 continue;
9490 }
9491
9492 if (osdmap.is_noup(osd)) {
9493 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
9494 any = true;
9495 } else {
9496 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9497 any = true;
9498 }
9499
9500 break;
9501
9502 case OP_NODOWN:
9503 if (osdmap.is_down(osd)) {
9504 ss << "osd." << osd << " is already down. ";
9505 continue;
9506 }
9507
9508 if (osdmap.is_nodown(osd)) {
9509 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
9510 any = true;
9511 } else {
9512 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9513 any = true;
9514 }
9515
9516 break;
9517
9518 case OP_NOIN:
9519 if (osdmap.is_in(osd)) {
9520 ss << "osd." << osd << " is already in. ";
9521 continue;
9522 }
9523
9524 if (osdmap.is_noin(osd)) {
9525 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
9526 any = true;
9527 } else {
9528 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9529 any = true;
9530 }
9531
9532 break;
9533
9534 case OP_NOOUT:
9535 if (osdmap.is_out(osd)) {
9536 ss << "osd." << osd << " is already out. ";
9537 continue;
9538 }
9539
9540 if (osdmap.is_noout(osd)) {
9541 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
9542 any = true;
9543 } else {
9544 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9545 any = true;
9546 }
9547
9548 break;
9549
9550 default:
9551 assert(0 == "invalid option");
9552 }
9553 }
9554 }
9555
9556 if (any) {
9557 getline(ss, rs);
9558 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9559 get_last_committed() + 1));
9560 return true;
9561 }
9562 } else if (prefix == "osd rm-noup" ||
9563 prefix == "osd rm-nodown" ||
9564 prefix == "osd rm-noin" ||
9565 prefix == "osd rm-noout") {
9566
9567 enum {
9568 OP_NOUP,
9569 OP_NODOWN,
9570 OP_NOIN,
9571 OP_NOOUT,
9572 } option;
9573
9574 if (prefix == "osd rm-noup") {
9575 option = OP_NOUP;
9576 } else if (prefix == "osd rm-nodown") {
9577 option = OP_NODOWN;
9578 } else if (prefix == "osd rm-noin") {
9579 option = OP_NOIN;
9580 } else {
9581 option = OP_NOOUT;
9582 }
9583
9584 bool any = false;
9585 bool stop = false;
9586
9587 vector<string> idvec;
9588 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9589
9590 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9591
9592 vector<int> osds;
9593
9594 // wildcard?
9595 if (j == 0 &&
9596 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9597
9598 // touch previous noup/nodown/noin/noout osds only
9599 switch (option) {
9600 case OP_NOUP:
9601 osdmap.get_noup_osds(&osds);
9602 break;
9603 case OP_NODOWN:
9604 osdmap.get_nodown_osds(&osds);
9605 break;
9606 case OP_NOIN:
9607 osdmap.get_noin_osds(&osds);
9608 break;
9609 case OP_NOOUT:
9610 osdmap.get_noout_osds(&osds);
9611 break;
9612 default:
9613 assert(0 == "invalid option");
9614 }
9615
9616 // cancel any pending noup/nodown/noin/noout requests too
9617 vector<int> pending_state_osds;
9618 (void) pending_inc.get_pending_state_osds(&pending_state_osds);
9619 for (auto &p : pending_state_osds) {
9620
9621 switch (option) {
9622 case OP_NOUP:
9623 if (!osdmap.is_noup(p) &&
9624 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
9625 any = true;
9626 }
9627 break;
9628
9629 case OP_NODOWN:
9630 if (!osdmap.is_nodown(p) &&
9631 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
9632 any = true;
9633 }
9634 break;
9635
9636 case OP_NOIN:
9637 if (!osdmap.is_noin(p) &&
9638 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
9639 any = true;
9640 }
9641 break;
9642
9643 case OP_NOOUT:
9644 if (!osdmap.is_noout(p) &&
9645 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
9646 any = true;
9647 }
9648 break;
9649
9650 default:
9651 assert(0 == "invalid option");
9652 }
9653 }
9654
9655 stop = true;
9656 } else {
9657 // try traditional single osd way
9658
9659 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9660 if (osd < 0) {
9661 // ss has reason for failure
9662 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9663 err = -EINVAL;
9664 continue;
9665 }
9666
9667 osds.push_back(osd);
9668 }
9669
9670 for (auto &osd : osds) {
9671
9672 if (!osdmap.exists(osd)) {
9673 ss << "osd." << osd << " does not exist. ";
9674 continue;
9675 }
9676
9677 switch (option) {
9678 case OP_NOUP:
9679 if (osdmap.is_noup(osd)) {
9680 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9681 any = true;
9682 } else if (pending_inc.pending_osd_state_clear(
9683 osd, CEPH_OSD_NOUP)) {
9684 any = true;
9685 }
9686 break;
9687
9688 case OP_NODOWN:
9689 if (osdmap.is_nodown(osd)) {
9690 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9691 any = true;
9692 } else if (pending_inc.pending_osd_state_clear(
9693 osd, CEPH_OSD_NODOWN)) {
9694 any = true;
9695 }
9696 break;
9697
9698 case OP_NOIN:
9699 if (osdmap.is_noin(osd)) {
9700 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9701 any = true;
9702 } else if (pending_inc.pending_osd_state_clear(
9703 osd, CEPH_OSD_NOIN)) {
9704 any = true;
9705 }
9706 break;
9707
9708 case OP_NOOUT:
9709 if (osdmap.is_noout(osd)) {
9710 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9711 any = true;
9712 } else if (pending_inc.pending_osd_state_clear(
9713 osd, CEPH_OSD_NOOUT)) {
9714 any = true;
9715 }
9716 break;
9717
9718 default:
9719 assert(0 == "invalid option");
9720 }
9721 }
9722 }
9723
9724 if (any) {
9725 getline(ss, rs);
9726 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9727 get_last_committed() + 1));
9728 return true;
9729 }
9730 } else if (prefix == "osd pg-temp") {
9731 string pgidstr;
9732 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9733 ss << "unable to parse 'pgid' value '"
9734 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9735 err = -EINVAL;
9736 goto reply;
9737 }
9738 pg_t pgid;
9739 if (!pgid.parse(pgidstr.c_str())) {
9740 ss << "invalid pgid '" << pgidstr << "'";
9741 err = -EINVAL;
9742 goto reply;
9743 }
9744 if (!osdmap.pg_exists(pgid)) {
9745 ss << "pg " << pgid << " does not exist";
9746 err = -ENOENT;
9747 goto reply;
9748 }
9749 if (pending_inc.new_pg_temp.count(pgid)) {
9750 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
9751 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9752 return true;
9753 }
9754
9755 vector<int64_t> id_vec;
9756 vector<int32_t> new_pg_temp;
9757 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9758 ss << "unable to parse 'id' value(s) '"
9759 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9760 err = -EINVAL;
9761 goto reply;
9762 }
9763 for (auto osd : id_vec) {
9764 if (!osdmap.exists(osd)) {
9765 ss << "osd." << osd << " does not exist";
9766 err = -ENOENT;
9767 goto reply;
9768 }
9769 new_pg_temp.push_back(osd);
9770 }
9771
9772 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9773 if ((int)new_pg_temp.size() < pool_min_size) {
9774 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
9775 << pool_min_size << ")";
9776 err = -EINVAL;
9777 goto reply;
9778 }
9779
9780 int pool_size = osdmap.get_pg_pool_size(pgid);
9781 if ((int)new_pg_temp.size() > pool_size) {
9782 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
9783 << pool_size << ")";
9784 err = -EINVAL;
9785 goto reply;
9786 }
9787
9788 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
9789 new_pg_temp.begin(), new_pg_temp.end());
9790 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
9791 goto update;
9792 } else if (prefix == "osd primary-temp") {
9793 string pgidstr;
9794 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9795 ss << "unable to parse 'pgid' value '"
9796 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9797 err = -EINVAL;
9798 goto reply;
9799 }
9800 pg_t pgid;
9801 if (!pgid.parse(pgidstr.c_str())) {
9802 ss << "invalid pgid '" << pgidstr << "'";
9803 err = -EINVAL;
9804 goto reply;
9805 }
9806 if (!osdmap.pg_exists(pgid)) {
9807 ss << "pg " << pgid << " does not exist";
9808 err = -ENOENT;
9809 goto reply;
9810 }
9811
9812 int64_t osd;
9813 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
9814 ss << "unable to parse 'id' value '"
9815 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9816 err = -EINVAL;
9817 goto reply;
9818 }
9819 if (osd != -1 && !osdmap.exists(osd)) {
9820 ss << "osd." << osd << " does not exist";
9821 err = -ENOENT;
9822 goto reply;
9823 }
9824
9825 if (osdmap.require_min_compat_client > 0 &&
9826 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
9827 ss << "require_min_compat_client "
9828 << ceph_release_name(osdmap.require_min_compat_client)
9829 << " < firefly, which is required for primary-temp";
9830 err = -EPERM;
9831 goto reply;
9832 } else if (!g_conf->mon_osd_allow_primary_temp) {
9833 ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
9834 err = -EPERM;
9835 goto reply;
9836 }
9837
9838 pending_inc.new_primary_temp[pgid] = osd;
9839 ss << "set " << pgid << " primary_temp mapping to " << osd;
9840 goto update;
9841 } else if (prefix == "osd pg-upmap" ||
9842 prefix == "osd rm-pg-upmap" ||
9843 prefix == "osd pg-upmap-items" ||
9844 prefix == "osd rm-pg-upmap-items") {
9845 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
9846 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9847 << "luminous' before using the new interface";
9848 err = -EPERM;
9849 goto reply;
9850 }
9851 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9852 ss << "min_compat_client "
9853 << ceph_release_name(osdmap.require_min_compat_client)
9854 << " < luminous, which is required for pg-upmap. "
9855 << "Try 'ceph osd set-require-min-compat-client luminous' "
9856 << "before using the new interface";
9857 err = -EPERM;
9858 goto reply;
9859 }
9860 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
9861 if (err == -EAGAIN)
9862 goto wait;
9863 if (err < 0)
9864 goto reply;
9865 string pgidstr;
9866 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9867 ss << "unable to parse 'pgid' value '"
9868 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9869 err = -EINVAL;
9870 goto reply;
9871 }
9872 pg_t pgid;
9873 if (!pgid.parse(pgidstr.c_str())) {
9874 ss << "invalid pgid '" << pgidstr << "'";
9875 err = -EINVAL;
9876 goto reply;
9877 }
9878 if (!osdmap.pg_exists(pgid)) {
9879 ss << "pg " << pgid << " does not exist";
9880 err = -ENOENT;
9881 goto reply;
9882 }
9883
9884 enum {
9885 OP_PG_UPMAP,
9886 OP_RM_PG_UPMAP,
9887 OP_PG_UPMAP_ITEMS,
9888 OP_RM_PG_UPMAP_ITEMS,
9889 } option;
9890
9891 if (prefix == "osd pg-upmap") {
9892 option = OP_PG_UPMAP;
9893 } else if (prefix == "osd rm-pg-upmap") {
9894 option = OP_RM_PG_UPMAP;
9895 } else if (prefix == "osd pg-upmap-items") {
9896 option = OP_PG_UPMAP_ITEMS;
9897 } else {
9898 option = OP_RM_PG_UPMAP_ITEMS;
9899 }
9900
9901 // check pending upmap changes
9902 switch (option) {
9903 case OP_PG_UPMAP: // fall through
9904 case OP_RM_PG_UPMAP:
9905 if (pending_inc.new_pg_upmap.count(pgid) ||
9906 pending_inc.old_pg_upmap.count(pgid)) {
9907 dout(10) << __func__ << " waiting for pending update on "
9908 << pgid << dendl;
9909 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9910 return true;
9911 }
9912 break;
9913
9914 case OP_PG_UPMAP_ITEMS: // fall through
9915 case OP_RM_PG_UPMAP_ITEMS:
9916 if (pending_inc.new_pg_upmap_items.count(pgid) ||
9917 pending_inc.old_pg_upmap_items.count(pgid)) {
9918 dout(10) << __func__ << " waiting for pending update on "
9919 << pgid << dendl;
9920 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9921 return true;
9922 }
9923 break;
9924
9925 default:
9926 assert(0 == "invalid option");
9927 }
9928
9929 switch (option) {
9930 case OP_PG_UPMAP:
9931 {
9932 vector<int64_t> id_vec;
9933 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9934 ss << "unable to parse 'id' value(s) '"
9935 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9936 err = -EINVAL;
9937 goto reply;
9938 }
9939
9940 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9941 if ((int)id_vec.size() < pool_min_size) {
9942 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
9943 << pool_min_size << ")";
9944 err = -EINVAL;
9945 goto reply;
9946 }
9947
9948 int pool_size = osdmap.get_pg_pool_size(pgid);
9949 if ((int)id_vec.size() > pool_size) {
9950 ss << "num of osds (" << id_vec.size() <<") > pool size ("
9951 << pool_size << ")";
9952 err = -EINVAL;
9953 goto reply;
9954 }
9955
9956 vector<int32_t> new_pg_upmap;
9957 for (auto osd : id_vec) {
9958 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
9959 ss << "osd." << osd << " does not exist";
9960 err = -ENOENT;
9961 goto reply;
9962 }
9963 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
9964 if (it != new_pg_upmap.end()) {
9965 ss << "osd." << osd << " already exists, ";
9966 continue;
9967 }
9968 new_pg_upmap.push_back(osd);
9969 }
9970
9971 if (new_pg_upmap.empty()) {
9972 ss << "no valid upmap items(pairs) is specified";
9973 err = -EINVAL;
9974 goto reply;
9975 }
9976
9977 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
9978 new_pg_upmap.begin(), new_pg_upmap.end());
9979 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
9980 }
9981 break;
9982
9983 case OP_RM_PG_UPMAP:
9984 {
9985 pending_inc.old_pg_upmap.insert(pgid);
9986 ss << "clear " << pgid << " pg_upmap mapping";
9987 }
9988 break;
9989
9990 case OP_PG_UPMAP_ITEMS:
9991 {
9992 vector<int64_t> id_vec;
9993 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9994 ss << "unable to parse 'id' value(s) '"
9995 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9996 err = -EINVAL;
9997 goto reply;
9998 }
9999
10000 if (id_vec.size() % 2) {
10001 ss << "you must specify pairs of osd ids to be remapped";
10002 err = -EINVAL;
10003 goto reply;
10004 }
10005
10006 int pool_size = osdmap.get_pg_pool_size(pgid);
10007 if ((int)(id_vec.size() / 2) > pool_size) {
10008 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
10009 << pool_size << ")";
10010 err = -EINVAL;
10011 goto reply;
10012 }
10013
10014 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
10015 ostringstream items;
10016 items << "[";
10017 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
10018 int from = *p++;
10019 int to = *p;
10020 if (from == to) {
10021 ss << "from osd." << from << " == to osd." << to << ", ";
10022 continue;
10023 }
10024 if (!osdmap.exists(from)) {
10025 ss << "osd." << from << " does not exist";
10026 err = -ENOENT;
10027 goto reply;
10028 }
10029 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
10030 ss << "osd." << to << " does not exist";
10031 err = -ENOENT;
10032 goto reply;
10033 }
10034 pair<int32_t,int32_t> entry = make_pair(from, to);
10035 auto it = std::find(new_pg_upmap_items.begin(),
10036 new_pg_upmap_items.end(), entry);
10037 if (it != new_pg_upmap_items.end()) {
10038 ss << "osd." << from << " -> osd." << to << " already exists, ";
10039 continue;
10040 }
10041 new_pg_upmap_items.push_back(entry);
10042 items << from << "->" << to << ",";
10043 }
10044 string out(items.str());
10045 out.resize(out.size() - 1); // drop last ','
10046 out += "]";
10047
10048 if (new_pg_upmap_items.empty()) {
10049 ss << "no valid upmap items(pairs) is specified";
10050 err = -EINVAL;
10051 goto reply;
10052 }
10053
10054 pending_inc.new_pg_upmap_items[pgid] =
10055 mempool::osdmap::vector<pair<int32_t,int32_t>>(
10056 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
10057 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
10058 }
10059 break;
10060
10061 case OP_RM_PG_UPMAP_ITEMS:
10062 {
10063 pending_inc.old_pg_upmap_items.insert(pgid);
10064 ss << "clear " << pgid << " pg_upmap_items mapping";
10065 }
10066 break;
10067
10068 default:
10069 assert(0 == "invalid option");
10070 }
10071
10072 goto update;
10073 } else if (prefix == "osd primary-affinity") {
10074 int64_t id;
10075 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10076 ss << "invalid osd id value '"
10077 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10078 err = -EINVAL;
10079 goto reply;
10080 }
10081 double w;
10082 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
10083 ss << "unable to parse 'weight' value '"
10084 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10085 err = -EINVAL;
10086 goto reply;
10087 }
10088 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
10089 if (ww < 0L) {
10090 ss << "weight must be >= 0";
10091 err = -EINVAL;
10092 goto reply;
10093 }
10094 if (osdmap.require_min_compat_client > 0 &&
10095 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10096 ss << "require_min_compat_client "
10097 << ceph_release_name(osdmap.require_min_compat_client)
10098 << " < firefly, which is required for primary-affinity";
10099 err = -EPERM;
10100 goto reply;
10101 } else if (!g_conf->mon_osd_allow_primary_affinity) {
10102 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
10103 err = -EPERM;
10104 goto reply;
10105 }
10106 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
10107 if (err == -EAGAIN)
10108 goto wait;
10109 if (err < 0)
10110 goto reply;
10111 if (osdmap.exists(id)) {
10112 pending_inc.new_primary_affinity[id] = ww;
10113 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
10114 getline(ss, rs);
10115 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10116 get_last_committed() + 1));
10117 return true;
10118 } else {
10119 ss << "osd." << id << " does not exist";
10120 err = -ENOENT;
10121 goto reply;
10122 }
10123 } else if (prefix == "osd reweight") {
10124 int64_t id;
10125 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10126 ss << "unable to parse osd id value '"
10127 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10128 err = -EINVAL;
10129 goto reply;
10130 }
10131 double w;
10132 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
10133 ss << "unable to parse weight value '"
10134 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10135 err = -EINVAL;
10136 goto reply;
10137 }
10138 long ww = (int)((double)CEPH_OSD_IN*w);
10139 if (ww < 0L) {
10140 ss << "weight must be >= 0";
10141 err = -EINVAL;
10142 goto reply;
10143 }
10144 if (osdmap.exists(id)) {
10145 pending_inc.new_weight[id] = ww;
10146 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
10147 getline(ss, rs);
10148 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10149 get_last_committed() + 1));
10150 return true;
10151 } else {
10152 ss << "osd." << id << " does not exist";
10153 err = -ENOENT;
10154 goto reply;
10155 }
10156 } else if (prefix == "osd reweightn") {
10157 map<int32_t, uint32_t> weights;
10158 err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
10159 if (err) {
10160 ss << "unable to parse 'weights' value '"
10161 << cmd_vartype_stringify(cmdmap["weights"]) << "'";
10162 goto reply;
10163 }
10164 pending_inc.new_weight.insert(weights.begin(), weights.end());
10165 wait_for_finished_proposal(
10166 op,
10167 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
10168 return true;
10169 } else if (prefix == "osd lost") {
10170 int64_t id;
10171 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10172 ss << "unable to parse osd id value '"
10173 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10174 err = -EINVAL;
10175 goto reply;
10176 }
10177 string sure;
10178 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
10179 ss << "are you SURE? this might mean real, permanent data loss. pass "
10180 "--yes-i-really-mean-it if you really do.";
10181 err = -EPERM;
10182 goto reply;
10183 } else if (!osdmap.exists(id)) {
10184 ss << "osd." << id << " does not exist";
10185 err = -ENOENT;
10186 goto reply;
10187 } else if (!osdmap.is_down(id)) {
10188 ss << "osd." << id << " is not down";
10189 err = -EBUSY;
10190 goto reply;
10191 } else {
10192 epoch_t e = osdmap.get_info(id).down_at;
10193 pending_inc.new_lost[id] = e;
10194 ss << "marked osd lost in epoch " << e;
10195 getline(ss, rs);
10196 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10197 get_last_committed() + 1));
10198 return true;
10199 }
10200
10201 } else if (prefix == "osd destroy" || prefix == "osd purge") {
10202 /* Destroying an OSD means that we don't expect to further make use of
10203 * the OSDs data (which may even become unreadable after this operation),
10204 * and that we are okay with scrubbing all its cephx keys and config-key
10205 * data (which may include lockbox keys, thus rendering the osd's data
10206 * unreadable).
10207 *
10208 * The OSD will not be removed. Instead, we will mark it as destroyed,
10209 * such that a subsequent call to `create` will not reuse the osd id.
10210 * This will play into being able to recreate the OSD, at the same
10211 * crush location, with minimal data movement.
10212 */
10213
10214 // make sure authmon is writeable.
10215 if (!mon->authmon()->is_writeable()) {
10216 dout(10) << __func__ << " waiting for auth mon to be writeable for "
10217 << "osd destroy" << dendl;
10218 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10219 return false;
10220 }
10221
10222 int64_t id;
10223 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10224 ss << "unable to parse osd id value '"
10225 << cmd_vartype_stringify(cmdmap["id"]) << "";
10226 err = -EINVAL;
10227 goto reply;
10228 }
10229
10230 bool is_destroy = (prefix == "osd destroy");
10231 if (!is_destroy) {
10232 assert("osd purge" == prefix);
10233 }
10234
10235 string sure;
10236 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
10237 sure != "--yes-i-really-mean-it") {
10238 ss << "Are you SURE? This will mean real, permanent data loss, as well "
10239 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
10240 << "really do.";
10241 err = -EPERM;
10242 goto reply;
10243 } else if (!osdmap.exists(id)) {
10244 ss << "osd." << id << " does not exist";
10245 err = 0; // idempotent
10246 goto reply;
10247 } else if (osdmap.is_up(id)) {
10248 ss << "osd." << id << " is not `down`.";
10249 err = -EBUSY;
10250 goto reply;
10251 } else if (is_destroy && osdmap.is_destroyed(id)) {
10252 ss << "destroyed osd." << id;
10253 err = 0;
10254 goto reply;
10255 }
10256
10257 bool goto_reply = false;
10258
10259 paxos->plug();
10260 if (is_destroy) {
10261 err = prepare_command_osd_destroy(id, ss);
10262 // we checked above that it should exist.
10263 assert(err != -ENOENT);
10264 } else {
10265 err = prepare_command_osd_purge(id, ss);
10266 if (err == -ENOENT) {
10267 err = 0;
10268 ss << "osd." << id << " does not exist.";
10269 goto_reply = true;
10270 }
10271 }
10272 paxos->unplug();
10273
10274 if (err < 0 || goto_reply) {
10275 goto reply;
10276 }
10277
10278 if (is_destroy) {
10279 ss << "destroyed osd." << id;
10280 } else {
10281 ss << "purged osd." << id;
10282 }
10283
10284 getline(ss, rs);
10285 wait_for_finished_proposal(op,
10286 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
10287 force_immediate_propose();
10288 return true;
10289
10290 } else if (prefix == "osd new") {
10291
10292 // make sure authmon is writeable.
10293 if (!mon->authmon()->is_writeable()) {
10294 dout(10) << __func__ << " waiting for auth mon to be writeable for "
10295 << "osd new" << dendl;
10296 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10297 return false;
10298 }
10299
10300 map<string,string> param_map;
10301
10302 bufferlist bl = m->get_data();
10303 string param_json = bl.to_str();
10304 dout(20) << __func__ << " osd new json = " << param_json << dendl;
10305
10306 err = get_json_str_map(param_json, ss, &param_map);
10307 if (err < 0)
10308 goto reply;
10309
10310 dout(20) << __func__ << " osd new params " << param_map << dendl;
10311
10312 paxos->plug();
10313 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
10314 paxos->unplug();
10315
10316 if (err < 0) {
10317 goto reply;
10318 }
10319
10320 if (f) {
10321 f->flush(rdata);
10322 } else {
10323 rdata.append(ss);
10324 }
10325
10326 if (err == EEXIST) {
10327 // idempotent operation
10328 err = 0;
10329 goto reply;
10330 }
10331
10332 wait_for_finished_proposal(op,
10333 new Monitor::C_Command(mon, op, 0, rs, rdata,
10334 get_last_committed() + 1));
10335 force_immediate_propose();
10336 return true;
10337
10338 } else if (prefix == "osd create") {
10339
10340 // optional id provided?
10341 int64_t id = -1, cmd_id = -1;
10342 if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
10343 if (cmd_id < 0) {
10344 ss << "invalid osd id value '" << cmd_id << "'";
10345 err = -EINVAL;
10346 goto reply;
10347 }
10348 dout(10) << " osd create got id " << cmd_id << dendl;
10349 }
10350
10351 uuid_d uuid;
10352 string uuidstr;
10353 if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
10354 if (!uuid.parse(uuidstr.c_str())) {
10355 ss << "invalid uuid value '" << uuidstr << "'";
10356 err = -EINVAL;
10357 goto reply;
10358 }
10359 // we only care about the id if we also have the uuid, to
10360 // ensure the operation's idempotency.
10361 id = cmd_id;
10362 }
10363
10364 int32_t new_id = -1;
10365 err = prepare_command_osd_create(id, uuid, &new_id, ss);
10366 if (err < 0) {
10367 if (err == -EAGAIN) {
10368 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10369 return true;
10370 }
10371 // a check has failed; reply to the user.
10372 goto reply;
10373
10374 } else if (err == EEXIST) {
10375 // this is an idempotent operation; we can go ahead and reply.
10376 if (f) {
10377 f->open_object_section("created_osd");
10378 f->dump_int("osdid", new_id);
10379 f->close_section();
10380 f->flush(rdata);
10381 } else {
10382 ss << new_id;
10383 rdata.append(ss);
10384 }
10385 err = 0;
10386 goto reply;
10387 }
10388
10389 string empty_device_class;
10390 do_osd_create(id, uuid, empty_device_class, &new_id);
10391
10392 if (f) {
10393 f->open_object_section("created_osd");
10394 f->dump_int("osdid", new_id);
10395 f->close_section();
10396 f->flush(rdata);
10397 } else {
10398 ss << new_id;
10399 rdata.append(ss);
10400 }
10401 wait_for_finished_proposal(op,
10402 new Monitor::C_Command(mon, op, 0, rs, rdata,
10403 get_last_committed() + 1));
10404 return true;
10405
10406 } else if (prefix == "osd blacklist clear") {
10407 pending_inc.new_blacklist.clear();
10408 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
10409 osdmap.get_blacklist(&blacklist);
10410 for (const auto &entry : blacklist) {
10411 pending_inc.old_blacklist.push_back(entry.first);
10412 }
10413 ss << " removed all blacklist entries";
10414 getline(ss, rs);
10415 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10416 get_last_committed() + 1));
10417 return true;
10418 } else if (prefix == "osd blacklist") {
10419 string addrstr;
10420 cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
10421 entity_addr_t addr;
10422 if (!addr.parse(addrstr.c_str(), 0)) {
10423 ss << "unable to parse address " << addrstr;
10424 err = -EINVAL;
10425 goto reply;
10426 }
10427 else {
10428 string blacklistop;
10429 cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
10430 if (blacklistop == "add") {
10431 utime_t expires = ceph_clock_now();
10432 double d;
10433 // default one hour
10434 cmd_getval(g_ceph_context, cmdmap, "expire", d,
10435 g_conf->mon_osd_blacklist_default_expire);
10436 expires += d;
10437
10438 pending_inc.new_blacklist[addr] = expires;
10439
10440 {
10441 // cancel any pending un-blacklisting request too
10442 auto it = std::find(pending_inc.old_blacklist.begin(),
10443 pending_inc.old_blacklist.end(), addr);
10444 if (it != pending_inc.old_blacklist.end()) {
10445 pending_inc.old_blacklist.erase(it);
10446 }
10447 }
10448
10449 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
10450 getline(ss, rs);
10451 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10452 get_last_committed() + 1));
10453 return true;
10454 } else if (blacklistop == "rm") {
10455 if (osdmap.is_blacklisted(addr) ||
10456 pending_inc.new_blacklist.count(addr)) {
10457 if (osdmap.is_blacklisted(addr))
10458 pending_inc.old_blacklist.push_back(addr);
10459 else
10460 pending_inc.new_blacklist.erase(addr);
10461 ss << "un-blacklisting " << addr;
10462 getline(ss, rs);
10463 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10464 get_last_committed() + 1));
10465 return true;
10466 }
10467 ss << addr << " isn't blacklisted";
10468 err = 0;
10469 goto reply;
10470 }
10471 }
10472 } else if (prefix == "osd pool mksnap") {
10473 string poolstr;
10474 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10475 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10476 if (pool < 0) {
10477 ss << "unrecognized pool '" << poolstr << "'";
10478 err = -ENOENT;
10479 goto reply;
10480 }
10481 string snapname;
10482 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10483 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10484 if (p->is_unmanaged_snaps_mode()) {
10485 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10486 err = -EINVAL;
10487 goto reply;
10488 } else if (p->snap_exists(snapname.c_str())) {
10489 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10490 err = 0;
10491 goto reply;
10492 } else if (p->is_tier()) {
10493 ss << "pool " << poolstr << " is a cache tier";
10494 err = -EINVAL;
10495 goto reply;
10496 }
10497 pg_pool_t *pp = 0;
10498 if (pending_inc.new_pools.count(pool))
10499 pp = &pending_inc.new_pools[pool];
10500 if (!pp) {
10501 pp = &pending_inc.new_pools[pool];
10502 *pp = *p;
10503 }
10504 if (pp->snap_exists(snapname.c_str())) {
10505 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10506 } else {
10507 pp->add_snap(snapname.c_str(), ceph_clock_now());
10508 pp->set_snap_epoch(pending_inc.epoch);
10509 ss << "created pool " << poolstr << " snap " << snapname;
10510 }
10511 getline(ss, rs);
10512 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10513 get_last_committed() + 1));
10514 return true;
10515 } else if (prefix == "osd pool rmsnap") {
10516 string poolstr;
10517 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10518 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10519 if (pool < 0) {
10520 ss << "unrecognized pool '" << poolstr << "'";
10521 err = -ENOENT;
10522 goto reply;
10523 }
10524 string snapname;
10525 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10526 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10527 if (p->is_unmanaged_snaps_mode()) {
10528 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10529 err = -EINVAL;
10530 goto reply;
10531 } else if (!p->snap_exists(snapname.c_str())) {
10532 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
10533 err = 0;
10534 goto reply;
10535 }
10536 pg_pool_t *pp = 0;
10537 if (pending_inc.new_pools.count(pool))
10538 pp = &pending_inc.new_pools[pool];
10539 if (!pp) {
10540 pp = &pending_inc.new_pools[pool];
10541 *pp = *p;
10542 }
10543 snapid_t sn = pp->snap_exists(snapname.c_str());
10544 if (sn) {
10545 pp->remove_snap(sn);
10546 pp->set_snap_epoch(pending_inc.epoch);
10547 ss << "removed pool " << poolstr << " snap " << snapname;
10548 } else {
10549 ss << "already removed pool " << poolstr << " snap " << snapname;
10550 }
10551 getline(ss, rs);
10552 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10553 get_last_committed() + 1));
10554 return true;
10555 } else if (prefix == "osd pool create") {
10556 int64_t pg_num;
10557 int64_t pgp_num;
10558 cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
10559 cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
10560
10561 string pool_type_str;
10562 cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
10563 if (pool_type_str.empty())
10564 pool_type_str = g_conf->osd_pool_default_type;
10565
10566 string poolstr;
10567 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10568 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10569 if (pool_id >= 0) {
10570 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10571 if (pool_type_str != p->get_type_name()) {
10572 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
10573 err = -EINVAL;
10574 } else {
10575 ss << "pool '" << poolstr << "' already exists";
10576 err = 0;
10577 }
10578 goto reply;
10579 }
10580
10581 int pool_type;
10582 if (pool_type_str == "replicated") {
10583 pool_type = pg_pool_t::TYPE_REPLICATED;
10584 } else if (pool_type_str == "erasure") {
10585 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
10586 CEPH_FEATURE_OSD_ERASURE_CODES,
10587 ss);
10588 if (err == -EAGAIN)
10589 goto wait;
10590 if (err)
10591 goto reply;
10592 pool_type = pg_pool_t::TYPE_ERASURE;
10593 } else {
10594 ss << "unknown pool type '" << pool_type_str << "'";
10595 err = -EINVAL;
10596 goto reply;
10597 }
10598
10599 bool implicit_rule_creation = false;
10600 string rule_name;
10601 cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
10602 string erasure_code_profile;
10603 cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
10604
10605 if (pool_type == pg_pool_t::TYPE_ERASURE) {
10606 if (erasure_code_profile == "")
10607 erasure_code_profile = "default";
10608 //handle the erasure code profile
10609 if (erasure_code_profile == "default") {
10610 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
10611 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
10612 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
10613 goto wait;
10614 }
10615
10616 map<string,string> profile_map;
10617 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
10618 profile_map,
10619 &ss);
10620 if (err)
10621 goto reply;
10622 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
10623 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
10624 goto wait;
10625 }
10626 }
10627 if (rule_name == "") {
10628 implicit_rule_creation = true;
10629 if (erasure_code_profile == "default") {
10630 rule_name = "erasure-code";
10631 } else {
10632 dout(1) << "implicitly use rule named after the pool: "
10633 << poolstr << dendl;
10634 rule_name = poolstr;
10635 }
10636 }
10637 } else {
10638 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
10639 rule_name = erasure_code_profile;
10640 }
10641
10642 if (!implicit_rule_creation && rule_name != "") {
10643 int rule;
10644 err = get_crush_rule(rule_name, &rule, &ss);
10645 if (err == -EAGAIN) {
10646 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10647 return true;
10648 }
10649 if (err)
10650 goto reply;
10651 }
10652
10653 int64_t expected_num_objects;
10654 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
10655 if (expected_num_objects < 0) {
10656 ss << "'expected_num_objects' must be non-negative";
10657 err = -EINVAL;
10658 goto reply;
10659 }
10660
10661 int64_t fast_read_param;
10662 cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
10663 FastReadType fast_read = FAST_READ_DEFAULT;
10664 if (fast_read_param == 0)
10665 fast_read = FAST_READ_OFF;
10666 else if (fast_read_param > 0)
10667 fast_read = FAST_READ_ON;
10668
10669 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
10670 -1, // default crush rule
10671 rule_name,
10672 pg_num, pgp_num,
10673 erasure_code_profile, pool_type,
10674 (uint64_t)expected_num_objects,
10675 fast_read,
10676 &ss);
10677 if (err < 0) {
10678 switch(err) {
10679 case -EEXIST:
10680 ss << "pool '" << poolstr << "' already exists";
10681 break;
10682 case -EAGAIN:
10683 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10684 return true;
10685 case -ERANGE:
10686 goto reply;
10687 default:
10688 goto reply;
10689 break;
10690 }
10691 } else {
10692 ss << "pool '" << poolstr << "' created";
10693 }
10694 getline(ss, rs);
10695 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10696 get_last_committed() + 1));
10697 return true;
10698
10699 } else if (prefix == "osd pool delete" ||
10700 prefix == "osd pool rm") {
10701 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10702 string poolstr, poolstr2, sure;
10703 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10704 cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
10705 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10706 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10707 if (pool < 0) {
10708 ss << "pool '" << poolstr << "' does not exist";
10709 err = 0;
10710 goto reply;
10711 }
10712
10713 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
10714 if (poolstr2 != poolstr ||
10715 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
10716 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10717 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10718 << "followed by --yes-i-really-really-mean-it.";
10719 err = -EPERM;
10720 goto reply;
10721 }
10722 err = _prepare_remove_pool(pool, &ss, force_no_fake);
10723 if (err == -EAGAIN) {
10724 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10725 return true;
10726 }
10727 if (err < 0)
10728 goto reply;
10729 goto update;
10730 } else if (prefix == "osd pool rename") {
10731 string srcpoolstr, destpoolstr;
10732 cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
10733 cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
10734 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
10735 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
10736
10737 if (pool_src < 0) {
10738 if (pool_dst >= 0) {
10739 // src pool doesn't exist, dst pool does exist: to ensure idempotency
10740 // of operations, assume this rename succeeded, as it is not changing
10741 // the current state. Make sure we output something understandable
10742 // for whoever is issuing the command, if they are paying attention,
10743 // in case it was not intentional; or to avoid a "wtf?" and a bug
10744 // report in case it was intentional, while expecting a failure.
10745 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
10746 << destpoolstr << "' does -- assuming successful rename";
10747 err = 0;
10748 } else {
10749 ss << "unrecognized pool '" << srcpoolstr << "'";
10750 err = -ENOENT;
10751 }
10752 goto reply;
10753 } else if (pool_dst >= 0) {
10754 // source pool exists and so does the destination pool
10755 ss << "pool '" << destpoolstr << "' already exists";
10756 err = -EEXIST;
10757 goto reply;
10758 }
10759
10760 int ret = _prepare_rename_pool(pool_src, destpoolstr);
10761 if (ret == 0) {
10762 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
10763 } else {
10764 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
10765 << cpp_strerror(ret);
10766 }
10767 getline(ss, rs);
10768 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
10769 get_last_committed() + 1));
10770 return true;
10771
10772 } else if (prefix == "osd pool set") {
10773 err = prepare_command_pool_set(cmdmap, ss);
10774 if (err == -EAGAIN)
10775 goto wait;
10776 if (err < 0)
10777 goto reply;
10778
10779 getline(ss, rs);
10780 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10781 get_last_committed() + 1));
10782 return true;
10783 } else if (prefix == "osd tier add") {
10784 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10785 if (err == -EAGAIN)
10786 goto wait;
10787 if (err)
10788 goto reply;
10789 string poolstr;
10790 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10791 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10792 if (pool_id < 0) {
10793 ss << "unrecognized pool '" << poolstr << "'";
10794 err = -ENOENT;
10795 goto reply;
10796 }
10797 string tierpoolstr;
10798 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10799 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10800 if (tierpool_id < 0) {
10801 ss << "unrecognized pool '" << tierpoolstr << "'";
10802 err = -ENOENT;
10803 goto reply;
10804 }
10805 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10806 assert(p);
10807 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10808 assert(tp);
10809
10810 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10811 goto reply;
10812 }
10813
10814 // make sure new tier is empty
10815 string force_nonempty;
10816 cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
10817 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
10818 if (pstats && pstats->stats.sum.num_objects != 0 &&
10819 force_nonempty != "--force-nonempty") {
10820 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
10821 err = -ENOTEMPTY;
10822 goto reply;
10823 }
10824 if (tp->ec_pool()) {
10825 ss << "tier pool '" << tierpoolstr
10826 << "' is an ec pool, which cannot be a tier";
10827 err = -ENOTSUP;
10828 goto reply;
10829 }
10830 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
10831 ((force_nonempty != "--force-nonempty") ||
10832 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
10833 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
10834 err = -ENOTEMPTY;
10835 goto reply;
10836 }
10837 // go
10838 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10839 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10840 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10841 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10842 return true;
10843 }
10844 np->tiers.insert(tierpool_id);
10845 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10846 ntp->tier_of = pool_id;
10847 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
10848 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10849 get_last_committed() + 1));
10850 return true;
10851 } else if (prefix == "osd tier remove" ||
10852 prefix == "osd tier rm") {
10853 string poolstr;
10854 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10855 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10856 if (pool_id < 0) {
10857 ss << "unrecognized pool '" << poolstr << "'";
10858 err = -ENOENT;
10859 goto reply;
10860 }
10861 string tierpoolstr;
10862 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10863 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10864 if (tierpool_id < 0) {
10865 ss << "unrecognized pool '" << tierpoolstr << "'";
10866 err = -ENOENT;
10867 goto reply;
10868 }
10869 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10870 assert(p);
10871 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10872 assert(tp);
10873
10874 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
10875 goto reply;
10876 }
10877
10878 if (p->tiers.count(tierpool_id) == 0) {
10879 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10880 err = 0;
10881 goto reply;
10882 }
10883 if (tp->tier_of != pool_id) {
10884 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
10885 << osdmap.get_pool_name(tp->tier_of) << "': "
10886 // be scary about it; this is an inconsistency and bells must go off
10887 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
10888 err = -EINVAL;
10889 goto reply;
10890 }
10891 if (p->read_tier == tierpool_id) {
10892 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
10893 err = -EBUSY;
10894 goto reply;
10895 }
10896 // go
10897 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10898 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10899 if (np->tiers.count(tierpool_id) == 0 ||
10900 ntp->tier_of != pool_id ||
10901 np->read_tier == tierpool_id) {
10902 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10903 return true;
10904 }
10905 np->tiers.erase(tierpool_id);
10906 ntp->clear_tier();
10907 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10908 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10909 get_last_committed() + 1));
10910 return true;
10911 } else if (prefix == "osd tier set-overlay") {
10912 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10913 if (err == -EAGAIN)
10914 goto wait;
10915 if (err)
10916 goto reply;
10917 string poolstr;
10918 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10919 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10920 if (pool_id < 0) {
10921 ss << "unrecognized pool '" << poolstr << "'";
10922 err = -ENOENT;
10923 goto reply;
10924 }
10925 string overlaypoolstr;
10926 cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
10927 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
10928 if (overlaypool_id < 0) {
10929 ss << "unrecognized pool '" << overlaypoolstr << "'";
10930 err = -ENOENT;
10931 goto reply;
10932 }
10933 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10934 assert(p);
10935 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
10936 assert(overlay_p);
10937 if (p->tiers.count(overlaypool_id) == 0) {
10938 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
10939 err = -EINVAL;
10940 goto reply;
10941 }
10942 if (p->read_tier == overlaypool_id) {
10943 err = 0;
10944 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10945 goto reply;
10946 }
10947 if (p->has_read_tier()) {
10948 ss << "pool '" << poolstr << "' has overlay '"
10949 << osdmap.get_pool_name(p->read_tier)
10950 << "'; please remove-overlay first";
10951 err = -EINVAL;
10952 goto reply;
10953 }
10954
10955 // go
10956 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10957 np->read_tier = overlaypool_id;
10958 np->write_tier = overlaypool_id;
10959 np->set_last_force_op_resend(pending_inc.epoch);
10960 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
10961 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
10962 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10963 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
10964 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
10965 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10966 get_last_committed() + 1));
10967 return true;
10968 } else if (prefix == "osd tier remove-overlay" ||
10969 prefix == "osd tier rm-overlay") {
10970 string poolstr;
10971 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10972 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10973 if (pool_id < 0) {
10974 ss << "unrecognized pool '" << poolstr << "'";
10975 err = -ENOENT;
10976 goto reply;
10977 }
10978 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10979 assert(p);
10980 if (!p->has_read_tier()) {
10981 err = 0;
10982 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
10983 goto reply;
10984 }
10985
10986 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
10987 goto reply;
10988 }
10989
10990 // go
10991 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10992 if (np->has_read_tier()) {
10993 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
10994 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
10995 nop->set_last_force_op_resend(pending_inc.epoch);
10996 }
10997 if (np->has_write_tier()) {
10998 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
10999 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
11000 nop->set_last_force_op_resend(pending_inc.epoch);
11001 }
11002 np->clear_read_tier();
11003 np->clear_write_tier();
11004 np->set_last_force_op_resend(pending_inc.epoch);
11005 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11006 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11007 get_last_committed() + 1));
11008 return true;
11009 } else if (prefix == "osd tier cache-mode") {
11010 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11011 if (err == -EAGAIN)
11012 goto wait;
11013 if (err)
11014 goto reply;
11015 string poolstr;
11016 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11017 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11018 if (pool_id < 0) {
11019 ss << "unrecognized pool '" << poolstr << "'";
11020 err = -ENOENT;
11021 goto reply;
11022 }
11023 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11024 assert(p);
11025 if (!p->is_tier()) {
11026 ss << "pool '" << poolstr << "' is not a tier";
11027 err = -EINVAL;
11028 goto reply;
11029 }
11030 string modestr;
11031 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
11032 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11033 if (mode < 0) {
11034 ss << "'" << modestr << "' is not a valid cache mode";
11035 err = -EINVAL;
11036 goto reply;
11037 }
11038
11039 string sure;
11040 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
11041 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11042 mode != pg_pool_t::CACHEMODE_NONE &&
11043 mode != pg_pool_t::CACHEMODE_PROXY &&
11044 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11045 sure != "--yes-i-really-mean-it") {
11046 ss << "'" << modestr << "' is not a well-supported cache mode and may "
11047 << "corrupt your data. pass --yes-i-really-mean-it to force.";
11048 err = -EPERM;
11049 goto reply;
11050 }
11051
11052 // pool already has this cache-mode set and there are no pending changes
11053 if (p->cache_mode == mode &&
11054 (pending_inc.new_pools.count(pool_id) == 0 ||
11055 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
11056 ss << "set cache-mode for pool '" << poolstr << "'"
11057 << " to " << pg_pool_t::get_cache_mode_name(mode);
11058 err = 0;
11059 goto reply;
11060 }
11061
11062 /* Mode description:
11063 *
11064 * none: No cache-mode defined
11065 * forward: Forward all reads and writes to base pool
11066 * writeback: Cache writes, promote reads from base pool
11067 * readonly: Forward writes to base pool
11068 * readforward: Writes are in writeback mode, Reads are in forward mode
11069 * proxy: Proxy all reads and writes to base pool
11070 * readproxy: Writes are in writeback mode, Reads are in proxy mode
11071 *
11072 * Hence, these are the allowed transitions:
11073 *
11074 * none -> any
11075 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11076 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11077 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
11078 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
11079 * writeback -> readforward || readproxy || forward || proxy
11080 * readonly -> any
11081 */
11082
11083 // We check if the transition is valid against the current pool mode, as
11084 // it is the only committed state thus far. We will blantly squash
11085 // whatever mode is on the pending state.
11086
11087 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
11088 (mode != pg_pool_t::CACHEMODE_FORWARD &&
11089 mode != pg_pool_t::CACHEMODE_PROXY &&
11090 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11091 mode != pg_pool_t::CACHEMODE_READPROXY)) {
11092 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
11093 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
11094 << "' pool; only '"
11095 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
11096 << "','"
11097 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
11098 << "','"
11099 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
11100 << "','"
11101 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
11102 << "' allowed.";
11103 err = -EINVAL;
11104 goto reply;
11105 }
11106 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
11107 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11108 mode != pg_pool_t::CACHEMODE_FORWARD &&
11109 mode != pg_pool_t::CACHEMODE_PROXY &&
11110 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11111
11112 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
11113 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11114 mode != pg_pool_t::CACHEMODE_FORWARD &&
11115 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11116 mode != pg_pool_t::CACHEMODE_PROXY)) ||
11117
11118 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
11119 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11120 mode != pg_pool_t::CACHEMODE_FORWARD &&
11121 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11122 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11123
11124 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
11125 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11126 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11127 mode != pg_pool_t::CACHEMODE_PROXY &&
11128 mode != pg_pool_t::CACHEMODE_READPROXY))) {
11129
11130 const pool_stat_t* pstats =
11131 mon->pgservice->get_pool_stat(pool_id);
11132
11133 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
11134 ss << "unable to set cache-mode '"
11135 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
11136 << "': dirty objects found";
11137 err = -EBUSY;
11138 goto reply;
11139 }
11140 }
11141 // go
11142 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11143 np->cache_mode = mode;
11144 // set this both when moving to and from cache_mode NONE. this is to
11145 // capture legacy pools that were set up before this flag existed.
11146 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
11147 ss << "set cache-mode for pool '" << poolstr
11148 << "' to " << pg_pool_t::get_cache_mode_name(mode);
11149 if (mode == pg_pool_t::CACHEMODE_NONE) {
11150 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11151 assert(base_pool);
11152 if (base_pool->read_tier == pool_id ||
11153 base_pool->write_tier == pool_id)
11154 ss <<" (WARNING: pool is still configured as read or write tier)";
11155 }
11156 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11157 get_last_committed() + 1));
11158 return true;
11159 } else if (prefix == "osd tier add-cache") {
11160 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11161 if (err == -EAGAIN)
11162 goto wait;
11163 if (err)
11164 goto reply;
11165 string poolstr;
11166 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11167 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11168 if (pool_id < 0) {
11169 ss << "unrecognized pool '" << poolstr << "'";
11170 err = -ENOENT;
11171 goto reply;
11172 }
11173 string tierpoolstr;
11174 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
11175 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11176 if (tierpool_id < 0) {
11177 ss << "unrecognized pool '" << tierpoolstr << "'";
11178 err = -ENOENT;
11179 goto reply;
11180 }
11181 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11182 assert(p);
11183 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11184 assert(tp);
11185
11186 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11187 goto reply;
11188 }
11189
11190 int64_t size = 0;
11191 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
11192 ss << "unable to parse 'size' value '"
11193 << cmd_vartype_stringify(cmdmap["size"]) << "'";
11194 err = -EINVAL;
11195 goto reply;
11196 }
11197 // make sure new tier is empty
11198 const pool_stat_t *pstats =
11199 mon->pgservice->get_pool_stat(tierpool_id);
11200 if (pstats && pstats->stats.sum.num_objects != 0) {
11201 ss << "tier pool '" << tierpoolstr << "' is not empty";
11202 err = -ENOTEMPTY;
11203 goto reply;
11204 }
11205 string modestr = g_conf->osd_tier_default_cache_mode;
11206 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11207 if (mode < 0) {
11208 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
11209 err = -EINVAL;
11210 goto reply;
11211 }
11212 HitSet::Params hsp;
11213 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
11214 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11215 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
11216 hsp = HitSet::Params(bsp);
11217 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
11218 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11219 }
11220 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
11221 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
11222 } else {
11223 ss << "osd tier cache default hit set type '" <<
11224 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
11225 err = -EINVAL;
11226 goto reply;
11227 }
11228 // go
11229 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11230 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11231 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11232 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11233 return true;
11234 }
11235 np->tiers.insert(tierpool_id);
11236 np->read_tier = np->write_tier = tierpool_id;
11237 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11238 np->set_last_force_op_resend(pending_inc.epoch);
11239 ntp->set_last_force_op_resend(pending_inc.epoch);
11240 ntp->tier_of = pool_id;
11241 ntp->cache_mode = mode;
11242 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
11243 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
11244 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
11245 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
11246 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
11247 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
11248 ntp->hit_set_params = hsp;
11249 ntp->target_max_bytes = size;
11250 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
11251 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11252 get_last_committed() + 1));
11253 return true;
11254 } else if (prefix == "osd pool set-quota") {
11255 string poolstr;
11256 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11257 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11258 if (pool_id < 0) {
11259 ss << "unrecognized pool '" << poolstr << "'";
11260 err = -ENOENT;
11261 goto reply;
11262 }
11263
11264 string field;
11265 cmd_getval(g_ceph_context, cmdmap, "field", field);
11266 if (field != "max_objects" && field != "max_bytes") {
11267 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
11268 err = -EINVAL;
11269 goto reply;
11270 }
11271
11272 // val could contain unit designations, so we treat as a string
11273 string val;
11274 cmd_getval(g_ceph_context, cmdmap, "val", val);
11275 stringstream tss;
11276 int64_t value = unit_to_bytesize(val, &tss);
11277 if (value < 0) {
11278 ss << "error parsing value '" << value << "': " << tss.str();
11279 err = value;
11280 goto reply;
11281 }
11282
11283 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
11284 if (field == "max_objects") {
11285 pi->quota_max_objects = value;
11286 } else if (field == "max_bytes") {
11287 pi->quota_max_bytes = value;
11288 } else {
11289 assert(0 == "unrecognized option");
11290 }
11291 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
11292 rs = ss.str();
11293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11294 get_last_committed() + 1));
11295 return true;
11296 } else if (prefix == "osd pool application enable" ||
11297 prefix == "osd pool application disable" ||
11298 prefix == "osd pool application set" ||
11299 prefix == "osd pool application rm") {
11300 err = prepare_command_pool_application(prefix, cmdmap, ss);
11301 if (err == -EAGAIN)
11302 goto wait;
11303 if (err < 0)
11304 goto reply;
11305
11306 getline(ss, rs);
11307 wait_for_finished_proposal(
11308 op, new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
11309 return true;
11310 } else if (prefix == "osd reweight-by-pg" ||
11311 prefix == "osd reweight-by-utilization" ||
11312 prefix == "osd test-reweight-by-pg" ||
11313 prefix == "osd test-reweight-by-utilization") {
11314 bool by_pg =
11315 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
11316 bool dry_run =
11317 prefix == "osd test-reweight-by-pg" ||
11318 prefix == "osd test-reweight-by-utilization";
11319 int64_t oload;
11320 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
11321 set<int64_t> pools;
11322 vector<string> poolnamevec;
11323 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
11324 for (unsigned j = 0; j < poolnamevec.size(); j++) {
11325 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
11326 if (pool < 0) {
11327 ss << "pool '" << poolnamevec[j] << "' does not exist";
11328 err = -ENOENT;
11329 goto reply;
11330 }
11331 pools.insert(pool);
11332 }
11333 double max_change = g_conf->mon_reweight_max_change;
11334 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
11335 if (max_change <= 0.0) {
11336 ss << "max_change " << max_change << " must be positive";
11337 err = -EINVAL;
11338 goto reply;
11339 }
11340 int64_t max_osds = g_conf->mon_reweight_max_osds;
11341 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
11342 if (max_osds <= 0) {
11343 ss << "max_osds " << max_osds << " must be positive";
11344 err = -EINVAL;
11345 goto reply;
11346 }
11347 string no_increasing;
11348 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
11349 string out_str;
11350 mempool::osdmap::map<int32_t, uint32_t> new_weights;
11351 err = mon->pgservice->reweight_by_utilization(osdmap,
11352 oload,
11353 max_change,
11354 max_osds,
11355 by_pg,
11356 pools.empty() ? NULL : &pools,
11357 no_increasing == "--no-increasing",
11358 &new_weights,
11359 &ss, &out_str, f.get());
11360 if (err >= 0) {
11361 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
11362 }
11363 if (f)
11364 f->flush(rdata);
11365 else
11366 rdata.append(out_str);
11367 if (err < 0) {
11368 ss << "FAILED reweight-by-pg";
11369 } else if (err == 0 || dry_run) {
11370 ss << "no change";
11371 } else {
11372 ss << "SUCCESSFUL reweight-by-pg";
11373 pending_inc.new_weight = std::move(new_weights);
11374 wait_for_finished_proposal(
11375 op,
11376 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11377 return true;
11378 }
11379 } else if (prefix == "osd force-create-pg") {
11380 pg_t pgid;
11381 string pgidstr;
11382 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
11383 if (!pgid.parse(pgidstr.c_str())) {
11384 ss << "invalid pgid '" << pgidstr << "'";
11385 err = -EINVAL;
11386 goto reply;
11387 }
11388 bool creating_now;
11389 {
11390 std::lock_guard<std::mutex> l(creating_pgs_lock);
11391 auto emplaced = creating_pgs.pgs.emplace(pgid,
11392 make_pair(osdmap.get_epoch(),
11393 ceph_clock_now()));
11394 creating_now = emplaced.second;
11395 }
11396 if (creating_now) {
11397 ss << "pg " << pgidstr << " now creating, ok";
11398 err = 0;
11399 goto update;
11400 } else {
11401 ss << "pg " << pgid << " already creating";
11402 err = 0;
11403 goto reply;
11404 }
11405 } else {
11406 err = -EINVAL;
11407 }
11408
11409 reply:
11410 getline(ss, rs);
11411 if (err < 0 && rs.length() == 0)
11412 rs = cpp_strerror(err);
11413 mon->reply_command(op, err, rs, rdata, get_last_committed());
11414 return ret;
11415
11416 update:
11417 getline(ss, rs);
11418 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11419 get_last_committed() + 1));
11420 return true;
11421
11422 wait:
11423 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11424 return true;
11425 }
11426
11427 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
11428 {
11429 op->mark_osdmon_event(__func__);
11430 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11431
11432 if (m->fsid != mon->monmap->fsid) {
11433 dout(0) << __func__ << " drop message on fsid " << m->fsid
11434 << " != " << mon->monmap->fsid << " for " << *m << dendl;
11435 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11436 return true;
11437 }
11438
11439 if (m->op == POOL_OP_CREATE)
11440 return preprocess_pool_op_create(op);
11441
11442 if (!osdmap.get_pg_pool(m->pool)) {
11443 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
11444 _pool_op_reply(op, 0, osdmap.get_epoch());
11445 return true;
11446 }
11447
11448 // check if the snap and snapname exist
11449 bool snap_exists = false;
11450 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
11451 if (p->snap_exists(m->name.c_str()))
11452 snap_exists = true;
11453
11454 switch (m->op) {
11455 case POOL_OP_CREATE_SNAP:
11456 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
11457 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11458 return true;
11459 }
11460 if (snap_exists) {
11461 _pool_op_reply(op, 0, osdmap.get_epoch());
11462 return true;
11463 }
11464 return false;
11465 case POOL_OP_CREATE_UNMANAGED_SNAP:
11466 if (p->is_pool_snaps_mode()) {
11467 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11468 return true;
11469 }
11470 return false;
11471 case POOL_OP_DELETE_SNAP:
11472 if (p->is_unmanaged_snaps_mode()) {
11473 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11474 return true;
11475 }
11476 if (!snap_exists) {
11477 _pool_op_reply(op, 0, osdmap.get_epoch());
11478 return true;
11479 }
11480 return false;
11481 case POOL_OP_DELETE_UNMANAGED_SNAP:
11482 if (p->is_pool_snaps_mode()) {
11483 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11484 return true;
11485 }
11486 if (p->is_removed_snap(m->snapid)) {
11487 _pool_op_reply(op, 0, osdmap.get_epoch());
11488 return true;
11489 }
11490 return false;
11491 case POOL_OP_DELETE:
11492 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
11493 _pool_op_reply(op, 0, osdmap.get_epoch());
11494 return true;
11495 }
11496 return false;
11497 case POOL_OP_AUID_CHANGE:
11498 return false;
11499 default:
11500 ceph_abort();
11501 break;
11502 }
11503
11504 return false;
11505 }
11506
11507 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
11508 {
11509 op->mark_osdmon_event(__func__);
11510 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11511 MonSession *session = m->get_session();
11512 if (!session) {
11513 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11514 return true;
11515 }
11516 if (!session->is_capable("osd", MON_CAP_W)) {
11517 dout(5) << "attempt to create new pool without sufficient auid privileges!"
11518 << "message: " << *m << std::endl
11519 << "caps: " << session->caps << dendl;
11520 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11521 return true;
11522 }
11523
11524 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
11525 if (pool >= 0) {
11526 _pool_op_reply(op, 0, osdmap.get_epoch());
11527 return true;
11528 }
11529
11530 return false;
11531 }
11532
// Apply a (non-create, non-delete) pool op to the pending incremental map.
// Stages the change in pending_inc and arranges the reply to be sent once
// the proposal commits; returns true when a proposal is needed, false when
// the op was answered immediately.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  // Create and delete have dedicated prepare paths.
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: reject ops that conflict with the pool's COMMITTED state,
  // and answer idempotent no-ops without proposing anything.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // Pool snaps are not allowed on tier pools.
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create of an existing snap / delete of a missing snap is an
      // idempotent success; otherwise fall out of the switch and stage
      // the change below.
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
	|| (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool snaps and unmanaged snaps are mutually exclusive
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any already-pending update to this
  // pool so successive ops in one proposal window compose correctly.
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked here against the PROJECTED state, which may differ
  // from the committed state checked above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: actually apply the op to the projected pool.  Each case
  // only sets 'changed' when something new is staged, keeping the ops
  // idempotent across retries.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // snapid is an out-parameter filled in by add_unmanaged_snap();
      // the newly allocated id is returned to the client in reply_data.
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and publish the projected pool into pending_inc
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // Reply (with ret and any reply_data) once the proposal commits.  Note
  // this is reached both on success and on the projected-state EINVALs.
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
11676
11677 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
11678 {
11679 op->mark_osdmon_event(__func__);
11680 int err = prepare_new_pool(op);
11681 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
11682 return true;
11683 }
11684
11685 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
11686 ostream *ss)
11687 {
11688 const string& poolstr = osdmap.get_pool_name(pool_id);
11689
11690 // If the Pool is in use by CephFS, refuse to delete it
11691 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
11692 if (pending_fsmap.pool_in_use(pool_id)) {
11693 *ss << "pool '" << poolstr << "' is in use by CephFS";
11694 return -EBUSY;
11695 }
11696
11697 if (pool.tier_of >= 0) {
11698 *ss << "pool '" << poolstr << "' is a tier of '"
11699 << osdmap.get_pool_name(pool.tier_of) << "'";
11700 return -EBUSY;
11701 }
11702 if (!pool.tiers.empty()) {
11703 *ss << "pool '" << poolstr << "' has tiers";
11704 for(auto tier : pool.tiers) {
11705 *ss << " " << osdmap.get_pool_name(tier);
11706 }
11707 return -EBUSY;
11708 }
11709
11710 if (!g_conf->mon_allow_pool_delete) {
11711 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
11712 return -EPERM;
11713 }
11714
11715 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
11716 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
11717 return -EPERM;
11718 }
11719
11720 *ss << "pool '" << poolstr << "' removed";
11721 return 0;
11722 }
11723
11724 /**
11725 * Check if it is safe to add a tier to a base pool
11726 *
11727 * @return
11728 * True if the operation should proceed, false if we should abort here
11729 * (abort doesn't necessarily mean error, could be idempotency)
11730 */
11731 bool OSDMonitor::_check_become_tier(
11732 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
11733 const int64_t base_pool_id, const pg_pool_t *base_pool,
11734 int *err,
11735 ostream *ss) const
11736 {
11737 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
11738 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11739
11740 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11741 if (pending_fsmap.pool_in_use(tier_pool_id)) {
11742 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
11743 *err = -EBUSY;
11744 return false;
11745 }
11746
11747 if (base_pool->tiers.count(tier_pool_id)) {
11748 assert(tier_pool->tier_of == base_pool_id);
11749 *err = 0;
11750 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
11751 << base_pool_name << "'";
11752 return false;
11753 }
11754
11755 if (base_pool->is_tier()) {
11756 *ss << "pool '" << base_pool_name << "' is already a tier of '"
11757 << osdmap.get_pool_name(base_pool->tier_of) << "', "
11758 << "multiple tiers are not yet supported.";
11759 *err = -EINVAL;
11760 return false;
11761 }
11762
11763 if (tier_pool->has_tiers()) {
11764 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
11765 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
11766 it != tier_pool->tiers.end(); ++it)
11767 *ss << "'" << osdmap.get_pool_name(*it) << "',";
11768 *ss << " multiple tiers are not yet supported.";
11769 *err = -EINVAL;
11770 return false;
11771 }
11772
11773 if (tier_pool->is_tier()) {
11774 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
11775 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
11776 *err = -EINVAL;
11777 return false;
11778 }
11779
11780 *err = 0;
11781 return true;
11782 }
11783
11784
11785 /**
11786 * Check if it is safe to remove a tier from this base pool
11787 *
11788 * @return
11789 * True if the operation should proceed, false if we should abort here
11790 * (abort doesn't necessarily mean error, could be idempotency)
11791 */
11792 bool OSDMonitor::_check_remove_tier(
11793 const int64_t base_pool_id, const pg_pool_t *base_pool,
11794 const pg_pool_t *tier_pool,
11795 int *err, ostream *ss) const
11796 {
11797 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11798
11799 // Apply CephFS-specific checks
11800 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11801 if (pending_fsmap.pool_in_use(base_pool_id)) {
11802 if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
11803 // If the underlying pool is erasure coded, we can't permit the
11804 // removal of the replicated tier that CephFS relies on to access it
11805 *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
11806 *err = -EBUSY;
11807 return false;
11808 }
11809
11810 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
11811 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
11812 "tier is still in use as a writeback cache. Change the cache "
11813 "mode and flush the cache before removing it";
11814 *err = -EBUSY;
11815 return false;
11816 }
11817 }
11818
11819 *err = 0;
11820 return true;
11821 }
11822
11823 int OSDMonitor::_prepare_remove_pool(
11824 int64_t pool, ostream *ss, bool no_fake)
11825 {
11826 dout(10) << __func__ << " " << pool << dendl;
11827 const pg_pool_t *p = osdmap.get_pg_pool(pool);
11828 int r = _check_remove_pool(pool, *p, ss);
11829 if (r < 0)
11830 return r;
11831
11832 auto new_pool = pending_inc.new_pools.find(pool);
11833 if (new_pool != pending_inc.new_pools.end()) {
11834 // if there is a problem with the pending info, wait and retry
11835 // this op.
11836 const auto& p = new_pool->second;
11837 int r = _check_remove_pool(pool, p, ss);
11838 if (r < 0)
11839 return -EAGAIN;
11840 }
11841
11842 if (pending_inc.old_pools.count(pool)) {
11843 dout(10) << __func__ << " " << pool << " already pending removal"
11844 << dendl;
11845 return 0;
11846 }
11847
11848 if (g_conf->mon_fake_pool_delete && !no_fake) {
11849 string old_name = osdmap.get_pool_name(pool);
11850 string new_name = old_name + "." + stringify(pool) + ".DELETED";
11851 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
11852 << old_name << " -> " << new_name << dendl;
11853 pending_inc.new_pool_names[pool] = new_name;
11854 return 0;
11855 }
11856
11857 // remove
11858 pending_inc.old_pools.insert(pool);
11859
11860 // remove any pg_temp mappings for this pool
11861 for (auto p = osdmap.pg_temp->begin();
11862 p != osdmap.pg_temp->end();
11863 ++p) {
11864 if (p->first.pool() == (uint64_t)pool) {
11865 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
11866 << p->first << dendl;
11867 pending_inc.new_pg_temp[p->first].clear();
11868 }
11869 }
11870 // remove any primary_temp mappings for this pool
11871 for (auto p = osdmap.primary_temp->begin();
11872 p != osdmap.primary_temp->end();
11873 ++p) {
11874 if (p->first.pool() == (uint64_t)pool) {
11875 dout(10) << __func__ << " " << pool
11876 << " removing obsolete primary_temp" << p->first << dendl;
11877 pending_inc.new_primary_temp[p->first] = -1;
11878 }
11879 }
11880 // remove any pg_upmap mappings for this pool
11881 for (auto& p : osdmap.pg_upmap) {
11882 if (p.first.pool() == (uint64_t)pool) {
11883 dout(10) << __func__ << " " << pool
11884 << " removing obsolete pg_upmap "
11885 << p.first << dendl;
11886 pending_inc.old_pg_upmap.insert(p.first);
11887 }
11888 }
11889 // remove any pg_upmap_items mappings for this pool
11890 for (auto& p : osdmap.pg_upmap_items) {
11891 if (p.first.pool() == (uint64_t)pool) {
11892 dout(10) << __func__ << " " << pool
11893 << " removing obsolete pg_upmap_items " << p.first
11894 << dendl;
11895 pending_inc.old_pg_upmap_items.insert(p.first);
11896 }
11897 }
11898
11899 // remove any choose_args for this pool
11900 CrushWrapper newcrush;
11901 _get_pending_crush(newcrush);
11902 if (newcrush.have_choose_args(pool)) {
11903 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
11904 newcrush.rm_choose_args(pool);
11905 pending_inc.crush.clear();
11906 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11907 }
11908 return 0;
11909 }
11910
11911 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
11912 {
11913 dout(10) << "_prepare_rename_pool " << pool << dendl;
11914 if (pending_inc.old_pools.count(pool)) {
11915 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
11916 return -ENOENT;
11917 }
11918 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
11919 p != pending_inc.new_pool_names.end();
11920 ++p) {
11921 if (p->second == newname && p->first != pool) {
11922 return -EEXIST;
11923 }
11924 }
11925
11926 pending_inc.new_pool_names[pool] = newname;
11927 return 0;
11928 }
11929
11930 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
11931 {
11932 op->mark_osdmon_event(__func__);
11933 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11934 ostringstream ss;
11935 int ret = _prepare_remove_pool(m->pool, &ss, false);
11936 if (ret == -EAGAIN) {
11937 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11938 return true;
11939 }
11940 if (ret < 0)
11941 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
11942 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
11943 pending_inc.epoch));
11944 return true;
11945 }
11946
11947 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
11948 int ret, epoch_t epoch, bufferlist *blp)
11949 {
11950 op->mark_osdmon_event(__func__);
11951 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11952 dout(20) << "_pool_op_reply " << ret << dendl;
11953 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
11954 ret, epoch, get_last_committed(), blp);
11955 mon->send_reply(op, reply);
11956 }