// Source: ceph/src/mon/OSDMonitor.cc (ceph nautilus 14.2.2)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MOSDScrub.h"
57 #include "messages/MRoute.h"
58
59 #include "common/TextTable.h"
60 #include "common/Timer.h"
61 #include "common/ceph_argparse.h"
62 #include "common/perf_counters.h"
63 #include "common/strtol.h"
64 #include "common/numa.h"
65
66 #include "common/config.h"
67 #include "common/errno.h"
68
69 #include "erasure-code/ErasureCodePlugin.h"
70 #include "compressor/Compressor.h"
71 #include "common/Checksummer.h"
72
73 #include "include/compat.h"
74 #include "include/ceph_assert.h"
75 #include "include/stringify.h"
76 #include "include/util.h"
77 #include "common/cmdparse.h"
78 #include "include/str_list.h"
79 #include "include/str_map.h"
80 #include "include/scope_guard.h"
81
82 #include "auth/cephx/CephxKeyServer.h"
83 #include "osd/OSDCap.h"
84
85 #include "json_spirit/json_spirit_reader.h"
86
87 #include <boost/algorithm/string/predicate.hpp>
88
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes owned by this service (advertised via
// get_store_prefixes()): in-progress pg-create state, per-OSD metadata,
// and removed-snapshot records.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
93
94 namespace {
95
96 const uint32_t MAX_POOL_APPLICATIONS = 4;
97 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
98 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
99
100 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
101 // Note: this doesn't include support for the application tag match
102 if ((grant.spec.allow & OSD_CAP_W) != 0) {
103 auto& match = grant.match;
104 if (match.is_match_all()) {
105 return true;
106 } else if (pool_name != nullptr &&
107 !match.pool_namespace.pool_name.empty() &&
108 match.pool_namespace.pool_name == *pool_name) {
109 return true;
110 }
111 }
112 return false;
113 }
114
// Decide whether 'entity_name' may perform unmanaged-snapshot pool ops.
// Two independent paths grant permission:
//  1. a mon cap covering the pseudo-command "osd pool op unmanaged-snap"
//     (restricted to the pool when 'pool_name' is non-null; a null
//     pool_name means the pool does not exist, so an unrestricted cap is
//     required), or
//  2. OSD caps from the auth db granting write access to the pool (or to
//     all pools).
// Returns false on any failure to locate, decode, or parse the caps.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, CEPH_ENTITY_TYPE_MON,
        entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
           CommandArgs{} /* pool DNE, require unrestricted cap */ :
           CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // mon caps did not allow it; fall back to the entity's OSD caps
  // stored in the auth database.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  // check each grant, expanding profile grants into their constituent
  // grants before testing writability
  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
183
184 } // anonymous namespace
185
// Record that pg 'ps' of this pool was last clean at 'last_epoch_clean'.
// Maintains three pieces of state:
//  - epoch_by_pg: per-pg last-clean epoch (0 == never reported)
//  - floor: minimum last-clean epoch across the tracked pgs
//  - next_missing: lowest ps that has not yet reported clean
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow on demand; new slots start at 0 (= never reported)
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // NOTE(review): min_element scans never-reported (0) slots too, so
      // floor drops to 0 while any pg is missing — presumably fine since
      // get_lower_bound() also returns 0 in that case; confirm.
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this report filled the gap at next_missing; advance it past every
  // contiguously-reported pg
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
216
217 void LastEpochClean::remove_pool(uint64_t pool)
218 {
219 report_by_pool.erase(pool);
220 }
221
222 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
223 {
224 auto& lec = report_by_pool[pg.pool()];
225 return lec.report(pg.ps(), last_epoch_clean);
226 }
227
228 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
229 {
230 auto floor = latest.get_epoch();
231 for (auto& pool : latest.get_pools()) {
232 auto reported = report_by_pool.find(pool.first);
233 if (reported == report_by_pool.end()) {
234 return 0;
235 }
236 if (reported->second.next_missing < pool.second.get_pg_num()) {
237 return 0;
238 }
239 if (reported->second.floor < floor) {
240 floor = reported->second.floor;
241 }
242 }
243 return floor;
244 }
245
246
// Completion callback for a background osdmap->pg mapping job: on success
// it refreshes the creating-pgs state and notifies pg-create subscribers.
// 'start' is read externally (see start_mapping()) for timing logs.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;   // when the job was kicked off, for duration logging
  epoch_t epoch;   // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted/canceled; do nothing in that case
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
264
265 #undef dout_prefix
266 #define dout_prefix _prefix(_dout, mon, osdmap)
267 static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
268 return *_dout << "mon." << mon->name << "@" << mon->rank
269 << "(" << mon->get_state_name()
270 << ").osd e" << osdmap.get_epoch() << " ";
271 }
272
// OSDMonitor is the PaxosService that manages the cluster's OSDMap.
// inc_osd_cache / full_osd_cache hold recently-encoded incremental and
// full maps (sized by mon_osd_cache_size); 'mapper' runs pg-mapping jobs
// on the monitor's shared CPU thread pool.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{}
285
286 bool OSDMonitor::_have_pending_crush()
287 {
288 return pending_inc.crush.length() > 0;
289 }
290
291 CrushWrapper &OSDMonitor::_get_stable_crush()
292 {
293 return *osdmap.crush;
294 }
295
296 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
297 {
298 bufferlist bl;
299 if (pending_inc.crush.length())
300 bl = pending_inc.crush;
301 else
302 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
303
304 auto p = bl.cbegin();
305 newcrush.decode(p);
306 }
307
// Build osdmap epoch 1 for a brand-new cluster and stash its encoding in
// the pending incremental as a full map.  Adopts an mkfs-provided osdmap
// if one was stored, otherwise builds a minimal default map; then sets the
// default flags, full/backfill/nearfull ratios, and release requirements.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  // an osdmap may have been provided at mkfs time
  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (> 1.0); normalize to fractions
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
    // debug knobs to start a new cluster at an older require_osd_release
    if (g_conf()->mon_debug_no_require_mimic) {
      derr << __func__ << " mon_debug_no_require_mimic=true and nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    } else {
      derr << __func__ << " mon_debug_no_require_nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_MIMIC;
    }
  } else {
    newmap.require_osd_release = CEPH_RELEASE_NAUTILUS;
    int r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      // misconfigured release name is fatal at cluster creation
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
366
367 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
368 {
369 s.insert(service_name);
370 s.insert(OSD_PG_CREATING_PREFIX);
371 s.insert(OSD_METADATA_PREFIX);
372 s.insert(OSD_SNAP_PREFIX);
373 }
374
// Bring the in-memory osdmap up to date with the paxos store: load the
// newest stashed full map, then apply every committed incremental past it,
// writing the derived full map for each epoch back to disk as we go, and
// finally refresh all state that depends on the map (subs, logger,
// messenger features, down->out tracking, background mapping job).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a stale mapping job is useless once the map moves; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first full map actually on disk
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // fast-forward the in-memory map to the newest stashed full map
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // reload the persisted creating-pgs state (0 return = found)
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // NB: each dout(20)..dendl pair below opens/closes its own scope
        // (dout macro), so the two local JSONFormatters do not collide.
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs-provided osdmap is consumed by epoch 1; drop it
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // keep individual transactions bounded in size
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out pending map with current osd states
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
602
// (Re)start the background job that computes pg->osd mappings for the
// current osdmap.  Any previous job is aborted first; the completion
// callback (C_UpdateCreatingPGs) refreshes creating-pgs state and wakes
// pg-create subscribers.  With no pools there is nothing to map.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
                                       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
             << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
623
624 void OSDMonitor::update_msgr_features()
625 {
626 set<int> types;
627 types.insert((int)entity_name_t::TYPE_OSD);
628 types.insert((int)entity_name_t::TYPE_CLIENT);
629 types.insert((int)entity_name_t::TYPE_MDS);
630 types.insert((int)entity_name_t::TYPE_MON);
631 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
632 uint64_t mask;
633 uint64_t features = osdmap.get_features(*q, &mask);
634 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
635 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
636 ceph::net::Policy p = mon->messenger->get_policy(*q);
637 p.features_required = (p.features_required & ~mask) | features;
638 mon->messenger->set_policy(*q, p);
639 }
640 }
641 }
642
// Called when this monitor's paxos service becomes active.  The leader
// logs the map and performs the one-time pool priority conversion; a peon
// re-dispatches any failure reports it queued while inactive.  Both kick
// off a fresh pg-mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    // drain queued failure ops back through dispatch()
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
666
667 void OSDMonitor::on_restart()
668 {
669 last_osd_report.clear();
670 }
671
// Shutdown hook: cancel any in-flight mapping job and discard queued
// failure reports (their waiters are dropped along with them).
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
686
687 void OSDMonitor::update_logger()
688 {
689 dout(10) << "update_logger" << dendl;
690
691 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
692 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
693 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
694 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
695 }
696
// Start a new pending incremental (epoch+1) on top of the committed map.
// Also applies two safety nets: re-derives any unset full/backfill/
// nearfull ratios from config, and rewrites legacy CRUSH "ruleset" ids
// into proper rule ids (renumbering the rules and repointing every pool).
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // ratios from config may be percentages (> 1.0); normalize to fractions
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
765
// Produce the creating-pgs set that should accompany incremental 'inc'
// (whose post-application map is 'nextmap'): scan for pools added/removed
// since the last scan, drop pgs that were reported created or that no
// longer exist in nextmap, then admit queued pgs up to the
// mon_osd_max_creating_pgs in-flight limit.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // copy under the lock; the rest of the work is on our private copy
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // admit as many pgs from this pool as the in-flight budget allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
                                                    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
867
// Decide which pg_temp mappings to prime into the pending incremental so
// acting sets stay usable across the map change.  Primes *all* pgs when
// the change is broad (new CRUSH map, newly-up osds, weight increases, or
// when the per-osd estimate gets too large); otherwise primes only pgs
// touching the "interesting" osds, bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds flapping from up to some other state are interesting
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate targeted-priming cost from the first osd's pg count;
    // fall back to full priming if it looks too expensive
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // materialize the post-incremental map we are priming against
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // full prime via a parallel mapper job, with a wall-clock budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // targeted prime: walk only pgs on the interesting osds, checking the
    // clock every 'chunk' pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
970
// Prime a pg_temp entry for one pg: if its acting set would change in the
// 'next' map, pin the *current* acting set as pg_temp so I/O can continue
// until the new acting set is ready (an empty vector clears pg_temp when
// next up == next acting).  Skips pgs that are still being created, don't
// exist, or where priming cannot help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the background mapper) ...
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // ... versus the mapping in the next map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace never overwrites an existing pg_temp entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1018
1019 /**
1020 * @note receiving a transaction in this function gives a fair amount of
1021 * freedom to the service implementation if it does need it. It shouldn't.
1022 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  // opportunistically prune old full osdmaps from the store first; this
  // may add its own changes to the transaction
  if (do_prune(t)) {
    dout(1) << __func__ << " osdmap full prune encoded e"
	    << pending_inc.epoch << dendl;
  }

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
  ceph_assert(r == 0);

  // if a background mapping job finished in time, use it to prime
  // pg_temp mappings for the next epoch; otherwise skip (and abort a
  // still-running job so it doesn't race with us)
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " is prior epoch "
	      << mapping.get_epoch() << dendl;
    } else {
      if (g_conf()->mon_osd_prime_pg_temp) {
	maybe_prime_pg_temp();
      }
    }
  } else if (g_conf()->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
	    << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      // an UP state bit flip means an osd went down; record the time
      if (p->second & CEPH_OSD_UP) {
	pending_inc.new_last_up_change = pending_inc.modified;
      }
      ++p;
    }
  }
  if (!pending_inc.new_up_client.empty()) {
    pending_inc.new_last_up_change = pending_inc.modified;
  }
  // track the last time any osd's in/out weight changed
  for (auto& i : pending_inc.new_weight) {
    if (i.first > osdmap.max_osd) {
      if (i.second) {
	// new osd is already marked in
	pending_inc.new_last_in_change = pending_inc.modified;
      }
    } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
      // existing osd marked in or out
      pending_inc.new_last_in_change = pending_inc.modified;
    }
  }

  // apply pending_inc to a scratch map so we can derive further changes
  // (pg_temp cleanup, pool full flags, release upgrades) from the state
  // the cluster will be in after this epoch commits
  {
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // clean pg_temp mappings
    OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);

    // clean inappropriate pg_upmap/pg_upmap_items (if any)
    osdmap.maybe_remove_pg_upmaps(cct, osdmap, tmp, &pending_inc);

    // update creating pgs first so that we can remove the created pgid and
    // process the pool flag removal below in the same osdmap epoch.
    auto pending_creatings = update_pending_pgs(pending_inc, tmp);
    bufferlist creatings_bl;
    encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);

    // remove any old (or incompat) POOL_CREATING flags
    for (auto& i : tmp.get_pools()) {
      if (tmp.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	// pre-nautilus OSDMaps shouldn't get this flag.
	if (pending_inc.new_pools.count(i.first)) {
	  pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
	}
      }
      if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
	  !pending_creatings.still_creating_pool(i.first)) {
	dout(10) << __func__ << " done creating pool " << i.first
		 << ", clearing CREATING flag" << dendl;
	if (pending_inc.new_pools.count(i.first) == 0) {
	  // load the original pool info before modifying flags
	  pending_inc.new_pools[i.first] = i.second;
	}
	pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
      }
    }

    // remove any legacy osdmap nearfull/full flags
    {
      if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
	dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
		 << dendl;
	remove_flag(CEPH_OSDMAP_NEARFULL);
	remove_flag(CEPH_OSDMAP_FULL);
      }
    }
    // collect which pools are currently affected by
    // the near/backfill/full osd(s),
    // and set per-pool near/backfill/full flag instead
    set<int64_t> full_pool_ids;
    set<int64_t> backfillfull_pool_ids;
    set<int64_t> nearfull_pool_ids;
    tmp.get_full_pools(cct,
		       &full_pool_ids,
		       &backfillfull_pool_ids,
		       &nearfull_pool_ids);
    if (full_pool_ids.empty() ||
	backfillfull_pool_ids.empty() ||
	nearfull_pool_ids.empty()) {
      // normal case - no nearfull, backfillfull or full osds
      // try cancel any improper nearfull/backfillfull/full pool
      // flags first
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
	    nearfull_pool_ids.empty()) {
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s nearfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    // load original pool info first!
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
	    backfillfull_pool_ids.empty()) {
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s backfillfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
	    full_pool_ids.empty()) {
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	    // set by EQUOTA, skipping
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s full flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
	}
      }
    }
    if (!full_pool_ids.empty()) {
      dout(10) << __func__ << " marking pool(s) " << full_pool_ids
	       << " as full" << dendl;
      for (auto &p: full_pool_ids) {
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
	  continue;
	}
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = tmp.pools[p];
	}
	// FULL supersedes the weaker flags
	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_FULL for pools which are no longer full too
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (full_pool_ids.count(p)) {
	  // skip pools we have just marked as full above
	  continue;
	}
	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
	    tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	  // don't touch if currently is not full
	  // or is running out of quota (and hence considered as full)
	  continue;
	}
	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		 << "'s full flag" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = pool.second;
	}
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
      }
    }
    if (!backfillfull_pool_ids.empty()) {
      for (auto &p: backfillfull_pool_ids) {
	if (full_pool_ids.count(p)) {
	  // skip pools we have already considered as full above
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	  // make sure FLAG_FULL is truly set, so we are safe not
	  // to set a extra (redundant) FLAG_BACKFILLFULL flag
	  ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	  // don't bother if pool is already marked as backfillfull
	  continue;
	}
	dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		 << "'s as backfillfull" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = tmp.pools[p];
	}
	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_BACKFILLFULL for pools
      // which are no longer backfillfull too
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	  // skip pools we have just marked as backfillfull/full above
	  continue;
	}
	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	  // and don't touch if currently is not backfillfull
	  continue;
	}
	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		 << "'s backfillfull flag" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = pool.second;
	}
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
      }
    }
    if (!nearfull_pool_ids.empty()) {
      for (auto &p: nearfull_pool_ids) {
	if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	  // make sure FLAG_FULL is truly set, so we are safe not
	  // to set a extra (redundant) FLAG_NEARFULL flag
	  ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	  // don't bother if pool is already marked as nearfull
	  continue;
	}
	dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		 << "'s as nearfull" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = tmp.pools[p];
	}
	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_NEARFULL for pools
      // which are no longer nearfull too
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (full_pool_ids.count(p) ||
	    backfillfull_pool_ids.count(p) ||
	    nearfull_pool_ids.count(p)) {
	  // skip pools we have just marked as
	  // nearfull/backfillfull/full above
	  continue;
	}
	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	  // and don't touch if currently is not nearfull
	  continue;
	}
	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		 << "'s nearfull flag" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = pool.second;
	}
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
    }

    // min_compat_client?
    if (tmp.require_min_compat_client == 0) {
      auto mv = tmp.get_min_compat_client();
      dout(1) << __func__ << " setting require_min_compat_client to currently "
	      << "required " << ceph_release_name(mv) << dendl;
      mon->clog->info() << "setting require_min_compat_client to currently "
			<< "required " << ceph_release_name(mv);
      pending_inc.new_require_min_compat_client = mv;
    }

    // upgrade to mimic?
    if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC &&
	tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
      dout(10) << __func__ << " first mimic+ epoch" << dendl;
      // record this epoch as the deletion for all legacy removed_snaps
      for (auto& p : tmp.get_pools()) {
	// update every pool
	if (pending_inc.new_pools.count(p.first) == 0) {
	  pending_inc.new_pools[p.first] = p.second;
	}
	auto& pi = pending_inc.new_pools[p.first];
	if (pi.snap_seq == 0) {
	  // no snaps on this pool
	  continue;
	}
	// pools predating the split of snap modes get classified here:
	// non-empty removed_snaps implies selfmanaged snaps were used
	if ((pi.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS |
			 pg_pool_t::FLAG_POOL_SNAPS)) == 0) {
	  if (!pi.removed_snaps.empty()) {
	    pi.flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	  } else {
	    pi.flags |= pg_pool_t::FLAG_POOL_SNAPS;
	  }
	}

	// Make all previously removed snaps appear to be removed in this
	// epoch.  this populates removed_snaps_queue.  The OSD will subtract
	// off its purged_snaps, as before, and this set will shrink over the
	// following epochs as the purged snaps are reported back through the
	// mgr.
	OSDMap::snap_interval_set_t removed;
	if (!p.second.removed_snaps.empty()) {
	  // different flavor of interval_set :(
	  for (auto q = p.second.removed_snaps.begin();
	       q != p.second.removed_snaps.end();
	       ++q) {
	    removed.insert(q.get_start(), q.get_len());
	  }
	} else {
	  // pool snaps: everything up to snap_seq that is not a live
	  // snapshot must have been removed
	  for (snapid_t s = 1; s <= pi.get_snap_seq(); s = s + 1) {
	    if (pi.snaps.count(s) == 0) {
	      removed.insert(s);
	    }
	  }
	}
	pending_inc.new_removed_snaps[p.first].union_of(removed);

	dout(10) << __func__ << " converting pool " << p.first
		 << " with " << p.second.removed_snaps.size()
		 << " legacy removed_snaps" << dendl;
	string k = make_snap_epoch_key(p.first, pending_inc.epoch);
	bufferlist v;
	encode(p.second.removed_snaps, v);
	t->put(OSD_SNAP_PREFIX, k, v);
	for (auto q = p.second.removed_snaps.begin();
	     q != p.second.removed_snaps.end();
	     ++q) {
	  bufferlist v;
	  string k = make_snap_key_value(p.first, q.get_start(),
					 q.get_len(), pending_inc.epoch, &v);
	  t->put(OSD_SNAP_PREFIX, k, v);
	}
      }
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS &&
	tmp.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
      dout(10) << __func__ << " first nautilus+ epoch" << dendl;
      // add creating flags?
      for (auto& i : tmp.get_pools()) {
	if (pending_creatings.still_creating_pool(i.first)) {
	  dout(10) << __func__ << " adding CREATING flag to pool " << i.first
		   << dendl;
	  if (pending_inc.new_pools.count(i.first) == 0) {
	    pending_inc.new_pools[i.first] = i.second;
	  }
	  pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
	}
      }
      // adjust blacklist items to all be TYPE_ANY
      for (auto& i : tmp.blacklist) {
	auto a = i.first;
	a.set_type(entity_addr_t::TYPE_ANY);
	pending_inc.new_blacklist[a] = i.second;
	pending_inc.old_blacklist.push_back(i.first);
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (auto i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features;

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features
    features = tmp.get_encoding_features();
    dout(10) << __func__ << " encoding full map with "
	     << ceph_release_name(tmp.require_osd_release)
	     << " features " << features << dendl;

    // the features should be a subset of the mon quorum's features!
    ceph_assert((features & ~mon->get_quorum_con_features()) == 0);

    bufferlist fullbl;
    encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn.  note that old monitors will
    // overwrite this.  new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
  bufferlist bl;
  encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // removed_snaps
  if (tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
    for (auto& i : pending_inc.new_removed_snaps) {
      {
	// all snaps removed this epoch
	string k = make_snap_epoch_key(i.first, pending_inc.epoch);
	bufferlist v;
	encode(i.second, v);
	t->put(OSD_SNAP_PREFIX, k, v);
      }
      // plus one key per removed interval
      for (auto q = i.second.begin();
	   q != i.second.end();
	   ++q) {
	bufferlist v;
	string k = make_snap_key_value(i.first, q.get_start(),
				       q.get_len(), pending_inc.epoch, &v);
	t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
    for (auto& i : pending_inc.new_purged_snaps) {
      for (auto q = i.second.begin();
	   q != i.second.end();
	   ++q) {
	bufferlist v;
	string k = make_snap_purged_key_value(i.first, q.get_start(),
					      q.get_len(), pending_inc.epoch,
					      &v);
	t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1526
1527 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1528 {
1529 bufferlist bl;
1530 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1531 if (r < 0)
1532 return r;
1533 try {
1534 auto p = bl.cbegin();
1535 decode(m, p);
1536 }
1537 catch (buffer::error& e) {
1538 if (err)
1539 *err << "osd." << osd << " metadata is corrupt";
1540 return -EIO;
1541 }
1542 return 0;
1543 }
1544
1545 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
1546 {
1547 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1548 if (osdmap.is_up(osd)) {
1549 map<string,string> meta;
1550 load_metadata(osd, meta, nullptr);
1551 auto p = meta.find(field);
1552 if (p == meta.end()) {
1553 (*out)["unknown"]++;
1554 } else {
1555 (*out)[p->second]++;
1556 }
1557 }
1558 }
1559 }
1560
1561 void OSDMonitor::count_metadata(const string& field, Formatter *f)
1562 {
1563 map<string,int> by_val;
1564 count_metadata(field, &by_val);
1565 f->open_object_section(field.c_str());
1566 for (auto& p : by_val) {
1567 f->dump_int(p.first.c_str(), p.second);
1568 }
1569 f->close_section();
1570 }
1571
1572 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1573 {
1574 map<string, string> metadata;
1575 int r = load_metadata(osd, metadata, nullptr);
1576 if (r < 0)
1577 return r;
1578
1579 auto it = metadata.find("osd_objectstore");
1580 if (it == metadata.end())
1581 return -ENOENT;
1582 *type = it->second;
1583 return 0;
1584 }
1585
1586 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1587 const pg_pool_t &pool,
1588 ostream *err)
1589 {
1590 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1591 // since filestore osds could always join the pool later
1592 set<int> checked_osds;
1593 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
1594 vector<int> up, acting;
1595 pg_t pgid(ps, pool_id);
1596 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1597 for (int osd : up) {
1598 if (checked_osds.find(osd) != checked_osds.end())
1599 continue;
1600 string objectstore_type;
1601 int r = get_osd_objectstore_type(osd, &objectstore_type);
1602 // allow with missing metadata, e.g. due to an osd never booting yet
1603 if (r < 0 || objectstore_type == "bluestore") {
1604 checked_osds.insert(osd);
1605 continue;
1606 }
1607 *err << "osd." << osd << " uses " << objectstore_type;
1608 return false;
1609 }
1610 }
1611 return true;
1612 }
1613
1614 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1615 {
1616 map<string,string> m;
1617 if (int r = load_metadata(osd, m, err))
1618 return r;
1619 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1620 f->dump_string(p->first.c_str(), p->second);
1621 return 0;
1622 }
1623
1624 void OSDMonitor::print_nodes(Formatter *f)
1625 {
1626 // group OSDs by their hosts
1627 map<string, list<int> > osds; // hostname => osd
1628 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1629 map<string, string> m;
1630 if (load_metadata(osd, m, NULL)) {
1631 continue;
1632 }
1633 map<string, string>::iterator hostname = m.find("hostname");
1634 if (hostname == m.end()) {
1635 // not likely though
1636 continue;
1637 }
1638 osds[hostname->second].push_back(osd);
1639 }
1640
1641 dump_services(f, osds, "osd");
1642 }
1643
1644 void OSDMonitor::share_map_with_random_osd()
1645 {
1646 if (osdmap.get_num_up_osds() == 0) {
1647 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1648 return;
1649 }
1650
1651 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1652 if (!s) {
1653 dout(10) << __func__ << " no up osd on our session map" << dendl;
1654 return;
1655 }
1656
1657 dout(10) << "committed, telling random " << s->name
1658 << " all about it" << dendl;
1659
1660 // get feature of the peer
1661 // use quorum_con_features, if it's an anonymous connection.
1662 uint64_t features = s->con_features ? s->con_features :
1663 mon->get_quorum_con_features();
1664 // whatev, they'll request more if they need it
1665 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
1666 s->con->send_message(m);
1667 // NOTE: do *not* record osd has up to this epoch (as we do
1668 // elsewhere) as they may still need to request older values.
1669 }
1670
version_t OSDMonitor::get_trim_to() const
{
  // Return the highest osdmap version we may trim up to, or 0 if
  // trimming is not currently possible.
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    // don't trim while pgs are still being created
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  // debug knob to hold off trimming entirely
  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	       " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << dendl;
    return 0;
  }

  {
    // start from the oldest epoch any osd might still need
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // an operator may force a higher trim point via config
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs epochs
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only worth trimming if the floor is above what is already trimmed
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1713
1714 epoch_t OSDMonitor::get_min_last_epoch_clean() const
1715 {
1716 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1717 // also scan osd epochs
1718 // don't trim past the oldest reported osd epoch
1719 for (auto& osd_epoch : osd_epochs) {
1720 if (osd_epoch.second < floor) {
1721 floor = osd_epoch.second;
1722 }
1723 }
1724 return floor;
1725 }
1726
1727 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1728 version_t first)
1729 {
1730 dout(10) << __func__ << " including full map for e " << first << dendl;
1731 bufferlist bl;
1732 get_version_full(first, bl);
1733 put_version_full(tx, first, bl);
1734
1735 if (has_osdmap_manifest &&
1736 first > osdmap_manifest.get_first_pinned()) {
1737 _prune_update_trimmed(tx, first);
1738 }
1739 }
1740
1741
1742 /* full osdmap prune
1743 *
1744 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
1745 */
1746
1747 void OSDMonitor::load_osdmap_manifest()
1748 {
1749 bool store_has_manifest =
1750 mon->store->exists(get_service_name(), "osdmap_manifest");
1751
1752 if (!store_has_manifest) {
1753 if (!has_osdmap_manifest) {
1754 return;
1755 }
1756
1757 dout(20) << __func__
1758 << " dropping osdmap manifest from memory." << dendl;
1759 osdmap_manifest = osdmap_manifest_t();
1760 has_osdmap_manifest = false;
1761 return;
1762 }
1763
1764 dout(20) << __func__
1765 << " osdmap manifest detected in store; reload." << dendl;
1766
1767 bufferlist manifest_bl;
1768 int r = get_value("osdmap_manifest", manifest_bl);
1769 if (r < 0) {
1770 derr << __func__ << " unable to read osdmap version manifest" << dendl;
1771 ceph_abort_msg("error reading manifest");
1772 }
1773 osdmap_manifest.decode(manifest_bl);
1774 has_osdmap_manifest = true;
1775
1776 dout(10) << __func__ << " store osdmap manifest pinned ("
1777 << osdmap_manifest.get_first_pinned()
1778 << " .. "
1779 << osdmap_manifest.get_last_pinned()
1780 << ")"
1781 << dendl;
1782 }
1783
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass is warranted, based on the
  // committed epoch range, the prune config options, and the current
  // pinning state of the on-disk manifest.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // NOTE: unsigned arithmetic; the first check below returns before
  // last_to_pin is used whenever last < min_osdmap_epochs.
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
		" is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous prune already covered everything prunable so far
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits in the prunable range
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
1843
1844 void OSDMonitor::_prune_update_trimmed(
1845 MonitorDBStore::TransactionRef tx,
1846 version_t first)
1847 {
1848 dout(10) << __func__
1849 << " first " << first
1850 << " last_pinned " << osdmap_manifest.get_last_pinned()
1851 << " last_pinned " << osdmap_manifest.get_last_pinned()
1852 << dendl;
1853
1854 osdmap_manifest_t manifest = osdmap_manifest;
1855
1856 if (!manifest.is_pinned(first)) {
1857 manifest.pin(first);
1858 }
1859
1860 set<version_t>::iterator p_end = manifest.pinned.find(first);
1861 set<version_t>::iterator p = manifest.pinned.begin();
1862 manifest.pinned.erase(p, p_end);
1863 ceph_assert(manifest.get_first_pinned() == first);
1864
1865 if (manifest.get_last_pinned() == first+1 ||
1866 manifest.pinned.size() == 1) {
1867 // we reached the end of the line, as pinned maps go; clean up our
1868 // manifest, and let `should_prune()` decide whether we should prune
1869 // again.
1870 tx->erase(get_service_name(), "osdmap_manifest");
1871 return;
1872 }
1873
1874 bufferlist bl;
1875 manifest.encode(bl);
1876 tx->put(get_service_name(), "osdmap_manifest", bl);
1877 }
1878
1879 void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
1880 {
1881 dout(1) << __func__ << dendl;
1882
1883 version_t pin_first;
1884
1885 // verify constrainsts on stable in-memory state
1886 if (!has_osdmap_manifest) {
1887 // we must have never pruned, OR if we pruned the state must no longer
1888 // be relevant (i.e., the state must have been removed alongside with
1889 // the trim that *must* have removed past the last pinned map in a
1890 // previous prune).
1891 ceph_assert(osdmap_manifest.pinned.empty());
1892 ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
1893 pin_first = get_first_committed();
1894
1895 } else {
1896 // we must have pruned in the past AND its state is still relevant
1897 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
1898 // and thus we still hold a manifest in the store).
1899 ceph_assert(!osdmap_manifest.pinned.empty());
1900 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
1901 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
1902
1903 dout(10) << __func__
1904 << " first_pinned " << osdmap_manifest.get_first_pinned()
1905 << " last_pinned " << osdmap_manifest.get_last_pinned()
1906 << dendl;
1907
1908 pin_first = osdmap_manifest.get_last_pinned();
1909 }
1910
1911 manifest.pin(pin_first);
1912 }
1913
1914 bool OSDMonitor::_prune_sanitize_options() const
1915 {
1916 uint64_t prune_interval =
1917 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
1918 uint64_t prune_min =
1919 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
1920 uint64_t txsize =
1921 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
1922
1923 bool r = true;
1924
1925 if (prune_interval == 0) {
1926 derr << __func__
1927 << " prune is enabled BUT prune interval is zero; abort."
1928 << dendl;
1929 r = false;
1930 } else if (prune_interval == 1) {
1931 derr << __func__
1932 << " prune interval is equal to one, which essentially means"
1933 " no pruning; abort."
1934 << dendl;
1935 r = false;
1936 }
1937 if (prune_min == 0) {
1938 derr << __func__
1939 << " prune is enabled BUT prune min is zero; abort."
1940 << dendl;
1941 r = false;
1942 }
1943 if (prune_interval > prune_min) {
1944 derr << __func__
1945 << " impossible to ascertain proper prune interval because"
1946 << " it is greater than the minimum prune epochs"
1947 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
1948 << dendl;
1949 r = false;
1950 }
1951
1952 if (txsize < prune_interval - 1) {
1953 derr << __func__
1954 << "'mon_osdmap_full_prune_txsize' (" << txsize
1955 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
1956 << "); abort." << dendl;
1957 r = false;
1958 }
1959 return r;
1960 }
1961
1962 bool OSDMonitor::is_prune_enabled() const {
1963 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
1964 }
1965
1966 bool OSDMonitor::is_prune_supported() const {
1967 return mon->get_required_mon_features().contains_any(
1968 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
1969 }
1970
1971 /** do_prune
1972 *
1973 * @returns true if has side-effects; false otherwise.
1974 */
1975 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
1976 {
1977 bool enabled = is_prune_enabled();
1978
1979 dout(1) << __func__ << " osdmap full prune "
1980 << ( enabled ? "enabled" : "disabled")
1981 << dendl;
1982
1983 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
1984 return false;
1985 }
1986
1987 // we are beyond the minimum prune versions, we need to remove maps because
1988 // otherwise the store will grow unbounded and we may end up having issues
1989 // with available disk space or store hangs.
1990
1991 // we will not pin all versions. We will leave a buffer number of versions.
1992 // this allows us the monitor to trim maps without caring too much about
1993 // pinned maps, and then allow us to use another ceph-mon without these
1994 // capabilities, without having to repair the store.
1995
1996 osdmap_manifest_t manifest = osdmap_manifest;
1997
1998 version_t first = get_first_committed();
1999 version_t last = get_last_committed();
2000
2001 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2002 version_t last_pinned = manifest.get_last_pinned();
2003 uint64_t prune_interval =
2004 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2005 uint64_t txsize =
2006 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2007
2008 prune_init(manifest);
2009
2010 // we need to get rid of some osdmaps
2011
2012 dout(5) << __func__
2013 << " lc (" << first << " .. " << last << ")"
2014 << " last_pinned " << last_pinned
2015 << " interval " << prune_interval
2016 << " last_to_pin " << last_to_pin
2017 << dendl;
2018
2019 // We will be erasing maps as we go.
2020 //
2021 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2022 //
2023 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2024 // we stop pruning. We could prune the maps between `next_to_pin` and
2025 // `last_to_pin`, but by not doing it we end up with neater pruned
2026 // intervals, aligned with `prune_interval`. Besides, this should not be a
2027 // problem as long as `prune_interval` is set to a sane value, instead of
2028 // hundreds or thousands of maps.
2029
2030 auto map_exists = [this](version_t v) {
2031 string k = mon->store->combine_strings("full", v);
2032 return mon->store->exists(get_service_name(), k);
2033 };
2034
2035 // 'interval' represents the number of maps from the last pinned
2036 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2037 // version 11 next; all intermediate versions will be removed.
2038 //
2039 // 'txsize' represents the maximum number of versions we'll be removing in
2040 // this iteration. If 'txsize' is large enough to perform multiple passes
2041 // pinning and removing maps, we will do so; if not, we'll do at least one
2042 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2043 // ensure that we never go *over* the maximum.
2044
2045 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2046 uint64_t removal_interval = prune_interval - 1;
2047
2048 if (txsize < removal_interval) {
2049 dout(5) << __func__
2050 << " setting txsize to removal interval size ("
2051 << removal_interval << " versions"
2052 << dendl;
2053 txsize = removal_interval;
2054 }
2055 ceph_assert(removal_interval > 0);
2056
2057 uint64_t num_pruned = 0;
2058 while (num_pruned + removal_interval <= txsize) {
2059 last_pinned = manifest.get_last_pinned();
2060
2061 if (last_pinned + prune_interval > last_to_pin) {
2062 break;
2063 }
2064 ceph_assert(last_pinned < last_to_pin);
2065
2066 version_t next_pinned = last_pinned + prune_interval;
2067 ceph_assert(next_pinned <= last_to_pin);
2068 manifest.pin(next_pinned);
2069
2070 dout(20) << __func__
2071 << " last_pinned " << last_pinned
2072 << " next_pinned " << next_pinned
2073 << " num_pruned " << num_pruned
2074 << " removal interval (" << (last_pinned+1)
2075 << ".." << (next_pinned-1) << ")"
2076 << " txsize " << txsize << dendl;
2077
2078 ceph_assert(map_exists(last_pinned));
2079 ceph_assert(map_exists(next_pinned));
2080
2081 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2082 ceph_assert(!manifest.is_pinned(v));
2083
2084 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2085 string full_key = mon->store->combine_strings("full", v);
2086 tx->erase(get_service_name(), full_key);
2087 ++num_pruned;
2088 }
2089 }
2090
2091 ceph_assert(num_pruned > 0);
2092
2093 bufferlist bl;
2094 manifest.encode(bl);
2095 tx->put(get_service_name(), "osdmap_manifest", bl);
2096
2097 return true;
2098 }
2099
2100
2101 // -------------
2102
2103 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2104 {
2105 op->mark_osdmon_event(__func__);
2106 Message *m = op->get_req();
2107 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2108
2109 switch (m->get_type()) {
2110 // READs
2111 case MSG_MON_COMMAND:
2112 try {
2113 return preprocess_command(op);
2114 } catch (const bad_cmd_get& e) {
2115 bufferlist bl;
2116 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2117 return true;
2118 }
2119 case CEPH_MSG_MON_GET_OSDMAP:
2120 return preprocess_get_osdmap(op);
2121
2122 // damp updates
2123 case MSG_OSD_MARK_ME_DOWN:
2124 return preprocess_mark_me_down(op);
2125 case MSG_OSD_FULL:
2126 return preprocess_full(op);
2127 case MSG_OSD_FAILURE:
2128 return preprocess_failure(op);
2129 case MSG_OSD_BOOT:
2130 return preprocess_boot(op);
2131 case MSG_OSD_ALIVE:
2132 return preprocess_alive(op);
2133 case MSG_OSD_PG_CREATED:
2134 return preprocess_pg_created(op);
2135 case MSG_OSD_PG_READY_TO_MERGE:
2136 return preprocess_pg_ready_to_merge(op);
2137 case MSG_OSD_PGTEMP:
2138 return preprocess_pgtemp(op);
2139 case MSG_OSD_BEACON:
2140 return preprocess_beacon(op);
2141
2142 case CEPH_MSG_POOLOP:
2143 return preprocess_pool_op(op);
2144
2145 case MSG_REMOVE_SNAPS:
2146 return preprocess_remove_snaps(op);
2147
2148 default:
2149 ceph_abort();
2150 return true;
2151 }
2152 }
2153
2154 bool OSDMonitor::prepare_update(MonOpRequestRef op)
2155 {
2156 op->mark_osdmon_event(__func__);
2157 Message *m = op->get_req();
2158 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
2159
2160 switch (m->get_type()) {
2161 // damp updates
2162 case MSG_OSD_MARK_ME_DOWN:
2163 return prepare_mark_me_down(op);
2164 case MSG_OSD_FULL:
2165 return prepare_full(op);
2166 case MSG_OSD_FAILURE:
2167 return prepare_failure(op);
2168 case MSG_OSD_BOOT:
2169 return prepare_boot(op);
2170 case MSG_OSD_ALIVE:
2171 return prepare_alive(op);
2172 case MSG_OSD_PG_CREATED:
2173 return prepare_pg_created(op);
2174 case MSG_OSD_PGTEMP:
2175 return prepare_pgtemp(op);
2176 case MSG_OSD_PG_READY_TO_MERGE:
2177 return prepare_pg_ready_to_merge(op);
2178 case MSG_OSD_BEACON:
2179 return prepare_beacon(op);
2180
2181 case MSG_MON_COMMAND:
2182 try {
2183 return prepare_command(op);
2184 } catch (const bad_cmd_get& e) {
2185 bufferlist bl;
2186 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2187 return true;
2188 }
2189
2190 case CEPH_MSG_POOLOP:
2191 return prepare_pool_op(op);
2192
2193 case MSG_REMOVE_SNAPS:
2194 return prepare_remove_snaps(op);
2195
2196
2197 default:
2198 ceph_abort();
2199 }
2200
2201 return false;
2202 }
2203
2204 bool OSDMonitor::should_propose(double& delay)
2205 {
2206 dout(10) << "should_propose" << dendl;
2207
2208 // if full map, propose immediately! any subsequent changes will be clobbered.
2209 if (pending_inc.fullmap.length())
2210 return true;
2211
2212 // adjust osd weights?
2213 if (!osd_weight.empty() &&
2214 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2215 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2216 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2217 delay = 0.0;
2218 osd_weight.clear();
2219 return true;
2220 }
2221
2222 return PaxosService::should_propose(delay);
2223 }
2224
2225
2226
2227 // ---------------------------
2228 // READs
2229
2230 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
2231 {
2232 op->mark_osdmon_event(__func__);
2233 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
2234
2235 uint64_t features = mon->get_quorum_con_features();
2236 if (op->get_session() && op->get_session()->con_features)
2237 features = op->get_session()->con_features;
2238
2239 dout(10) << __func__ << " " << *m << dendl;
2240 MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
2241 epoch_t first = get_first_committed();
2242 epoch_t last = osdmap.get_epoch();
2243 int max = g_conf()->osd_map_message_max;
2244 ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
2245 for (epoch_t e = std::max(first, m->get_full_first());
2246 e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
2247 ++e, --max) {
2248 bufferlist& bl = reply->maps[e];
2249 int r = get_version_full(e, features, bl);
2250 ceph_assert(r >= 0);
2251 max_bytes -= bl.length();
2252 }
2253 for (epoch_t e = std::max(first, m->get_inc_first());
2254 e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
2255 ++e, --max) {
2256 bufferlist& bl = reply->incremental_maps[e];
2257 int r = get_version(e, features, bl);
2258 ceph_assert(r >= 0);
2259 max_bytes -= bl.length();
2260 }
2261 reply->oldest_map = first;
2262 reply->newest_map = last;
2263 mon->send_reply(op, reply);
2264 return true;
2265 }
2266
2267
2268 // ---------------------------
2269 // UPDATEs
2270
2271 // failure --
2272
2273 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2274 // check permissions
2275 MonSession *session = op->get_session();
2276 if (!session)
2277 return true;
2278 if (!session->is_capable("osd", MON_CAP_X)) {
2279 dout(0) << "got MOSDFailure from entity with insufficient caps "
2280 << session->caps << dendl;
2281 return true;
2282 }
2283 if (fsid != mon->monmap->fsid) {
2284 dout(0) << "check_source: on fsid " << fsid
2285 << " != " << mon->monmap->fsid << dendl;
2286 return true;
2287 }
2288 return false;
2289 }
2290
2291
2292 bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
2293 {
2294 op->mark_osdmon_event(__func__);
2295 MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
2296 // who is target_osd
2297 int badboy = m->get_target_osd();
2298
2299 // check permissions
2300 if (check_source(op, m->fsid))
2301 goto didit;
2302
2303 // first, verify the reporting host is valid
2304 if (m->get_orig_source().is_osd()) {
2305 int from = m->get_orig_source().num();
2306 if (!osdmap.exists(from) ||
2307 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
2308 (osdmap.is_down(from) && m->if_osd_failed())) {
2309 dout(5) << "preprocess_failure from dead osd." << from
2310 << ", ignoring" << dendl;
2311 send_incremental(op, m->get_epoch()+1);
2312 goto didit;
2313 }
2314 }
2315
2316
2317 // weird?
2318 if (osdmap.is_down(badboy)) {
2319 dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
2320 << " " << m->get_target_addrs()
2321 << ", from " << m->get_orig_source() << dendl;
2322 if (m->get_epoch() < osdmap.get_epoch())
2323 send_incremental(op, m->get_epoch()+1);
2324 goto didit;
2325 }
2326 if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
2327 dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
2328 << " " << m->get_target_addrs()
2329 << " != map's " << osdmap.get_addrs(badboy)
2330 << ", from " << m->get_orig_source() << dendl;
2331 if (m->get_epoch() < osdmap.get_epoch())
2332 send_incremental(op, m->get_epoch()+1);
2333 goto didit;
2334 }
2335
2336 // already reported?
2337 if (osdmap.is_down(badboy) ||
2338 osdmap.get_up_from(badboy) > m->get_epoch()) {
2339 dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
2340 << " " << m->get_target_addrs()
2341 << ", from " << m->get_orig_source() << dendl;
2342 if (m->get_epoch() < osdmap.get_epoch())
2343 send_incremental(op, m->get_epoch()+1);
2344 goto didit;
2345 }
2346
2347 if (!can_mark_down(badboy)) {
2348 dout(5) << "preprocess_failure ignoring report of osd."
2349 << m->get_target_osd() << " " << m->get_target_addrs()
2350 << " from " << m->get_orig_source() << dendl;
2351 goto didit;
2352 }
2353
2354 dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
2355 << " " << m->get_target_addrs()
2356 << ", from " << m->get_orig_source() << dendl;
2357 return false;
2358
2359 didit:
2360 mon->no_reply(op);
2361 return true;
2362 }
2363
2364 class C_AckMarkedDown : public C_MonOp {
2365 OSDMonitor *osdmon;
2366 public:
2367 C_AckMarkedDown(
2368 OSDMonitor *osdmon,
2369 MonOpRequestRef op)
2370 : C_MonOp(op), osdmon(osdmon) {}
2371
2372 void _finish(int) override {
2373 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2374 osdmon->mon->send_reply(
2375 op,
2376 new MOSDMarkMeDown(
2377 m->fsid,
2378 m->target_osd,
2379 m->target_addrs,
2380 m->get_epoch(),
2381 false)); // ACK itself does not request an ack
2382 }
2383 ~C_AckMarkedDown() override {
2384 }
2385 };
2386
// Filter an osd's own mark-me-down request.  Returns false to pass a valid
// request on to prepare_mark_me_down(); returns true when the request is
// handled (ignored) here, in which case an ack is still sent if one was
// requested, so the osd does not block waiting for it.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the target must exist, still be up, and match the addrs in our map;
  // otherwise the request is stale -- send the osd newer maps and drop it.
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even when ignoring, honour the ack request so the osd can proceed
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2425
2426 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2427 {
2428 op->mark_osdmon_event(__func__);
2429 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2430 int target_osd = m->target_osd;
2431
2432 ceph_assert(osdmap.is_up(target_osd));
2433 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2434
2435 mon->clog->info() << "osd." << target_osd << " marked itself down";
2436 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2437 if (m->request_ack)
2438 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2439 return true;
2440 }
2441
2442 bool OSDMonitor::can_mark_down(int i)
2443 {
2444 if (osdmap.is_nodown(i)) {
2445 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
2446 << "will not mark it down" << dendl;
2447 return false;
2448 }
2449
2450 int num_osds = osdmap.get_num_osds();
2451 if (num_osds == 0) {
2452 dout(5) << __func__ << " no osds" << dendl;
2453 return false;
2454 }
2455 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
2456 float up_ratio = (float)up / (float)num_osds;
2457 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
2458 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
2459 << g_conf()->mon_osd_min_up_ratio
2460 << ", will not mark osd." << i << " down" << dendl;
2461 return false;
2462 }
2463 return true;
2464 }
2465
2466 bool OSDMonitor::can_mark_up(int i)
2467 {
2468 if (osdmap.is_noup(i)) {
2469 dout(5) << __func__ << " osd." << i << " is marked as noup, "
2470 << "will not mark it up" << dendl;
2471 return false;
2472 }
2473
2474 return true;
2475 }
2476
2477 /**
2478 * @note the parameter @p i apparently only exists here so we can output the
2479 * osd's id on messages.
2480 */
2481 bool OSDMonitor::can_mark_out(int i)
2482 {
2483 if (osdmap.is_noout(i)) {
2484 dout(5) << __func__ << " osd." << i << " is marked as noout, "
2485 << "will not mark it out" << dendl;
2486 return false;
2487 }
2488
2489 int num_osds = osdmap.get_num_osds();
2490 if (num_osds == 0) {
2491 dout(5) << __func__ << " no osds" << dendl;
2492 return false;
2493 }
2494 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
2495 float in_ratio = (float)in / (float)num_osds;
2496 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
2497 if (i >= 0)
2498 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2499 << g_conf()->mon_osd_min_in_ratio
2500 << ", will not mark osd." << i << " out" << dendl;
2501 else
2502 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2503 << g_conf()->mon_osd_min_in_ratio
2504 << ", will not mark osds out" << dendl;
2505 return false;
2506 }
2507
2508 return true;
2509 }
2510
2511 bool OSDMonitor::can_mark_in(int i)
2512 {
2513 if (osdmap.is_noin(i)) {
2514 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2515 << "will not mark it in" << dendl;
2516 return false;
2517 }
2518
2519 return true;
2520 }
2521
2522 bool OSDMonitor::check_failures(utime_t now)
2523 {
2524 bool found_failure = false;
2525 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2526 p != failure_info.end();
2527 ++p) {
2528 if (can_mark_down(p->first)) {
2529 found_failure |= check_failure(now, p->first, p->second);
2530 }
2531 }
2532 return found_failure;
2533 }
2534
// Decide whether the accumulated failure reports in 'fi' justify marking
// 'target_osd' down in the pending incremental.  The base heartbeat grace
// is optionally stretched by the laggy history of both the target and its
// reporters.  Returns true when the osd is (or was already) queued to be
// marked down.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    double halflife = (double)g_conf()->mon_osd_laggy_halflife;
    // negative decay constant: exp(failed_for * decay_k) halves every
    // 'halflife' seconds, so older laggy history counts for less.
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
             << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  ceph_assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf()->mon_osd_adjust_heartbeat_grace) {
      // accumulate the reporters' own laggy contributions; averaged below
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
           << fi.reporters.size() << " reporters, "
           << grace << " grace (" << orig_grace << " + " << my_grace
           << " + " << peer_grace << "), max_failed_since " << max_failed_since
           << dendl;

  // mark down only when the osd has been failed long enough AND enough
  // distinct failure-domain subtrees have reported it.
  if (failed_for >= grace &&
      reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
            << " down" << dendl;
    // new_state is an xor mask; CEPH_OSD_UP on an up osd flips it down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
                      << osdmap.crush->get_full_location_ordered_string(
                        target_osd)
                      << ") ("
                      << (int)reporters_by_subtree.size()
                      << " reporters from different "
                      << reporter_subtree_level << " after "
                      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
2622
2623 void OSDMonitor::force_failure(int target_osd, int by)
2624 {
2625 // already pending failure?
2626 if (pending_inc.new_state.count(target_osd) &&
2627 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2628 dout(10) << " already pending failure" << dendl;
2629 return;
2630 }
2631
2632 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2633 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2634
2635 mon->clog->info() << "osd." << target_osd << " failed ("
2636 << osdmap.crush->get_full_location_ordered_string(target_osd)
2637 << ") (connection refused reported by osd." << by << ")";
2638 return;
2639 }
2640
// Record or cancel a failure report against an osd.  Failure reports are
// aggregated per target in 'failure_info'; once enough distinct reporters
// accumulate (see check_failure()) the target is queued to be marked down.
// Returns true when a proposal should be triggered.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already vetted the target
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // immediate failures (e.g. connection refused) skip the reporter
      // threshold and force the mark-down right away
      mon->clog->debug() << "osd." << m->get_target_osd()
                         << " reported immediately failed by "
                         << m->get_orig_source();
      force_failure(target_osd, reporter);
      mon->no_reply(op);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // add_report() may return a superseded op from the same reporter,
    // which we must release so it does not wait forever for a reply
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
                       << " failure report canceled by "
                       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      // release the op associated with the canceled report, if any
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2707
// Sweep 'failure_info' after a map change: for every tracked osd that is now
// down, drop its accumulated failure reports and answer the reporters with
// the latest map so they learn the outcome.  Entries for osds still up are
// kept untouched.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // post-increment inside erase() keeps the iterator valid across the
      // removal of the current element
      failure_info.erase(p++);

      while (!ls.empty()) {
        MonOpRequestRef o = ls.front();
        if (o) {
          o->mark_event(__func__);
          MOSDFailure *m = o->get_req<MOSDFailure>();
          // reply with maps from the reporter's epoch onward, then release
          // the op (no direct reply message is owed)
          send_latest(o, m->get_epoch());
          mon->no_reply(o);
        }
        ls.pop_front();
      }
    }
  }
}
2733
2734 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2735 {
2736 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2737
2738 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2739 p != failure_info.end();
2740 ++p) {
2741 p->second.take_report_messages(ls);
2742 }
2743 failure_info.clear();
2744 }
2745
2746
2747 // boot --
2748
// Validate an osd boot message before it reaches prepare_boot().  Returns
// true when the boot was handled (ignored, duplicate-acked, or answered
// with maps) here; false passes it on to the prepare path.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // the osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      // report every missing feature in one semicolon-joined message
      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
                        << m->get_orig_source_inst()
                        << " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure upgrades stop at nautilus
  if (HAVE_FEATURE(m->osd_features, SERVER_O) &&
      osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
    mon->clog->info() << "disallowing boot of post-nautilus OSD "
                      << m->get_orig_source_inst()
                      << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?  (same osd id, up, and matching client/cluster addrs)
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a non-zero uuid mismatch means a different osd now claims this id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's most recent up interval:
  // just send it newer maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2871
// Handle an OSD boot request that preprocess_boot() decided needs a map
// change.  Depending on the current state this either (a) marks a still-up
// previous instance down first and retries the boot, (b) waits on an
// already-pending mark-up, or (c) marks the osd up and records its
// addresses, weight, uuid, metadata, clean interval and laggy statistics
// in pending_inc.  Returns false only when the osd id is out of range.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective old state = committed state XOR any pending state flips
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down.  new_state is an XOR mask, so setting
      // the UP bit here flips the (currently up) osd to down.
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot once the mark-down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from); // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: decay them on a clean (re)boot
    // (boot_epoch == 0), otherwise fold the observed down interval in
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3026
3027 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3028 {
3029 op->mark_osdmon_event(__func__);
3030 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
3031 dout(7) << "_booted " << m->get_orig_source_inst()
3032 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3033
3034 if (logit) {
3035 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3036 << " boot";
3037 }
3038
3039 send_latest(op, m->sb.current_epoch+1);
3040 }
3041
3042
3043 // -------------
3044 // full
3045
3046 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3047 {
3048 op->mark_osdmon_event(__func__);
3049 MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
3050 int from = m->get_orig_source().num();
3051 set<string> state;
3052 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3053
3054 // check permissions, ignore if failed
3055 MonSession *session = op->get_session();
3056 if (!session)
3057 goto ignore;
3058 if (!session->is_capable("osd", MON_CAP_X)) {
3059 dout(0) << "MOSDFull from entity with insufficient privileges:"
3060 << session->caps << dendl;
3061 goto ignore;
3062 }
3063
3064 // ignore a full message from the osd instance that already went down
3065 if (!osdmap.exists(from)) {
3066 dout(7) << __func__ << " ignoring full message from nonexistent "
3067 << m->get_orig_source_inst() << dendl;
3068 goto ignore;
3069 }
3070 if ((!osdmap.is_up(from) &&
3071 osdmap.get_most_recent_addrs(from).legacy_equals(
3072 m->get_orig_source_addrs())) ||
3073 (osdmap.is_up(from) &&
3074 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3075 dout(7) << __func__ << " ignoring full message from down "
3076 << m->get_orig_source_inst() << dendl;
3077 goto ignore;
3078 }
3079
3080 OSDMap::calc_state_set(osdmap.get_state(from), state);
3081
3082 if ((osdmap.get_state(from) & mask) == m->state) {
3083 dout(7) << __func__ << " state already " << state << " for osd." << from
3084 << " " << m->get_orig_source_inst() << dendl;
3085 _reply_map(op, m->version);
3086 goto ignore;
3087 }
3088
3089 dout(10) << __func__ << " want state " << state << " for osd." << from
3090 << " " << m->get_orig_source_inst() << dendl;
3091 return false;
3092
3093 ignore:
3094 return true;
3095 }
3096
// Commit a full/nearfull/backfillfull state change reported by an OSD.
// pending_inc.new_state holds an XOR mask that is applied to the
// committed state, so we compute the delta between the effective
// (committed + pending) state and what the OSD wants.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask; // safety first

  // effective current state = committed state XOR any pending flips
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending fullness flips; we recompute them just below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed and wanted bits = the bits that must flip
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3134
3135 // -------------
3136 // alive
3137
3138 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3139 {
3140 op->mark_osdmon_event(__func__);
3141 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3142 int from = m->get_orig_source().num();
3143
3144 // check permissions, ignore if failed
3145 MonSession *session = op->get_session();
3146 if (!session)
3147 goto ignore;
3148 if (!session->is_capable("osd", MON_CAP_X)) {
3149 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3150 << session->caps << dendl;
3151 goto ignore;
3152 }
3153
3154 if (!osdmap.is_up(from) ||
3155 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3156 dout(7) << "preprocess_alive ignoring alive message from down "
3157 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3158 << dendl;
3159 goto ignore;
3160 }
3161
3162 if (osdmap.get_up_thru(from) >= m->want) {
3163 // yup.
3164 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3165 _reply_map(op, m->version);
3166 return true;
3167 }
3168
3169 dout(10) << "preprocess_alive want up_thru " << m->want
3170 << " from " << m->get_orig_source_inst() << dendl;
3171 return false;
3172
3173 ignore:
3174 return true;
3175 }
3176
3177 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3178 {
3179 op->mark_osdmon_event(__func__);
3180 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3181 int from = m->get_orig_source().num();
3182
3183 if (0) { // we probably don't care much about these
3184 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3185 }
3186
3187 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3188 << " from " << m->get_orig_source_inst() << dendl;
3189
3190 update_up_thru(from, m->version); // set to the latest map the OSD has
3191 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3192 return true;
3193 }
3194
3195 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3196 {
3197 op->mark_osdmon_event(__func__);
3198 dout(7) << "_reply_map " << e
3199 << " from " << op->get_req()->get_orig_source_inst()
3200 << dendl;
3201 send_latest(op, e);
3202 }
3203
3204 // pg_created
3205 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3206 {
3207 op->mark_osdmon_event(__func__);
3208 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3209 dout(10) << __func__ << " " << *m << dendl;
3210 auto session = op->get_session();
3211 mon->no_reply(op);
3212 if (!session) {
3213 dout(10) << __func__ << ": no monitor session!" << dendl;
3214 return true;
3215 }
3216 if (!session->is_capable("osd", MON_CAP_X)) {
3217 derr << __func__ << " received from entity "
3218 << "with insufficient privileges " << session->caps << dendl;
3219 return true;
3220 }
3221 // always forward the "created!" to the leader
3222 return false;
3223 }
3224
3225 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3226 {
3227 op->mark_osdmon_event(__func__);
3228 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3229 dout(10) << __func__ << " " << *m << dendl;
3230 auto src = m->get_orig_source();
3231 auto from = src.num();
3232 if (!src.is_osd() ||
3233 !mon->osdmon()->osdmap.is_up(from) ||
3234 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3235 m->get_orig_source_addrs())) {
3236 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3237 return false;
3238 }
3239 pending_created_pgs.push_back(m->pgid);
3240 return true;
3241 }
3242
3243 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3244 {
3245 op->mark_osdmon_event(__func__);
3246 auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
3247 dout(10) << __func__ << " " << *m << dendl;
3248 const pg_pool_t *pi;
3249 auto session = op->get_session();
3250 if (!session) {
3251 dout(10) << __func__ << ": no monitor session!" << dendl;
3252 goto ignore;
3253 }
3254 if (!session->is_capable("osd", MON_CAP_X)) {
3255 derr << __func__ << " received from entity "
3256 << "with insufficient privileges " << session->caps << dendl;
3257 goto ignore;
3258 }
3259 pi = osdmap.get_pg_pool(m->pgid.pool());
3260 if (!pi) {
3261 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3262 goto ignore;
3263 }
3264 if (pi->get_pg_num() <= m->pgid.ps()) {
3265 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3266 goto ignore;
3267 }
3268 if (pi->get_pg_num() != m->pgid.ps() + 1) {
3269 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
3270 goto ignore;
3271 }
3272 if (pi->get_pg_num_pending() > m->pgid.ps()) {
3273 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
3274 goto ignore;
3275 }
3276 return false;
3277
3278 ignore:
3279 mon->no_reply(op);
3280 return true;
3281 }
3282
// Apply a pg merge readiness report.  If the pool's pg_num[_pending]
// changed since the osd sent the message, retry after the next proposal;
// otherwise either complete the merge (dec_pg_num) or back it off.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  // work on the pending pool if one exists, else the committed pool
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // optionally (for testing) bounce pg_num back up to cancel the merge
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3340
3341
3342 // -------------
3343 // pg_temp changes
3344
3345 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
3346 {
3347 MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
3348 dout(10) << "preprocess_pgtemp " << *m << dendl;
3349 mempool::osdmap::vector<int> empty;
3350 int from = m->get_orig_source().num();
3351 size_t ignore_cnt = 0;
3352
3353 // check caps
3354 MonSession *session = op->get_session();
3355 if (!session)
3356 goto ignore;
3357 if (!session->is_capable("osd", MON_CAP_X)) {
3358 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3359 << session->caps << dendl;
3360 goto ignore;
3361 }
3362
3363 if (!osdmap.is_up(from) ||
3364 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3365 dout(7) << "ignoring pgtemp message from down "
3366 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3367 << dendl;
3368 goto ignore;
3369 }
3370
3371 if (m->forced) {
3372 return false;
3373 }
3374
3375 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
3376 dout(20) << " " << p->first
3377 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
3378 << " -> " << p->second << dendl;
3379
3380 // does the pool exist?
3381 if (!osdmap.have_pg_pool(p->first.pool())) {
3382 /*
3383 * 1. If the osdmap does not have the pool, it means the pool has been
3384 * removed in-between the osd sending this message and us handling it.
3385 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
3386 * not exist in the pending either, as the osds would not send a
3387 * message about a pool they know nothing about (yet).
3388 * 3. However, if the pool does exist in the pending, then it must be a
3389 * new pool, and not relevant to this message (see 1).
3390 */
3391 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
3392 << ": pool has been removed" << dendl;
3393 ignore_cnt++;
3394 continue;
3395 }
3396
3397 int acting_primary = -1;
3398 osdmap.pg_to_up_acting_osds(
3399 p->first, nullptr, nullptr, nullptr, &acting_primary);
3400 if (acting_primary != from) {
3401 /* If the source isn't the primary based on the current osdmap, we know
3402 * that the interval changed and that we can discard this message.
3403 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
3404 * which of two pg temp mappings on the same pg is more recent.
3405 */
3406 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
3407 << ": primary has changed" << dendl;
3408 ignore_cnt++;
3409 continue;
3410 }
3411
3412 // removal?
3413 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
3414 osdmap.primary_temp->count(p->first)))
3415 return false;
3416 // change?
3417 // NOTE: we assume that this will clear pg_primary, so consider
3418 // an existing pg_primary field to imply a change
3419 if (p->second.size() &&
3420 (osdmap.pg_temp->count(p->first) == 0 ||
3421 osdmap.pg_temp->get(p->first) != p->second ||
3422 osdmap.primary_temp->count(p->first)))
3423 return false;
3424 }
3425
3426 // should we ignore all the pgs?
3427 if (ignore_cnt == m->pg_temp.size())
3428 goto ignore;
3429
3430 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
3431 _reply_map(op, m->map_epoch);
3432 return true;
3433
3434 ignore:
3435 return true;
3436 }
3437
3438 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
3439 {
3440 epoch_t old_up_thru = osdmap.get_up_thru(from);
3441 auto ut = pending_inc.new_up_thru.find(from);
3442 if (ut != pending_inc.new_up_thru.end()) {
3443 old_up_thru = ut->second;
3444 }
3445 if (up_thru > old_up_thru) {
3446 // set up_thru too, so the osd doesn't have to ask again
3447 pending_inc.new_up_thru[from] = up_thru;
3448 }
3449 }
3450
// Record the pg_temp mappings from an MOSDPGTemp message in pending_inc,
// skipping pools that are gone or pending removal, and bump the osd's
// up_thru so it doesn't need a separate alive round-trip.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
3486
3487
3488 // ---
3489
// Check whether every snap in an MRemoveSnaps message is already
// reflected in the committed osdmap; if so the message can be dropped
// without a proposal.  Returns false when prepare_remove_snaps() must
// update the pending map.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap newer than snap_seq, or not yet in removed_snaps, still
      // needs to be recorded by prepare_remove_snaps()
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3532
// Fold the reported removed snaps into the pending pool state: for every
// snap not yet recorded, add it to removed_snaps, advance snap_seq if
// needed, bump snap_epoch, and record it in new_removed_snaps.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // only act if the snap is recorded neither in the committed pool
      // nor in the already-pending copy of the pool
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	pending_inc.new_removed_snaps[p->first].insert(*q);
      }
    }
  }
  return true;
}
3572
3573 // osd beacon
3574 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3575 {
3576 op->mark_osdmon_event(__func__);
3577 // check caps
3578 auto session = op->get_session();
3579 mon->no_reply(op);
3580 if (!session) {
3581 dout(10) << __func__ << " no monitor session!" << dendl;
3582 return true;
3583 }
3584 if (!session->is_capable("osd", MON_CAP_X)) {
3585 derr << __func__ << " received from entity "
3586 << "with insufficient privileges " << session->caps << dendl;
3587 return true;
3588 }
3589 // Always forward the beacon to the leader, even if they are the same as
3590 // the old one. The leader will mark as down osds that haven't sent
3591 // beacon for a few minutes.
3592 return false;
3593 }
3594
3595 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
3596 {
3597 op->mark_osdmon_event(__func__);
3598 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
3599 const auto src = beacon->get_orig_source();
3600 dout(10) << __func__ << " " << *beacon
3601 << " from " << src << dendl;
3602 int from = src.num();
3603
3604 if (!src.is_osd() ||
3605 !osdmap.is_up(from) ||
3606 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
3607 if (src.is_osd() && !osdmap.is_up(from)) {
3608 // share some new maps with this guy in case it may not be
3609 // aware of its own deadness...
3610 send_latest(op, beacon->version+1);
3611 }
3612 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
3613 return false;
3614 }
3615
3616 last_osd_report[from] = ceph_clock_now();
3617 osd_epochs[from] = beacon->version;
3618
3619 for (const auto& pg : beacon->pgs) {
3620 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
3621 }
3622 return false;
3623 }
3624
3625 // ---------------
3626 // map helpers
3627
3628 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3629 {
3630 op->mark_osdmon_event(__func__);
3631 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3632 << " start " << start << dendl;
3633 if (start == 0)
3634 send_full(op);
3635 else
3636 send_incremental(op, start);
3637 }
3638
3639
3640 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
3641 {
3642 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
3643 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
3644 r->oldest_map = get_first_committed();
3645 r->newest_map = osdmap.get_epoch();
3646 return r;
3647 }
3648
// Build an MOSDMap containing incremental maps for epochs [from..to].
// If no incremental is stored for an epoch, the full map for that epoch
// is included instead; one of the two must exist or we abort.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      // no incremental stored; fall back to the full map for this epoch
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
	//else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort(); // we should have all maps.
      }
    }
  }
  return m;
}
3682
3683 void OSDMonitor::send_full(MonOpRequestRef op)
3684 {
3685 op->mark_osdmon_event(__func__);
3686 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
3687 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
3688 }
3689
3690 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
3691 {
3692 op->mark_osdmon_event(__func__);
3693
3694 MonSession *s = op->get_session();
3695 ceph_assert(s);
3696
3697 if (s->proxy_con) {
3698 // oh, we can tell the other mon to do it
3699 dout(10) << __func__ << " asking proxying mon to send_incremental from "
3700 << first << dendl;
3701 MRoute *r = new MRoute(s->proxy_tid, NULL);
3702 r->send_osdmap_first = first;
3703 s->proxy_con->send_message(r);
3704 op->mark_event("reply: send routed send_osdmap_first reply");
3705 } else {
3706 // do it ourselves
3707 send_incremental(first, s, false, op);
3708 }
3709 }
3710
// Send incremental maps [first..current] to a session, encoded with the
// peer's features.  If |first| predates our oldest committed map, start
// with a base full map plus the removed-snaps ranges for the gap.  When
// |req| is set we reply through the op tracker and send only a single
// message; |onetime| likewise limits us to one MOSDMap message.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  if (first <= session->osd_epoch) {
    // the session already has maps through osd_epoch; skip those
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range starts before our oldest map: send a base full map
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    // share removed snaps during the gap
    get_removed_snaps_range(first, m->oldest_map, &m->gap_removed_snaps);

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // reply-mode: one message only
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // cap each message at osd_map_message_max epochs
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps. it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
3776
3777 void OSDMonitor::get_removed_snaps_range(
3778 epoch_t start, epoch_t end,
3779 mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps)
3780 {
3781 // we only care about pools that exist now.
3782 for (auto& p : osdmap.get_pools()) {
3783 auto& t = (*gap_removed_snaps)[p.first];
3784 for (epoch_t epoch = start; epoch < end; ++epoch) {
3785 string k = make_snap_epoch_key(p.first, epoch);
3786 bufferlist v;
3787 mon->store->get(OSD_SNAP_PREFIX, k, v);
3788 if (v.length()) {
3789 auto q = v.cbegin();
3790 OSDMap::snap_interval_set_t snaps;
3791 decode(snaps, q);
3792 t.union_of(snaps);
3793 }
3794 }
3795 dout(10) << __func__ << " " << p.first << " " << t << dendl;
3796 }
3797 }
3798
// Fetch incremental map |ver| encoded with the current quorum's features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
3803
// Re-encode an incremental map in place with (a subset of) the given
// feature bits, also re-encoding any embedded full map or crush map.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
3831
3832 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
3833 {
3834 OSDMap m;
3835 auto q = bl.cbegin();
3836 m.decode(q);
3837 // always encode with subset of osdmap's canonical features
3838 uint64_t f = features & m.get_encoding_features();
3839 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
3840 << dendl;
3841 bl.clear();
3842 m.encode(bl, f | CEPH_FEATURE_RESERVED);
3843 }
3844
// Fetch incremental map |ver| encoded for |features|, caching the result
// keyed by the significant feature subset so repeated requests avoid
// re-encoding.  Returns 0 on success or a negative errno from the store.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
3866
3867 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
3868 {
3869 bufferlist inc_bl;
3870 int err = get_version(ver, inc_bl);
3871 ceph_assert(err == 0);
3872 ceph_assert(inc_bl.length());
3873
3874 auto p = inc_bl.cbegin();
3875 inc.decode(p);
3876 dout(10) << __func__ << " "
3877 << " epoch " << inc.epoch
3878 << " inc_crc " << inc.inc_crc
3879 << " full_crc " << inc.full_crc
3880 << " encode_features " << inc.encode_features << dendl;
3881 return 0;
3882 }
3883
// Rebuild the full OSDMap for epoch @ver when the stored full map has been
// pruned: find the closest pinned (or cached) full map at or below @ver,
// then replay the intervening incrementals on top of it.  The rebuilt map
// is encoded into @bl.  Returns 0 on success, -ENOENT if no pinned map at
// or below @ver exists.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    // should not happen; dump the pinned set before the assert fires
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map newer than the pinned one: fewer incrementals
  // to replay below.
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    // fall back to the pinned full map in the store; it must exist.
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  // replay incrementals (closest_pinned, ver]; remember the last
  // incremental's encode_features for the final encode below.
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
3983
// Convenience overload: fetch the full map @ver encoded for the current
// monitor quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon->get_quorum_con_features(), bl);
}
3988
3989 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
3990 bufferlist& bl)
3991 {
3992 uint64_t significant_features = OSDMap::get_significant_features(features);
3993 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
3994 return 0;
3995 }
3996 int ret = PaxosService::get_version_full(ver, bl);
3997 if (ret == -ENOENT) {
3998 // build map?
3999 ret = get_full_from_pinned_map(ver, bl);
4000 }
4001 if (ret < 0) {
4002 return ret;
4003 }
4004 // NOTE: this check is imprecise; the OSDMap encoding features may
4005 // be a subset of the latest mon quorum features, but worst case we
4006 // reencode once and then cache the (identical) result under both
4007 // feature masks.
4008 if (significant_features !=
4009 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4010 reencode_full_map(bl, features);
4011 }
4012 full_osd_cache.add({ver, significant_features}, bl);
4013 return 0;
4014 }
4015
4016 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4017 {
4018 dout(10) << "blacklist " << av << " until " << until << dendl;
4019 for (auto a : av.v) {
4020 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4021 a.set_type(entity_addr_t::TYPE_ANY);
4022 } else {
4023 a.set_type(entity_addr_t::TYPE_LEGACY);
4024 }
4025 pending_inc.new_blacklist[a] = until;
4026 }
4027 return pending_inc.epoch;
4028 }
4029
4030 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4031 {
4032 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4033 a.set_type(entity_addr_t::TYPE_ANY);
4034 } else {
4035 a.set_type(entity_addr_t::TYPE_LEGACY);
4036 }
4037 dout(10) << "blacklist " << a << " until " << until << dendl;
4038 pending_inc.new_blacklist[a] = until;
4039 return pending_inc.epoch;
4040 }
4041
4042
4043 void OSDMonitor::check_osdmap_subs()
4044 {
4045 dout(10) << __func__ << dendl;
4046 if (!osdmap.get_epoch()) {
4047 return;
4048 }
4049 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4050 if (osdmap_subs == mon->session_map.subs.end()) {
4051 return;
4052 }
4053 auto p = osdmap_subs->second->begin();
4054 while (!p.end()) {
4055 auto sub = *p;
4056 ++p;
4057 check_osdmap_sub(sub);
4058 }
4059 }
4060
4061 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4062 {
4063 dout(10) << __func__ << " " << sub << " next " << sub->next
4064 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4065 if (sub->next <= osdmap.get_epoch()) {
4066 if (sub->next >= 1)
4067 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4068 else
4069 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4070 if (sub->onetime)
4071 mon->session_map.remove_sub(sub);
4072 else
4073 sub->next = osdmap.get_epoch() + 1;
4074 }
4075 }
4076
4077 void OSDMonitor::check_pg_creates_subs()
4078 {
4079 if (!osdmap.get_num_up_osds()) {
4080 return;
4081 }
4082 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4083 mon->with_session_map([this](const MonSessionMap& session_map) {
4084 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4085 if (pg_creates_subs == session_map.subs.end()) {
4086 return;
4087 }
4088 for (auto sub : *pg_creates_subs->second) {
4089 check_pg_creates_sub(sub);
4090 }
4091 });
4092 }
4093
4094 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4095 {
4096 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4097 ceph_assert(sub->type == "osd_pg_creates");
4098 // only send these if the OSD is up. we will check_subs() when they do
4099 // come up so they will get the creates then.
4100 if (sub->session->name.is_osd() &&
4101 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4102 sub->next = send_pg_creates(sub->session->name.num(),
4103 sub->session->con.get(),
4104 sub->next);
4105 }
4106 }
4107
4108 void OSDMonitor::do_application_enable(int64_t pool_id,
4109 const std::string &app_name,
4110 const std::string &app_key,
4111 const std::string &app_value)
4112 {
4113 ceph_assert(paxos->is_plugged() && is_writeable());
4114
4115 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4116 << dendl;
4117
4118 ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
4119
4120 auto pp = osdmap.get_pg_pool(pool_id);
4121 ceph_assert(pp != nullptr);
4122
4123 pg_pool_t p = *pp;
4124 if (pending_inc.new_pools.count(pool_id)) {
4125 p = pending_inc.new_pools[pool_id];
4126 }
4127
4128 if (app_key.empty()) {
4129 p.application_metadata.insert({app_name, {}});
4130 } else {
4131 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4132 }
4133 p.last_change = pending_inc.epoch;
4134 pending_inc.new_pools[pool_id] = p;
4135 }
4136
4137 unsigned OSDMonitor::scan_for_creating_pgs(
4138 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4139 const mempool::osdmap::set<int64_t>& removed_pools,
4140 utime_t modified,
4141 creating_pgs_t* creating_pgs) const
4142 {
4143 unsigned queued = 0;
4144 for (auto& p : pools) {
4145 int64_t poolid = p.first;
4146 if (creating_pgs->created_pools.count(poolid)) {
4147 dout(10) << __func__ << " already created " << poolid << dendl;
4148 continue;
4149 }
4150 const pg_pool_t& pool = p.second;
4151 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
4152 pool.get_type(), pool.get_size());
4153 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4154 continue;
4155
4156 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4157 const auto created = pool.get_last_change();
4158 if (last_scan_epoch && created <= last_scan_epoch) {
4159 dout(10) << __func__ << " no change in pool " << poolid
4160 << " " << pool << dendl;
4161 continue;
4162 }
4163 if (removed_pools.count(poolid)) {
4164 dout(10) << __func__ << " pool is being removed: " << poolid
4165 << " " << pool << dendl;
4166 continue;
4167 }
4168 dout(10) << __func__ << " queueing pool create for " << poolid
4169 << " " << pool << dendl;
4170 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4171 created, modified);
4172 queued++;
4173 }
4174 return queued;
4175 }
4176
// Recompute creating_pgs_by_osd_epoch from creating_pgs and the current
// pg-to-osd mapping: for each pg still being created, determine its acting
// primary and the epoch to advertise the create at, keeping the old epoch
// when the target osd has not changed so subscribers don't get re-sent
// creates they already have.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      // e.g. the pool was deleted after the create was queued
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch we already advertised
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4224
// Send pg-create messages for osd.@osd over @con, covering epochs >= @next.
// Uses MOSDPGCreate2 for nautilus+ clusters and the legacy MOSDPGCreate
// otherwise.  Returns the epoch the subscription is now current through
// (last sent epoch + 1), or @next unchanged if nothing was sent.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // legacy message format for clusters not yet requiring nautilus
  bool old = osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS;

  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.first, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.second);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, create->second);
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  // exactly one of m/oldm is populated when there was anything to send
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4289
4290 // TICK
4291
4292
// Periodic maintenance: refresh the osdmap manifest, and (leader only)
// mark unresponsive osds down, auto-out osds that have been down past the
// grace period, expire blacklist entries, prune purged snaps, update pool
// status, and propose a new map epoch if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // everything below mutates pending_inc, which only the leader may do.
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if they stopped sending beacons
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;  // advance now; we may erase `o` below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
			    << int(down.sec()) << " seconds)";
	} else
	  continue;  // still within grace; keep it in down_pending_out
      }

      // osd was marked out (or is no longer down+in): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
4435
// Check every existing, up osd against @last_osd_report and queue a
// mark-down in pending_inc for any that have not reported within
// mon_osd_report_timeout seconds.  Returns true if any osd was newly
// marked down (caller should propose).  @last_osd_report is also pruned
// of osds that no longer exist.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): new_state appears to be XOR-applied when the
	// incremental is consumed, so setting CEPH_OSD_UP here flips the
	// up bit and marks the osd *down* -- confirm against
	// OSDMap::Incremental apply semantics.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
4474
4475 static void dump_cpu_list(Formatter *f, const char *name,
4476 const string& strlist)
4477 {
4478 cpu_set_t cpu_set;
4479 size_t cpu_set_size;
4480 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
4481 return;
4482 }
4483 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
4484 f->open_array_section(name);
4485 for (auto cpu : cpus) {
4486 f->dump_int("cpu", cpu);
4487 }
4488 f->close_section();
4489 }
4490
4491 void OSDMonitor::dump_info(Formatter *f)
4492 {
4493 f->open_object_section("osdmap");
4494 osdmap.dump(f);
4495 f->close_section();
4496
4497 f->open_array_section("osd_metadata");
4498 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4499 if (osdmap.exists(i)) {
4500 f->open_object_section("osd");
4501 f->dump_unsigned("id", i);
4502 dump_osd_metadata(i, f, NULL);
4503 f->close_section();
4504 }
4505 }
4506 f->close_section();
4507
4508 f->dump_unsigned("osdmap_first_committed", get_first_committed());
4509 f->dump_unsigned("osdmap_last_committed", get_last_committed());
4510
4511 f->open_object_section("crushmap");
4512 osdmap.crush->dump(f);
4513 f->close_section();
4514
4515 if (has_osdmap_manifest) {
4516 f->open_object_section("osdmap_manifest");
4517 osdmap_manifest.dump(f);
4518 f->close_section();
4519 }
4520 }
4521
namespace {
  // All properties queryable via "osd pool get".  Enumerator order is
  // load-bearing for the std::set-based choice filtering below; do not
  // reorder.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the elements of @first that are not in @second (set minus).
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> result;
      for (const auto& choice : first) {
	if (second.count(choice) == 0) {
	  // insertion hint: choices arrive in ascending order
	  result.insert(result.end(), choice);
	}
      }
      return result;
    }
}
4555
4556
4557 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4558 {
4559 op->mark_osdmon_event(__func__);
4560 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4561 int r = 0;
4562 bufferlist rdata;
4563 stringstream ss, ds;
4564
4565 cmdmap_t cmdmap;
4566 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4567 string rs = ss.str();
4568 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4569 return true;
4570 }
4571
4572 MonSession *session = op->get_session();
4573 if (!session) {
4574 derr << __func__ << " no session" << dendl;
4575 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4576 return true;
4577 }
4578
4579 string prefix;
4580 cmd_getval(cct, cmdmap, "prefix", prefix);
4581
4582 string format;
4583 cmd_getval(cct, cmdmap, "format", format, string("plain"));
4584 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4585
4586 if (prefix == "osd stat") {
4587 osdmap.print_summary(f.get(), ds, "", true);
4588 if (f)
4589 f->flush(rdata);
4590 else
4591 rdata.append(ds);
4592 }
4593 else if (prefix == "osd dump" ||
4594 prefix == "osd tree" ||
4595 prefix == "osd tree-from" ||
4596 prefix == "osd ls" ||
4597 prefix == "osd getmap" ||
4598 prefix == "osd getcrushmap" ||
4599 prefix == "osd ls-tree") {
4600 string val;
4601
4602 epoch_t epoch = 0;
4603 int64_t epochnum;
4604 cmd_getval(cct, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
4605 epoch = epochnum;
4606
4607 bufferlist osdmap_bl;
4608 int err = get_version_full(epoch, osdmap_bl);
4609 if (err == -ENOENT) {
4610 r = -ENOENT;
4611 ss << "there is no map for epoch " << epoch;
4612 goto reply;
4613 }
4614 ceph_assert(err == 0);
4615 ceph_assert(osdmap_bl.length());
4616
4617 OSDMap *p;
4618 if (epoch == osdmap.get_epoch()) {
4619 p = &osdmap;
4620 } else {
4621 p = new OSDMap;
4622 p->decode(osdmap_bl);
4623 }
4624
4625 auto sg = make_scope_guard([&] {
4626 if (p != &osdmap) {
4627 delete p;
4628 }
4629 });
4630
4631 if (prefix == "osd dump") {
4632 stringstream ds;
4633 if (f) {
4634 f->open_object_section("osdmap");
4635 p->dump(f.get());
4636 f->close_section();
4637 f->flush(ds);
4638 } else {
4639 p->print(ds);
4640 }
4641 rdata.append(ds);
4642 if (!f)
4643 ds << " ";
4644 } else if (prefix == "osd ls") {
4645 if (f) {
4646 f->open_array_section("osds");
4647 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4648 if (osdmap.exists(i)) {
4649 f->dump_int("osd", i);
4650 }
4651 }
4652 f->close_section();
4653 f->flush(ds);
4654 } else {
4655 bool first = true;
4656 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4657 if (osdmap.exists(i)) {
4658 if (!first)
4659 ds << "\n";
4660 first = false;
4661 ds << i;
4662 }
4663 }
4664 }
4665 rdata.append(ds);
4666 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
4667 string bucket;
4668 if (prefix == "osd tree-from") {
4669 cmd_getval(cct, cmdmap, "bucket", bucket);
4670 if (!osdmap.crush->name_exists(bucket)) {
4671 ss << "bucket '" << bucket << "' does not exist";
4672 r = -ENOENT;
4673 goto reply;
4674 }
4675 int id = osdmap.crush->get_item_id(bucket);
4676 if (id >= 0) {
4677 ss << "\"" << bucket << "\" is not a bucket";
4678 r = -EINVAL;
4679 goto reply;
4680 }
4681 }
4682
4683 vector<string> states;
4684 cmd_getval(cct, cmdmap, "states", states);
4685 unsigned filter = 0;
4686 for (auto& s : states) {
4687 if (s == "up") {
4688 filter |= OSDMap::DUMP_UP;
4689 } else if (s == "down") {
4690 filter |= OSDMap::DUMP_DOWN;
4691 } else if (s == "in") {
4692 filter |= OSDMap::DUMP_IN;
4693 } else if (s == "out") {
4694 filter |= OSDMap::DUMP_OUT;
4695 } else if (s == "destroyed") {
4696 filter |= OSDMap::DUMP_DESTROYED;
4697 } else {
4698 ss << "unrecognized state '" << s << "'";
4699 r = -EINVAL;
4700 goto reply;
4701 }
4702 }
4703 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
4704 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4705 ss << "cannot specify both 'in' and 'out'";
4706 r = -EINVAL;
4707 goto reply;
4708 }
4709 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4710 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4711 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4712 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4713 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4714 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4715 ss << "can specify only one of 'up', 'down' and 'destroyed'";
4716 r = -EINVAL;
4717 goto reply;
4718 }
4719 if (f) {
4720 f->open_object_section("tree");
4721 p->print_tree(f.get(), NULL, filter, bucket);
4722 f->close_section();
4723 f->flush(ds);
4724 } else {
4725 p->print_tree(NULL, &ds, filter, bucket);
4726 }
4727 rdata.append(ds);
4728 } else if (prefix == "osd getmap") {
4729 rdata.append(osdmap_bl);
4730 ss << "got osdmap epoch " << p->get_epoch();
4731 } else if (prefix == "osd getcrushmap") {
4732 p->crush->encode(rdata, mon->get_quorum_con_features());
4733 ss << p->get_crush_version();
4734 } else if (prefix == "osd ls-tree") {
4735 string bucket_name;
4736 cmd_getval(cct, cmdmap, "name", bucket_name);
4737 set<int> osds;
4738 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4739 if (r == -ENOENT) {
4740 ss << "\"" << bucket_name << "\" does not exist";
4741 goto reply;
4742 } else if (r < 0) {
4743 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4744 goto reply;
4745 }
4746
4747 if (f) {
4748 f->open_array_section("osds");
4749 for (auto &i : osds) {
4750 if (osdmap.exists(i)) {
4751 f->dump_int("osd", i);
4752 }
4753 }
4754 f->close_section();
4755 f->flush(ds);
4756 } else {
4757 bool first = true;
4758 for (auto &i : osds) {
4759 if (osdmap.exists(i)) {
4760 if (!first)
4761 ds << "\n";
4762 first = false;
4763 ds << i;
4764 }
4765 }
4766 }
4767
4768 rdata.append(ds);
4769 }
4770 } else if (prefix == "osd getmaxosd") {
4771 if (f) {
4772 f->open_object_section("getmaxosd");
4773 f->dump_unsigned("epoch", osdmap.get_epoch());
4774 f->dump_int("max_osd", osdmap.get_max_osd());
4775 f->close_section();
4776 f->flush(rdata);
4777 } else {
4778 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4779 rdata.append(ds);
4780 }
4781 } else if (prefix == "osd utilization") {
4782 string out;
4783 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4784 if (f)
4785 f->flush(rdata);
4786 else
4787 rdata.append(out);
4788 r = 0;
4789 goto reply;
4790 } else if (prefix == "osd find") {
4791 int64_t osd;
4792 if (!cmd_getval(cct, cmdmap, "id", osd)) {
4793 ss << "unable to parse osd id value '"
4794 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4795 r = -EINVAL;
4796 goto reply;
4797 }
4798 if (!osdmap.exists(osd)) {
4799 ss << "osd." << osd << " does not exist";
4800 r = -ENOENT;
4801 goto reply;
4802 }
4803 string format;
4804 cmd_getval(cct, cmdmap, "format", format);
4805 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4806 f->open_object_section("osd_location");
4807 f->dump_int("osd", osd);
4808 f->dump_object("addrs", osdmap.get_addrs(osd));
4809 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
4810
4811 // try to identify host, pod/container name, etc.
4812 map<string,string> m;
4813 load_metadata(osd, m, nullptr);
4814 if (auto p = m.find("hostname"); p != m.end()) {
4815 f->dump_string("host", p->second);
4816 }
4817 for (auto& k : {
4818 "pod_name", "pod_namespace", // set by rook
4819 "container_name" // set by ceph-ansible
4820 }) {
4821 if (auto p = m.find(k); p != m.end()) {
4822 f->dump_string(k, p->second);
4823 }
4824 }
4825
4826 // crush is helpful too
4827 f->open_object_section("crush_location");
4828 map<string,string> loc = osdmap.crush->get_full_location(osd);
4829 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4830 f->dump_string(p->first.c_str(), p->second);
4831 f->close_section();
4832 f->close_section();
4833 f->flush(rdata);
  } else if (prefix == "osd metadata") {
    // Dump daemon-reported metadata for one osd (if "id" was given) or for
    // every existing osd.  Always formatted (defaults to json-pretty).
    int64_t osd = -1;
    if (cmd_vartype_stringify(cmdmap["id"]).size() &&
	!cmd_getval(cct, cmdmap, "id", osd)) {
      // "id" was present but did not parse as an integer.
      ss << "unable to parse osd id value '"
	 << cmd_vartype_stringify(cmdmap["id"]) << "'";
      r = -EINVAL;
      goto reply;
    }
    if (osd >= 0 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      r = -ENOENT;
      goto reply;
    }
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    // Note: this shadows the function-level formatter with a fresh one that
    // always exists (json-pretty fallback), since this command is always
    // machine-readable output.
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    if (osd >= 0) {
      // Single-osd form: errors from dump_osd_metadata are fatal here.
      f->open_object_section("osd_metadata");
      f->dump_unsigned("id", osd);
      r = dump_osd_metadata(osd, f.get(), &ss);
      if (r < 0)
	goto reply;
      f->close_section();
    } else {
      // All-osds form: tolerate missing/invalid metadata for individual
      // daemons so one bad osd does not hide everyone else's data.
      r = 0;
      f->open_array_section("osd_metadata");
      for (int i=0; i<osdmap.get_max_osd(); ++i) {
	if (osdmap.exists(i)) {
	  f->open_object_section("osd");
	  f->dump_unsigned("id", i);
	  r = dump_osd_metadata(i, f.get(), NULL);
	  if (r == -EINVAL || r == -ENOENT) {
	    // Drop error, continue to get other daemons' metadata
	    dout(4) << "No metadata for osd." << i << dendl;
	    r = 0;
	  } else if (r < 0) {
	    // Unexpected error
	    goto reply;
	  }
	  f->close_section();
	}
      }
      f->close_section();
    }
    f->flush(rdata);
  } else if (prefix == "osd versions") {
    // Histogram of ceph_version strings across all osds.
    if (!f)
      f.reset(Formatter::create("json-pretty"));
    count_metadata("ceph_version", f.get());
    f->flush(rdata);
    r = 0;
  } else if (prefix == "osd count-metadata") {
    // Histogram of an arbitrary metadata property across all osds.
    if (!f)
      f.reset(Formatter::create("json-pretty"));
    string field;
    cmd_getval(cct, cmdmap, "property", field);
    count_metadata(field, f.get());
    f->flush(rdata);
    r = 0;
  } else if (prefix == "osd numa-status") {
    // Report NUMA affinity info (network/objectstore/overall node, cpu list)
    // for each osd, either as formatted output or as a plain-text table.
    TextTable tbl;
    if (f) {
      f->open_array_section("osds");
    } else {
      tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
      tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
      tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
      tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
      tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
    }
    for (int i=0; i<osdmap.get_max_osd(); ++i) {
      if (osdmap.exists(i)) {
	map<string,string> m;
	ostringstream err;
	if (load_metadata(i, m, &err) < 0) {
	  // No stored metadata for this osd; skip it silently.
	  continue;
	}
	string host;
	auto p = m.find("hostname");
	if (p != m.end()) {
	  host = p->second;
	}
	if (f) {
	  f->open_object_section("osd");
	  f->dump_int("osd", i);
	  f->dump_string("host", host);
	  // Single-node values are dumped as ints when present.
	  for (auto n : { "network_numa_node", "objectstore_numa_node",
		"numa_node" }) {
	    p = m.find(n);
	    if (p != m.end()) {
	      f->dump_int(n, atoi(p->second.c_str()));
	    }
	  }
	  // Multi-node values are comma-separated lists in the metadata.
	  for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
	    p = m.find(n);
	    if (p != m.end()) {
	      list<string> ls = get_str_list(p->second, ",");
	      f->open_array_section(n);
	      for (auto node : ls) {
		f->dump_int("node", atoi(node.c_str()));
	      }
	      f->close_section();
	    }
	  }
	  for (auto n : { "numa_node_cpus" }) {
	    p = m.find(n);
	    if (p != m.end()) {
	      dump_cpu_list(f.get(), n, p->second);
	    }
	  }
	  f->close_section();
	} else {
	  // Plain-text table: missing fields render as "-".
	  tbl << i;
	  tbl << host;
	  p = m.find("network_numa_nodes");
	  if (p != m.end()) {
	    tbl << p->second;
	  } else {
	    tbl << "-";
	  }
	  p = m.find("objectstore_numa_nodes");
	  if (p != m.end()) {
	    tbl << p->second;
	  } else {
	    tbl << "-";
	  }
	  // AFFINITY and CPUS are only shown when both are known.
	  p = m.find("numa_node");
	  auto q = m.find("numa_node_cpus");
	  if (p != m.end() && q != m.end()) {
	    tbl << p->second;
	    tbl << q->second;
	  } else {
	    tbl << "-";
	    tbl << "-";
	  }
	  tbl << TextTable::endrow;
	}
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    } else {
      rdata.append(stringify(tbl));
    }
  } else if (prefix == "osd map") {
    // Map a (pool, object[, namespace]) triple to its PG and the up/acting
    // osd sets (with primaries) in the current osdmap.
    string poolstr, objstr, namespacestr;
    cmd_getval(cct, cmdmap, "pool", poolstr);
    cmd_getval(cct, cmdmap, "object", objstr);
    cmd_getval(cct, cmdmap, "nspace", namespacestr);

    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "pool " << poolstr << " does not exist";
      r = -ENOENT;
      goto reply;
    }
    object_locator_t oloc(pool, namespacestr);
    object_t oid(objstr);
    // raw pg (hash-derived) vs the actual pg after pg_num masking.
    pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
    pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
    vector<int> up, acting;
    int up_p, acting_p;
    osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);

    // Display name includes the namespace prefix when one was given.
    string fullobjname;
    if (!namespacestr.empty())
      fullobjname = namespacestr + string("/") + oid.name;
    else
      fullobjname = oid.name;
    if (f) {
      f->open_object_section("osd_map");
      f->dump_unsigned("epoch", osdmap.get_epoch());
      f->dump_string("pool", poolstr);
      f->dump_int("pool_id", pool);
      f->dump_stream("objname") << fullobjname;
      f->dump_stream("raw_pgid") << pgid;
      f->dump_stream("pgid") << mpgid;
      f->open_array_section("up");
      for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
	f->dump_int("osd", *p);
      f->close_section();
      f->dump_int("up_primary", up_p);
      f->open_array_section("acting");
      for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
	f->dump_int("osd", *p);
      f->close_section();
      f->dump_int("acting_primary", acting_p);
      f->close_section(); // osd_map
      f->flush(rdata);
    } else {
      ds << "osdmap e" << osdmap.get_epoch()
	 << " pool '" << poolstr << "' (" << pool << ")"
	 << " object '" << fullobjname << "' ->"
	 << " pg " << pgid << " (" << mpgid << ")"
	 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
	 << pg_vector_string(acting) << ", p" << acting_p << ")";
      rdata.append(ds);
    }

  } else if (prefix == "pg map") {
    // Map an explicit pgid to its up/acting osd sets.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cct, cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      r = -EINVAL;
      goto reply;
    }
    vector<int> up, acting;
    if (!osdmap.have_pg_pool(pgid.pool())) {
      ss << "pg '" << pgidstr << "' does not exist";
      r = -ENOENT;
      goto reply;
    }
    pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
    osdmap.pg_to_up_acting_osds(pgid, up, acting);
    if (f) {
      f->open_object_section("pg_map");
      f->dump_unsigned("epoch", osdmap.get_epoch());
      f->dump_stream("raw_pgid") << pgid;
      f->dump_stream("pgid") << mpgid;
      f->open_array_section("up");
      for (auto osd : up) {
	f->dump_int("up_osd", osd);
      }
      f->close_section();
      f->open_array_section("acting");
      for (auto osd : acting) {
	f->dump_int("acting_osd", osd);
      }
      f->close_section();
      f->close_section();
      f->flush(rdata);
    } else {
      ds << "osdmap e" << osdmap.get_epoch()
	 << " pg " << pgid << " (" << mpgid << ")"
	 << " -> up " << up << " acting " << acting;
      rdata.append(ds);
    }
    goto reply;

  } else if (prefix == "osd lspools") {
    // List pool ids and names; plain output separates entries with newlines
    // (no trailing newline after the last one).
    if (f)
      f->open_array_section("pools");
    for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
	 p != osdmap.pools.end();
	 ++p) {
      if (f) {
	f->open_object_section("pool");
	f->dump_int("poolnum", p->first);
	f->dump_string("poolname", osdmap.pool_name[p->first]);
	f->close_section();
      } else {
	ds << p->first << ' ' << osdmap.pool_name[p->first];
	if (next(p) != osdmap.pools.end()) {
	  ds << '\n';
	}
      }
    }
    if (f) {
      f->close_section();
      f->flush(ds);
    }
    rdata.append(ds);
  } else if (prefix == "osd blacklist ls") {
    // List all blacklisted client addresses and their expiry times.
    if (f)
      f->open_array_section("blacklist");

    for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
	 p != osdmap.blacklist.end();
	 ++p) {
      if (f) {
	f->open_object_section("entry");
	f->dump_string("addr", p->first.get_legacy_str());
	f->dump_stream("until") << p->second;
	f->close_section();
      } else {
	// NB: this local ss intentionally shadows the outer status stream;
	// it is only used to render one "addr until" line.
	stringstream ss;
	string s;
	ss << p->first << " " << p->second;
	getline(ss, s);
	s += "\n";
	rdata.append(s);
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    }
    // Summary count goes to the (outer) status stream, not rdata.
    ss << "listed " << osdmap.blacklist.size() << " entries";

  } else if (prefix == "osd pool ls") {
    // List pool names; with "detail", include full pool settings.
    string detail;
    cmd_getval(cct, cmdmap, "detail", detail);
    if (!f && detail == "detail") {
      // Plain-text detailed listing delegates to OSDMap::print_pools.
      ostringstream ss;
      osdmap.print_pools(ss);
      rdata.append(ss.str());
    } else {
      if (f)
	f->open_array_section("pools");
      for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
	   it != osdmap.get_pools().end();
	   ++it) {
	if (f) {
	  if (detail == "detail") {
	    f->open_object_section("pool");
	    f->dump_string("pool_name", osdmap.get_pool_name(it->first));
	    it->second.dump(f.get());
	    f->close_section();
	  } else {
	    f->dump_string("pool_name", osdmap.get_pool_name(it->first));
	  }
	} else {
	  rdata.append(osdmap.get_pool_name(it->first) + "\n");
	}
      }
      if (f) {
	f->close_section();
	f->flush(rdata);
      }
    }

  } else if (prefix == "osd crush get-tunable") {
    // Read a single crush tunable; only straw_calc_version is exposed here.
    string tunable;
    cmd_getval(cct, cmdmap, "tunable", tunable);
    ostringstream rss;
    if (f)
      f->open_object_section("tunable");
    if (tunable == "straw_calc_version") {
      if (f)
	f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
      else
	rss << osdmap.crush->get_straw_calc_version() << "\n";
    } else {
      // Unknown tunable name.  Note: if f was set, the "tunable" section
      // opened above is abandoned without being closed before goto reply.
      r = -EINVAL;
      goto reply;
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    } else {
      rdata.append(rss.str());
    }
    r = 0;

  } else if (prefix == "osd pool get") {
    // Report one pool property ("var") or every applicable property
    // (var == "all") for the named pool.  Tier-only and erasure-only
    // properties are filtered out / rejected based on the pool's type.
    string poolstr;
    cmd_getval(cct, cmdmap, "pool", poolstr);
    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      r = -ENOENT;
      goto reply;
    }

    const pg_pool_t *p = osdmap.get_pg_pool(pool);
    string var;
    cmd_getval(cct, cmdmap, "var", var);

    // Mapping from user-facing property name to internal choice enum.
    typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
    const choices_map_t ALL_CHOICES = {
      {"size", SIZE},
      {"min_size", MIN_SIZE},
      {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
      {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
      {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
      {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
      {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
      {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
      {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
      {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
      {"use_gmt_hitset", USE_GMT_HITSET},
      {"target_max_objects", TARGET_MAX_OBJECTS},
      {"target_max_bytes", TARGET_MAX_BYTES},
      {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
      {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
      {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
      {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
      {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
      {"erasure_code_profile", ERASURE_CODE_PROFILE},
      {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
      {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
      {"fast_read", FAST_READ},
      {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
      {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
      {"scrub_min_interval", SCRUB_MIN_INTERVAL},
      {"scrub_max_interval", SCRUB_MAX_INTERVAL},
      {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
      {"recovery_priority", RECOVERY_PRIORITY},
      {"recovery_op_priority", RECOVERY_OP_PRIORITY},
      {"scrub_priority", SCRUB_PRIORITY},
      {"compression_mode", COMPRESSION_MODE},
      {"compression_algorithm", COMPRESSION_ALGORITHM},
      {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
      {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
      {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
      {"csum_type", CSUM_TYPE},
      {"csum_max_block", CSUM_MAX_BLOCK},
      {"csum_min_block", CSUM_MIN_BLOCK},
      {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
      {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
      {"pg_num_min", PG_NUM_MIN},
      {"target_size_bytes", TARGET_SIZE_BYTES},
      {"target_size_ratio", TARGET_SIZE_RATIO},
      {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
    };

    typedef std::set<osd_pool_get_choices> choices_set_t;

    // Properties that only make sense on cache-tier pools.
    const choices_set_t ONLY_TIER_CHOICES = {
      HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
      TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
      CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
      CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
      MIN_READ_RECENCY_FOR_PROMOTE,
      MIN_WRITE_RECENCY_FOR_PROMOTE,
      HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
    };
    // Properties that only make sense on erasure-coded pools.
    const choices_set_t ONLY_ERASURE_CHOICES = {
      EC_OVERWRITES, ERASURE_CODE_PROFILE
    };

    choices_set_t selected_choices;
    if (var == "all") {
      for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
	  it != ALL_CHOICES.end(); ++it) {
	selected_choices.insert(it->second);
      }

      if(!p->is_tier()) {
	selected_choices = subtract_second_from_first(selected_choices,
						      ONLY_TIER_CHOICES);
      }

      if(!p->is_erasure()) {
	selected_choices = subtract_second_from_first(selected_choices,
						      ONLY_ERASURE_CHOICES);
      }
    } else /* var != "all" */ {
      choices_map_t::const_iterator found = ALL_CHOICES.find(var);
      // NOTE(review): 'found' is dereferenced without an end() check.
      // Presumably the command schema restricts "var" to the known names so
      // find() cannot fail here — confirm against the MonCommands table.
      osd_pool_get_choices selected = found->second;

      if (!p->is_tier() &&
	  ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
	ss << "pool '" << poolstr
	   << "' is not a tier pool: variable not applicable";
	r = -EACCES;
	goto reply;
      }

      if (!p->is_erasure() &&
	  ONLY_ERASURE_CHOICES.find(selected)
	  != ONLY_ERASURE_CHOICES.end()) {
	ss << "pool '" << poolstr
	   << "' is not a erasure pool: variable not applicable";
	r = -EACCES;
	goto reply;
      }

      // Explicitly-requested pool_opts that were never set are an error
      // (in the "all" case they are simply skipped by opts.is_set below).
      if (pool_opts_t::is_opt_name(var) &&
	  !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
	ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
	r = -ENOENT;
	goto reply;
      }

      selected_choices.insert(selected);
    }

    if (f) {
      // Formatted output: one section with a key per selected property.
      f->open_object_section("pool");
      f->dump_string("pool", poolstr);
      f->dump_int("pool_id", pool);
      for(choices_set_t::const_iterator it = selected_choices.begin();
	  it != selected_choices.end(); ++it) {
	// Reverse-lookup the user-facing name for this choice.
	choices_map_t::const_iterator i;
	for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
	  if (i->second == *it) {
	    break;
	  }
	}
	ceph_assert(i != ALL_CHOICES.end());
	switch(*it) {
	  case PG_NUM:
	    f->dump_int("pg_num", p->get_pg_num());
	    break;
	  case PGP_NUM:
	    f->dump_int("pgp_num", p->get_pgp_num());
	    break;
	  case SIZE:
	    f->dump_int("size", p->get_size());
	    break;
	  case MIN_SIZE:
	    f->dump_int("min_size", p->get_min_size());
	    break;
	  case CRUSH_RULE:
	    // Prefer the rule's name; fall back to the numeric id if the
	    // rule no longer exists in the crush map.
	    if (osdmap.crush->rule_exists(p->get_crush_rule())) {
	      f->dump_string("crush_rule", osdmap.crush->get_rule_name(
			       p->get_crush_rule()));
	    } else {
	      f->dump_string("crush_rule", stringify(p->get_crush_rule()));
	    }
	    break;
	  case EC_OVERWRITES:
	    f->dump_bool("allow_ec_overwrites",
			 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
	    break;
	  case PG_AUTOSCALE_MODE:
	    f->dump_string("pg_autoscale_mode",
			   pg_pool_t::get_pg_autoscale_mode_name(
			     p->pg_autoscale_mode));
	    break;
	  case HASHPSPOOL:
	  case NODELETE:
	  case NOPGCHANGE:
	  case NOSIZECHANGE:
	  case WRITE_FADVISE_DONTNEED:
	  case NOSCRUB:
	  case NODEEP_SCRUB:
	    // All simple boolean pool flags share one lookup by name.
	    f->dump_bool(i->first.c_str(),
			 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
	    break;
	  case HIT_SET_PERIOD:
	    f->dump_int("hit_set_period", p->hit_set_period);
	    break;
	  case HIT_SET_COUNT:
	    f->dump_int("hit_set_count", p->hit_set_count);
	    break;
	  case HIT_SET_TYPE:
	    f->dump_string("hit_set_type",
			   HitSet::get_type_name(p->hit_set_params.get_type()));
	    break;
	  case HIT_SET_FPP:
	    {
	      // Only meaningful for bloom hit sets; for an explicit request
	      // on another type this is an error, for "all" it is skipped.
	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
		BloomHitSet::Params *bloomp =
		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
		f->dump_float("hit_set_fpp", bloomp->get_fpp());
	      } else if(var != "all") {
		f->close_section();
		ss << "hit set is not of type Bloom; " <<
		  "invalid to get a false positive rate!";
		r = -EINVAL;
		goto reply;
	      }
	    }
	    break;
	  case USE_GMT_HITSET:
	    f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
	    break;
	  case TARGET_MAX_OBJECTS:
	    f->dump_unsigned("target_max_objects", p->target_max_objects);
	    break;
	  case TARGET_MAX_BYTES:
	    f->dump_unsigned("target_max_bytes", p->target_max_bytes);
	    break;
	  case CACHE_TARGET_DIRTY_RATIO:
	    // Ratios are stored as micro-units; dump both raw and float form.
	    f->dump_unsigned("cache_target_dirty_ratio_micro",
			     p->cache_target_dirty_ratio_micro);
	    f->dump_float("cache_target_dirty_ratio",
			  ((float)p->cache_target_dirty_ratio_micro/1000000));
	    break;
	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
	    f->dump_unsigned("cache_target_dirty_high_ratio_micro",
			     p->cache_target_dirty_high_ratio_micro);
	    f->dump_float("cache_target_dirty_high_ratio",
			  ((float)p->cache_target_dirty_high_ratio_micro/1000000));
	    break;
	  case CACHE_TARGET_FULL_RATIO:
	    f->dump_unsigned("cache_target_full_ratio_micro",
			     p->cache_target_full_ratio_micro);
	    f->dump_float("cache_target_full_ratio",
			  ((float)p->cache_target_full_ratio_micro/1000000));
	    break;
	  case CACHE_MIN_FLUSH_AGE:
	    f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
	    break;
	  case CACHE_MIN_EVICT_AGE:
	    f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
	    break;
	  case ERASURE_CODE_PROFILE:
	    f->dump_string("erasure_code_profile", p->erasure_code_profile);
	    break;
	  case MIN_READ_RECENCY_FOR_PROMOTE:
	    f->dump_int("min_read_recency_for_promote",
			p->min_read_recency_for_promote);
	    break;
	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
	    f->dump_int("min_write_recency_for_promote",
			p->min_write_recency_for_promote);
	    break;
	  case FAST_READ:
	    f->dump_int("fast_read", p->fast_read);
	    break;
	  case HIT_SET_GRADE_DECAY_RATE:
	    f->dump_int("hit_set_grade_decay_rate",
			p->hit_set_grade_decay_rate);
	    break;
	  case HIT_SET_SEARCH_LAST_N:
	    f->dump_int("hit_set_search_last_n",
			p->hit_set_search_last_n);
	    break;
	  case SCRUB_MIN_INTERVAL:
	  case SCRUB_MAX_INTERVAL:
	  case DEEP_SCRUB_INTERVAL:
	  case RECOVERY_PRIORITY:
	  case RECOVERY_OP_PRIORITY:
	  case SCRUB_PRIORITY:
	  case COMPRESSION_MODE:
	  case COMPRESSION_ALGORITHM:
	  case COMPRESSION_REQUIRED_RATIO:
	  case COMPRESSION_MAX_BLOB_SIZE:
	  case COMPRESSION_MIN_BLOB_SIZE:
	  case CSUM_TYPE:
	  case CSUM_MAX_BLOCK:
	  case CSUM_MIN_BLOCK:
	  case FINGERPRINT_ALGORITHM:
	  case PG_NUM_MIN:
	  case TARGET_SIZE_BYTES:
	  case TARGET_SIZE_RATIO:
	  case PG_AUTOSCALE_BIAS:
	    // Everything stored in pool_opts_t; CSUM_TYPE is special-cased so
	    // the numeric value is rendered as its symbolic checksum name.
	    pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
	    if (p->opts.is_set(key)) {
	      if(*it == CSUM_TYPE) {
		int64_t val;
		p->opts.get(pool_opts_t::CSUM_TYPE, &val);
		f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
	      } else {
		p->opts.dump(i->first, f.get());
	      }
	    }
	    break;
	}
      }
      f->close_section();
      f->flush(rdata);
    } else /* !f */ {
      // Plain-text output: one "name: value" line per selected property,
      // appended to rdata after each iteration (ss is reused as scratch).
      for(choices_set_t::const_iterator it = selected_choices.begin();
	  it != selected_choices.end(); ++it) {
	choices_map_t::const_iterator i;
	switch(*it) {
	  case PG_NUM:
	    ss << "pg_num: " << p->get_pg_num() << "\n";
	    break;
	  case PGP_NUM:
	    ss << "pgp_num: " << p->get_pgp_num() << "\n";
	    break;
	  case SIZE:
	    ss << "size: " << p->get_size() << "\n";
	    break;
	  case MIN_SIZE:
	    ss << "min_size: " << p->get_min_size() << "\n";
	    break;
	  case CRUSH_RULE:
	    if (osdmap.crush->rule_exists(p->get_crush_rule())) {
	      ss << "crush_rule: " << osdmap.crush->get_rule_name(
		p->get_crush_rule()) << "\n";
	    } else {
	      ss << "crush_rule: " << p->get_crush_rule() << "\n";
	    }
	    break;
	  case PG_AUTOSCALE_MODE:
	    ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
	      p->pg_autoscale_mode) <<"\n";
	    break;
	  case HIT_SET_PERIOD:
	    ss << "hit_set_period: " << p->hit_set_period << "\n";
	    break;
	  case HIT_SET_COUNT:
	    ss << "hit_set_count: " << p->hit_set_count << "\n";
	    break;
	  case HIT_SET_TYPE:
	    ss << "hit_set_type: " <<
	      HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
	    break;
	  case HIT_SET_FPP:
	    {
	      // Same bloom-only restriction as in the formatted path.
	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
		BloomHitSet::Params *bloomp =
		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
		ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
	      } else if(var != "all") {
		ss << "hit set is not of type Bloom; " <<
		  "invalid to get a false positive rate!";
		r = -EINVAL;
		goto reply;
	      }
	    }
	    break;
	  case USE_GMT_HITSET:
	    ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
	    break;
	  case TARGET_MAX_OBJECTS:
	    ss << "target_max_objects: " << p->target_max_objects << "\n";
	    break;
	  case TARGET_MAX_BYTES:
	    ss << "target_max_bytes: " << p->target_max_bytes << "\n";
	    break;
	  case CACHE_TARGET_DIRTY_RATIO:
	    ss << "cache_target_dirty_ratio: "
	       << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
	    break;
	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
	    ss << "cache_target_dirty_high_ratio: "
	       << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
	    break;
	  case CACHE_TARGET_FULL_RATIO:
	    ss << "cache_target_full_ratio: "
	       << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
	    break;
	  case CACHE_MIN_FLUSH_AGE:
	    ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
	    break;
	  case CACHE_MIN_EVICT_AGE:
	    ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
	    break;
	  case ERASURE_CODE_PROFILE:
	    ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
	    break;
	  case MIN_READ_RECENCY_FOR_PROMOTE:
	    ss << "min_read_recency_for_promote: " <<
	      p->min_read_recency_for_promote << "\n";
	    break;
	  case HIT_SET_GRADE_DECAY_RATE:
	    ss << "hit_set_grade_decay_rate: " <<
	      p->hit_set_grade_decay_rate << "\n";
	    break;
	  case HIT_SET_SEARCH_LAST_N:
	    ss << "hit_set_search_last_n: " <<
	      p->hit_set_search_last_n << "\n";
	    break;
	  case EC_OVERWRITES:
	    ss << "allow_ec_overwrites: " <<
	      (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
	      "\n";
	    break;
	  case HASHPSPOOL:
	  case NODELETE:
	  case NOPGCHANGE:
	  case NOSIZECHANGE:
	  case WRITE_FADVISE_DONTNEED:
	  case NOSCRUB:
	  case NODEEP_SCRUB:
	    // Reverse-lookup the flag's user-facing name for the label.
	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
	      if (i->second == *it)
		break;
	    }
	    ceph_assert(i != ALL_CHOICES.end());
	    ss << i->first << ": " <<
	      (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
	       "true" : "false") << "\n";
	    break;
	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
	    ss << "min_write_recency_for_promote: " <<
	      p->min_write_recency_for_promote << "\n";
	    break;
	  case FAST_READ:
	    ss << "fast_read: " << p->fast_read << "\n";
	    break;
	  case SCRUB_MIN_INTERVAL:
	  case SCRUB_MAX_INTERVAL:
	  case DEEP_SCRUB_INTERVAL:
	  case RECOVERY_PRIORITY:
	  case RECOVERY_OP_PRIORITY:
	  case SCRUB_PRIORITY:
	  case COMPRESSION_MODE:
	  case COMPRESSION_ALGORITHM:
	  case COMPRESSION_REQUIRED_RATIO:
	  case COMPRESSION_MAX_BLOB_SIZE:
	  case COMPRESSION_MIN_BLOB_SIZE:
	  case CSUM_TYPE:
	  case CSUM_MAX_BLOCK:
	  case CSUM_MIN_BLOCK:
	  case FINGERPRINT_ALGORITHM:
	  case PG_NUM_MIN:
	  case TARGET_SIZE_BYTES:
	  case TARGET_SIZE_RATIO:
	  case PG_AUTOSCALE_BIAS:
	    // pool_opts_t-backed values, mirroring the formatted path.
	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
	      if (i->second == *it)
		break;
	    }
	    ceph_assert(i != ALL_CHOICES.end());
	    {
	      pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
	      if (p->opts.is_set(key)) {
		if(key == pool_opts_t::CSUM_TYPE) {
		  int64_t val;
		  p->opts.get(key, &val);
		  ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
		} else {
		  ss << i->first << ": " << p->opts.get(key) << "\n";
		}
	      }
	    }
	    break;
	}
	rdata.append(ss.str());
	ss.str("");
      }
    }
    r = 0;
  } else if (prefix == "osd pool get-quota") {
    // Report the object and byte quotas for a pool; 0 means unlimited
    // ("N/A" in plain-text output).
    string pool_name;
    cmd_getval(cct, cmdmap, "pool", pool_name);

    int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
    if (poolid < 0) {
      ceph_assert(poolid == -ENOENT);
      ss << "unrecognized pool '" << pool_name << "'";
      r = -ENOENT;
      goto reply;
    }
    const pg_pool_t *p = osdmap.get_pg_pool(poolid);

    if (f) {
      f->open_object_section("pool_quotas");
      f->dump_string("pool_name", pool_name);
      f->dump_unsigned("pool_id", poolid);
      f->dump_unsigned("quota_max_objects", p->quota_max_objects);
      f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
      f->close_section();
      f->flush(rdata);
    } else {
      stringstream rs;
      rs << "quotas for pool '" << pool_name << "':\n"
	 << "  max objects: ";
      if (p->quota_max_objects == 0)
	rs << "N/A";
      else
	rs << si_u_t(p->quota_max_objects) << " objects";
      rs << "\n"
	 << "  max bytes  : ";
      if (p->quota_max_bytes == 0)
	rs << "N/A";
      else
	rs << byte_u_t(p->quota_max_bytes);
      rdata.append(rs.str());
    }
    rdata.append("\n");
    r = 0;
  } else if (prefix == "osd crush rule list" ||
	     prefix == "osd crush rule ls") {
    // List the names of all crush rules.
    if (f) {
      f->open_array_section("rules");
      osdmap.crush->list_rules(f.get());
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream ss;
      osdmap.crush->list_rules(&ss);
      rdata.append(ss.str());
    }
  } else if (prefix == "osd crush rule ls-by-class") {
    // List crush rules that reference the given device class.
    string class_name;
    cmd_getval(cct, cmdmap, "class", class_name);
    if (class_name.empty()) {
      ss << "no class specified";
      r = -EINVAL;
      goto reply;
    }
    set<int> rules;
    r = osdmap.crush->get_rules_by_class(class_name, &rules);
    if (r < 0) {
      ss << "failed to get rules by class '" << class_name << "'";
      goto reply;
    }
    if (f) {
      f->open_array_section("rules");
      for (auto &rule: rules) {
	f->dump_string("name", osdmap.crush->get_rule_name(rule));
      }
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream rs;
      for (auto &rule: rules) {
	rs << osdmap.crush->get_rule_name(rule) << "\n";
      }
      rdata.append(rs.str());
    }
  } else if (prefix == "osd crush rule dump") {
    // Dump one named rule, or all rules when no name is given.
    // Always formatted output (json-pretty fallback).
    string name;
    cmd_getval(cct, cmdmap, "name", name);
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    if (name == "") {
      f->open_array_section("rules");
      osdmap.crush->dump_rules(f.get());
      f->close_section();
    } else {
      int ruleno = osdmap.crush->get_rule_id(name);
      if (ruleno < 0) {
	ss << "unknown crush rule '" << name << "'";
	r = ruleno;
	goto reply;
      }
      osdmap.crush->dump_rule(ruleno, f.get());
    }
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  } else if (prefix == "osd crush dump") {
    // Dump the entire crush map (always formatted, json-pretty fallback).
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    f->open_object_section("crush_map");
    osdmap.crush->dump(f.get());
    f->close_section();
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  } else if (prefix == "osd crush show-tunables") {
    // Dump the current crush tunables (always formatted).
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    f->open_object_section("crush_map_tunables");
    osdmap.crush->dump_tunables(f.get());
    f->close_section();
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  } else if (prefix == "osd crush tree") {
    // Render the crush hierarchy; --show-shadow also includes the
    // per-device-class shadow trees.
    string shadow;
    cmd_getval(cct, cmdmap, "shadow", shadow);
    bool show_shadow = shadow == "--show-shadow";
    boost::scoped_ptr<Formatter> f(Formatter::create(format));
    if (f) {
      f->open_object_section("crush_tree");
      osdmap.crush->dump_tree(nullptr,
			      f.get(),
			      osdmap.get_pool_names(),
			      show_shadow);
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream ss;
      osdmap.crush->dump_tree(&ss,
			      nullptr,
			      osdmap.get_pool_names(),
			      show_shadow);
      rdata.append(ss.str());
    }
  } else if (prefix == "osd crush ls") {
    // List the direct children of a crush node; for a device (id >= 0)
    // the result is the device itself.
    string name;
    if (!cmd_getval(cct, cmdmap, "node", name)) {
      ss << "no node specified";
      r = -EINVAL;
      goto reply;
    }
    if (!osdmap.crush->name_exists(name)) {
      ss << "node '" << name << "' does not exist";
      r = -ENOENT;
      goto reply;
    }
    int id = osdmap.crush->get_item_id(name);
    list<int> result;
    if (id >= 0) {
      result.push_back(id);
    } else {
      // Negative ids are buckets; enumerate their members.
      int num = osdmap.crush->get_bucket_size(id);
      for (int i = 0; i < num; ++i) {
	result.push_back(osdmap.crush->get_bucket_item(id, i));
      }
    }
    if (f) {
      f->open_array_section("items");
      for (auto i : result) {
	f->dump_string("item", osdmap.crush->get_item_name(i));
      }
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream ss;
      for (auto i : result) {
	ss << osdmap.crush->get_item_name(i) << "\n";
      }
      rdata.append(ss.str());
    }
    r = 0;
  } else if (prefix == "osd crush class ls") {
    // List all known device classes (always formatted).
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    f->open_array_section("crush_classes");
    for (auto i : osdmap.crush->class_name)
      f->dump_string("class", i.second);
    f->close_section();
    f->flush(rdata);
  } else if (prefix == "osd crush class ls-osd") {
    // List the osd ids belonging to a device class.
    string name;
    cmd_getval(cct, cmdmap, "class", name);
    set<int> osds;
    osdmap.crush->get_devices_by_class(name, &osds);
    if (f) {
      f->open_array_section("osds");
      for (auto &osd: osds)
	f->dump_int("osd", osd);
      f->close_section();
      f->flush(rdata);
    } else {
      // One id per line, no trailing newline.
      bool first = true;
      for (auto &osd : osds) {
	if (!first)
	  ds << "\n";
	first = false;
	ds << osd;
      }
      rdata.append(ds);
    }
  } else if (prefix == "osd crush get-device-class") {
    // Report the device class of each requested osd id (empty string when
    // the osd has no class assigned).
    vector<string> idvec;
    cmd_getval(cct, cmdmap, "ids", idvec);
    map<int, string> class_by_osd;
    for (auto& id : idvec) {
      ostringstream ts;
      long osd = parse_osd_id(id.c_str(), &ts);
      if (osd < 0) {
	ss << "unable to parse osd id:'" << id << "'";
	r = -EINVAL;
	goto reply;
      }
      auto device_class = osdmap.crush->get_item_class(osd);
      if (device_class)
	class_by_osd[osd] = device_class;
      else
	class_by_osd[osd] = ""; // no class
    }
    if (f) {
      f->open_array_section("osd_device_classes");
      for (auto& i : class_by_osd) {
	f->open_object_section("osd_device_class");
	f->dump_int("osd", i.first);
	f->dump_string("device_class", i.second);
	f->close_section();
      }
      f->close_section();
      f->flush(rdata);
    } else {
      if (class_by_osd.size() == 1) {
	// for single input, make a clean output
	ds << class_by_osd.begin()->second;
      } else {
	// note that we do not group osds by class here
	for (auto it = class_by_osd.begin();
	     it != class_by_osd.end();
	     it++) {
	  ds << "osd." << it->first << ' ' << it->second;
	  if (next(it) != class_by_osd.end())
	    ds << '\n';
	}
      }
      rdata.append(ds);
    }
5894 } else if (prefix == "osd erasure-code-profile ls") {
5895 const auto &profiles = osdmap.get_erasure_code_profiles();
5896 if (f)
5897 f->open_array_section("erasure-code-profiles");
5898 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5899 if (f)
5900 f->dump_string("profile", i->first.c_str());
5901 else
5902 rdata.append(i->first + "\n");
5903 }
5904 if (f) {
5905 f->close_section();
5906 ostringstream rs;
5907 f->flush(rs);
5908 rs << "\n";
5909 rdata.append(rs.str());
5910 }
5911 } else if (prefix == "osd crush weight-set ls") {
5912 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5913 if (f) {
5914 f->open_array_section("weight_sets");
5915 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5916 f->dump_string("pool", "(compat)");
5917 }
5918 for (auto& i : osdmap.crush->choose_args) {
5919 if (i.first >= 0) {
5920 f->dump_string("pool", osdmap.get_pool_name(i.first));
5921 }
5922 }
5923 f->close_section();
5924 f->flush(rdata);
5925 } else {
5926 ostringstream rs;
5927 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5928 rs << "(compat)\n";
5929 }
5930 for (auto& i : osdmap.crush->choose_args) {
5931 if (i.first >= 0) {
5932 rs << osdmap.get_pool_name(i.first) << "\n";
5933 }
5934 }
5935 rdata.append(rs.str());
5936 }
5937 } else if (prefix == "osd crush weight-set dump") {
5938 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5939 "json-pretty"));
5940 osdmap.crush->dump_choose_args(f.get());
5941 f->flush(rdata);
5942 } else if (prefix == "osd erasure-code-profile get") {
5943 string name;
5944 cmd_getval(cct, cmdmap, "name", name);
5945 if (!osdmap.has_erasure_code_profile(name)) {
5946 ss << "unknown erasure code profile '" << name << "'";
5947 r = -ENOENT;
5948 goto reply;
5949 }
5950 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5951 if (f)
5952 f->open_object_section("profile");
5953 for (map<string,string>::const_iterator i = profile.begin();
5954 i != profile.end();
5955 ++i) {
5956 if (f)
5957 f->dump_string(i->first.c_str(), i->second.c_str());
5958 else
5959 rdata.append(i->first + "=" + i->second + "\n");
5960 }
5961 if (f) {
5962 f->close_section();
5963 ostringstream rs;
5964 f->flush(rs);
5965 rs << "\n";
5966 rdata.append(rs.str());
5967 }
5968 } else if (prefix == "osd pool application get") {
5969 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5970 "json-pretty"));
5971 string pool_name;
5972 cmd_getval(cct, cmdmap, "pool", pool_name);
5973 string app;
5974 cmd_getval(cct, cmdmap, "app", app);
5975 string key;
5976 cmd_getval(cct, cmdmap, "key", key);
5977
5978 if (pool_name.empty()) {
5979 // all
5980 f->open_object_section("pools");
5981 for (const auto &pool : osdmap.pools) {
5982 std::string name("<unknown>");
5983 const auto &pni = osdmap.pool_name.find(pool.first);
5984 if (pni != osdmap.pool_name.end())
5985 name = pni->second;
5986 f->open_object_section(name.c_str());
5987 for (auto &app_pair : pool.second.application_metadata) {
5988 f->open_object_section(app_pair.first.c_str());
5989 for (auto &kv_pair : app_pair.second) {
5990 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5991 }
5992 f->close_section();
5993 }
5994 f->close_section(); // name
5995 }
5996 f->close_section(); // pools
5997 f->flush(rdata);
5998 } else {
5999 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6000 if (pool < 0) {
6001 ss << "unrecognized pool '" << pool_name << "'";
6002 r = -ENOENT;
6003 goto reply;
6004 }
6005 auto p = osdmap.get_pg_pool(pool);
6006 // filter by pool
6007 if (app.empty()) {
6008 f->open_object_section(pool_name.c_str());
6009 for (auto &app_pair : p->application_metadata) {
6010 f->open_object_section(app_pair.first.c_str());
6011 for (auto &kv_pair : app_pair.second) {
6012 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6013 }
6014 f->close_section(); // application
6015 }
6016 f->close_section(); // pool_name
6017 f->flush(rdata);
6018 goto reply;
6019 }
6020
6021 auto app_it = p->application_metadata.find(app);
6022 if (app_it == p->application_metadata.end()) {
6023 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6024 r = -ENOENT;
6025 goto reply;
6026 }
6027 // filter by pool + app
6028 if (key.empty()) {
6029 f->open_object_section(app_it->first.c_str());
6030 for (auto &kv_pair : app_it->second) {
6031 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6032 }
6033 f->close_section(); // application
6034 f->flush(rdata);
6035 goto reply;
6036 }
6037 // filter by pool + app + key
6038 auto key_it = app_it->second.find(key);
6039 if (key_it == app_it->second.end()) {
6040 ss << "application '" << app << "' on pool '" << pool_name
6041 << "' does not have key '" << key << "'";
6042 r = -ENOENT;
6043 goto reply;
6044 }
6045 ss << key_it->second << "\n";
6046 rdata.append(ss.str());
6047 ss.str("");
6048 }
6049 } else if (prefix == "osd get-require-min-compat-client") {
6050 ss << ceph_release_name(osdmap.require_min_compat_client) << std::endl;
6051 rdata.append(ss.str());
6052 ss.str("");
6053 goto reply;
6054 } else if (prefix == "osd pool application enable" ||
6055 prefix == "osd pool application disable" ||
6056 prefix == "osd pool application set" ||
6057 prefix == "osd pool application rm") {
6058 bool changed = false;
6059 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6060 if (r != 0) {
6061 // Error, reply.
6062 goto reply;
6063 } else if (changed) {
6064 // Valid mutation, proceed to prepare phase
6065 return false;
6066 } else {
6067 // Idempotent case, reply
6068 goto reply;
6069 }
6070 } else {
6071 // try prepare update
6072 return false;
6073 }
6074
6075 reply:
6076 string rs;
6077 getline(ss, rs);
6078 mon->reply_command(op, r, rs, rdata, get_last_committed());
6079 return true;
6080 }
6081
6082 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6083 {
6084 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6085 osdmap.get_pg_pool(pool_id));
6086 ceph_assert(pool);
6087 pool->set_flag(flags);
6088 }
6089
6090 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6091 {
6092 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6093 osdmap.get_pg_pool(pool_id));
6094 ceph_assert(pool);
6095 pool->unset_flag(flags);
6096 }
6097
6098 string OSDMonitor::make_snap_epoch_key(int64_t pool, epoch_t epoch)
6099 {
6100 char k[80];
6101 snprintf(k, sizeof(k), "removed_epoch_%llu_%08lx",
6102 (unsigned long long)pool, (unsigned long)epoch);
6103 return k;
6104 }
6105
6106 string OSDMonitor::make_snap_key(int64_t pool, snapid_t snap)
6107 {
6108 char k[80];
6109 snprintf(k, sizeof(k), "removed_snap_%llu_%016llx",
6110 (unsigned long long)pool, (unsigned long long)snap);
6111 return k;
6112 }
6113
6114
6115 string OSDMonitor::make_snap_key_value(
6116 int64_t pool, snapid_t snap, snapid_t num,
6117 epoch_t epoch, bufferlist *v)
6118 {
6119 // encode the *last* epoch in the key so that we can use forward
6120 // iteration only to search for an epoch in an interval.
6121 encode(snap, *v);
6122 encode(snap + num, *v);
6123 encode(epoch, *v);
6124 return make_snap_key(pool, snap + num - 1);
6125 }
6126
6127 string OSDMonitor::make_snap_purged_key(int64_t pool, snapid_t snap)
6128 {
6129 char k[80];
6130 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6131 (unsigned long long)pool, (unsigned long long)snap);
6132 return k;
6133 }
6134 string OSDMonitor::make_snap_purged_key_value(
6135 int64_t pool, snapid_t snap, snapid_t num,
6136 epoch_t epoch, bufferlist *v)
6137 {
6138 // encode the *last* epoch in the key so that we can use forward
6139 // iteration only to search for an epoch in an interval.
6140 encode(snap, *v);
6141 encode(snap + num, *v);
6142 encode(epoch, *v);
6143 return make_snap_purged_key(pool, snap + num - 1);
6144 }
6145
// Look up the previously pruned snap interval (if any) that contains
// 'snap' for the given pool.  On success returns 0 and fills
// *begin/*end with the interval bounds ([begin, end), end exclusive);
// returns -ENOENT when no recorded interval covers 'snap'.
int OSDMonitor::lookup_pruned_snap(int64_t pool, snapid_t snap,
				   snapid_t *begin, snapid_t *end)
{
  // Keys embed the *last* snap of each interval (see make_snap_key_value),
  // so lower_bound on the key for 'snap' lands on the first interval
  // whose tail is >= snap, i.e. the only candidate that could contain it.
  string k = make_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    return -ENOENT;
  }
  // NOTE(review): this tests that the key starts with OSD_SNAP_PREFIX
  // (the store column prefix) rather than the "removed_snap_<pool>_"
  // key prefix; confirm it actually rejects keys belonging to other
  // pools/record types as intended.
  if (it->key().find(OSD_SNAP_PREFIX) != 0) {
    return -ENOENT;
  }
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  // the candidate interval may still not cover 'snap'
  if (snap < *begin || snap >= *end) {
    return -ENOENT;
  }
  return 0;
}
6167
// Fold snaps that the OSDs report as fully purged (via the mgr digest)
// into pending_inc.new_purged_snaps, capped at mon_max_snap_prune_per_epoch
// entries per epoch.  Returns true if anything was staged for pruning
// (i.e. the pending map changed), false otherwise.
bool OSDMonitor::try_prune_purged_snaps()
{
  // need a fresh digest from the mgr to know what the OSDs have purged
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  // purged-snap tracking only exists from mimic onward
  if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  // cap the amount of work done per epoch; 0 in conf means "no limit",
  // which we translate into a large finite batch
  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    OSDMap::snap_interval_set_t to_prune;
    // running projection of the total, so the max_prune cap accounts for
    // intervals accumulated from earlier pools as well
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_pruned_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already pruned " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	ceph_assert(pbegin > begin);
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      OSDMap::snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
6250
6251 bool OSDMonitor::update_pools_status()
6252 {
6253 if (!mon->mgrstatmon()->is_readable())
6254 return false;
6255
6256 bool ret = false;
6257
6258 auto& pools = osdmap.get_pools();
6259 for (auto it = pools.begin(); it != pools.end(); ++it) {
6260 const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
6261 if (!pstat)
6262 continue;
6263 const object_stat_sum_t& sum = pstat->stats.sum;
6264 const pg_pool_t &pool = it->second;
6265 const string& pool_name = osdmap.get_pool_name(it->first);
6266
6267 bool pool_is_full =
6268 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
6269 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
6270
6271 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
6272 if (pool_is_full)
6273 continue;
6274
6275 mon->clog->info() << "pool '" << pool_name
6276 << "' no longer out of quota; removing NO_QUOTA flag";
6277 // below we cancel FLAG_FULL too, we'll set it again in
6278 // OSDMonitor::encode_pending if it still fails the osd-full checking.
6279 clear_pool_flags(it->first,
6280 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
6281 ret = true;
6282 } else {
6283 if (!pool_is_full)
6284 continue;
6285
6286 if (pool.quota_max_bytes > 0 &&
6287 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
6288 mon->clog->warn() << "pool '" << pool_name << "' is full"
6289 << " (reached quota's max_bytes: "
6290 << byte_u_t(pool.quota_max_bytes) << ")";
6291 }
6292 if (pool.quota_max_objects > 0 &&
6293 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
6294 mon->clog->warn() << "pool '" << pool_name << "' is full"
6295 << " (reached quota's max_objects: "
6296 << pool.quota_max_objects << ")";
6297 }
6298 // set both FLAG_FULL_QUOTA and FLAG_FULL
6299 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
6300 // since FLAG_FULL should always take precedence
6301 set_pool_flags(it->first,
6302 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
6303 clear_pool_flags(it->first,
6304 pg_pool_t::FLAG_NEARFULL |
6305 pg_pool_t::FLAG_BACKFILLFULL);
6306 ret = true;
6307 }
6308 }
6309 return ret;
6310 }
6311
6312 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
6313 {
6314 op->mark_osdmon_event(__func__);
6315 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
6316 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
6317 MonSession *session = op->get_session();
6318 if (!session)
6319 return -EPERM;
6320 string erasure_code_profile;
6321 stringstream ss;
6322 string rule_name;
6323 int ret = 0;
6324 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
6325 0, 0, 0, 0, 0, 0.0,
6326 erasure_code_profile,
6327 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
6328
6329 if (ret < 0) {
6330 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
6331 }
6332 return ret;
6333 }
6334
6335 int OSDMonitor::crush_rename_bucket(const string& srcname,
6336 const string& dstname,
6337 ostream *ss)
6338 {
6339 int ret;
6340 //
6341 // Avoid creating a pending crush if it does not already exists and
6342 // the rename would fail.
6343 //
6344 if (!_have_pending_crush()) {
6345 ret = _get_stable_crush().can_rename_bucket(srcname,
6346 dstname,
6347 ss);
6348 if (ret)
6349 return ret;
6350 }
6351
6352 CrushWrapper newcrush;
6353 _get_pending_crush(newcrush);
6354
6355 ret = newcrush.rename_bucket(srcname,
6356 dstname,
6357 ss);
6358 if (ret)
6359 return ret;
6360
6361 pending_inc.crush.clear();
6362 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6363 *ss << "renamed bucket " << srcname << " into " << dstname;
6364 return 0;
6365 }
6366
6367 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
6368 {
6369 string replacement = "";
6370
6371 if (plugin == "jerasure_generic" ||
6372 plugin == "jerasure_sse3" ||
6373 plugin == "jerasure_sse4" ||
6374 plugin == "jerasure_neon") {
6375 replacement = "jerasure";
6376 } else if (plugin == "shec_generic" ||
6377 plugin == "shec_sse3" ||
6378 plugin == "shec_sse4" ||
6379 plugin == "shec_neon") {
6380 replacement = "shec";
6381 }
6382
6383 if (replacement != "") {
6384 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
6385 << plugin << " that has been deprecated. Please use "
6386 << replacement << " instead." << dendl;
6387 }
6388 }
6389
6390 int OSDMonitor::normalize_profile(const string& profilename,
6391 ErasureCodeProfile &profile,
6392 bool force,
6393 ostream *ss)
6394 {
6395 ErasureCodeInterfaceRef erasure_code;
6396 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6397 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
6398 check_legacy_ec_plugin(plugin->second, profilename);
6399 int err = instance.factory(plugin->second,
6400 g_conf().get_val<std::string>("erasure_code_dir"),
6401 profile, &erasure_code, ss);
6402 if (err) {
6403 return err;
6404 }
6405
6406 err = erasure_code->init(profile, ss);
6407 if (err) {
6408 return err;
6409 }
6410
6411 auto it = profile.find("stripe_unit");
6412 if (it != profile.end()) {
6413 string err_str;
6414 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
6415 if (!err_str.empty()) {
6416 *ss << "could not parse stripe_unit '" << it->second
6417 << "': " << err_str << std::endl;
6418 return -EINVAL;
6419 }
6420 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6421 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
6422 if (chunk_size != stripe_unit) {
6423 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
6424 << "alignment. Would be padded to " << chunk_size
6425 << std::endl;
6426 return -EINVAL;
6427 }
6428 if ((stripe_unit % 4096) != 0 && !force) {
6429 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
6430 << "use --force to override this check" << std::endl;
6431 return -EINVAL;
6432 }
6433 }
6434 return 0;
6435 }
6436
6437 int OSDMonitor::crush_rule_create_erasure(const string &name,
6438 const string &profile,
6439 int *rule,
6440 ostream *ss)
6441 {
6442 int ruleid = osdmap.crush->get_rule_id(name);
6443 if (ruleid != -ENOENT) {
6444 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
6445 return -EEXIST;
6446 }
6447
6448 CrushWrapper newcrush;
6449 _get_pending_crush(newcrush);
6450
6451 ruleid = newcrush.get_rule_id(name);
6452 if (ruleid != -ENOENT) {
6453 *rule = newcrush.get_rule_mask_ruleset(ruleid);
6454 return -EALREADY;
6455 } else {
6456 ErasureCodeInterfaceRef erasure_code;
6457 int err = get_erasure_code(profile, &erasure_code, ss);
6458 if (err) {
6459 *ss << "failed to load plugin using profile " << profile << std::endl;
6460 return err;
6461 }
6462
6463 err = erasure_code->create_rule(name, newcrush, ss);
6464 erasure_code.reset();
6465 if (err < 0)
6466 return err;
6467 *rule = err;
6468 pending_inc.crush.clear();
6469 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6470 return 0;
6471 }
6472 }
6473
6474 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
6475 ErasureCodeInterfaceRef *erasure_code,
6476 ostream *ss) const
6477 {
6478 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
6479 return -EAGAIN;
6480 ErasureCodeProfile profile =
6481 osdmap.get_erasure_code_profile(erasure_code_profile);
6482 ErasureCodeProfile::const_iterator plugin =
6483 profile.find("plugin");
6484 if (plugin == profile.end()) {
6485 *ss << "cannot determine the erasure code plugin"
6486 << " because there is no 'plugin' entry in the erasure_code_profile "
6487 << profile << std::endl;
6488 return -EINVAL;
6489 }
6490 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
6491 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6492 return instance.factory(plugin->second,
6493 g_conf().get_val<std::string>("erasure_code_dir"),
6494 profile, erasure_code, ss);
6495 }
6496
6497 int OSDMonitor::check_cluster_features(uint64_t features,
6498 stringstream &ss)
6499 {
6500 stringstream unsupported_ss;
6501 int unsupported_count = 0;
6502 if ((mon->get_quorum_con_features() & features) != features) {
6503 unsupported_ss << "the monitor cluster";
6504 ++unsupported_count;
6505 }
6506
6507 set<int32_t> up_osds;
6508 osdmap.get_up_osds(up_osds);
6509 for (set<int32_t>::iterator it = up_osds.begin();
6510 it != up_osds.end(); ++it) {
6511 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
6512 if ((xi.features & features) != features) {
6513 if (unsupported_count > 0)
6514 unsupported_ss << ", ";
6515 unsupported_ss << "osd." << *it;
6516 unsupported_count ++;
6517 }
6518 }
6519
6520 if (unsupported_count > 0) {
6521 ss << "features " << features << " unsupported by: "
6522 << unsupported_ss.str();
6523 return -ENOTSUP;
6524 }
6525
6526 // check pending osd state, too!
6527 for (map<int32_t,osd_xinfo_t>::const_iterator p =
6528 pending_inc.new_xinfo.begin();
6529 p != pending_inc.new_xinfo.end(); ++p) {
6530 const osd_xinfo_t &xi = p->second;
6531 if ((xi.features & features) != features) {
6532 dout(10) << __func__ << " pending osd." << p->first
6533 << " features are insufficient; retry" << dendl;
6534 return -EAGAIN;
6535 }
6536 }
6537
6538 return 0;
6539 }
6540
// Check whether a proposed crush map is safe to commit: apply it to a
// scratch copy of the osdmap and verify that (a) the resulting map does
// not demand a newer client than the configured require_min_compat_client
// and (b) the mon quorum and all up OSDs support the features the new
// map requires.  Returns false (with an explanation in ss) if the crush
// change must be rejected.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // build a throwaway map with the new crush applied; the real
  // pending_inc/osdmap are left untouched
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client > 0) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << ceph_release_name(mv)
	 << " but require_min_compat_client is "
	 << ceph_release_name(newmap.require_min_compat_client);
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
6574
6575 bool OSDMonitor::erasure_code_profile_in_use(
6576 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
6577 const string &profile,
6578 ostream *ss)
6579 {
6580 bool found = false;
6581 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
6582 p != pools.end();
6583 ++p) {
6584 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
6585 *ss << osdmap.pool_name[p->first] << " ";
6586 found = true;
6587 }
6588 }
6589 if (found) {
6590 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
6591 }
6592 return found;
6593 }
6594
6595 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
6596 map<string,string> *erasure_code_profile_map,
6597 ostream *ss)
6598 {
6599 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
6600 get_json_str_map,
6601 *ss,
6602 erasure_code_profile_map,
6603 true);
6604 if (r)
6605 return r;
6606 ceph_assert((*erasure_code_profile_map).count("plugin"));
6607 string default_plugin = (*erasure_code_profile_map)["plugin"];
6608 map<string,string> user_map;
6609 for (vector<string>::const_iterator i = erasure_code_profile.begin();
6610 i != erasure_code_profile.end();
6611 ++i) {
6612 size_t equal = i->find('=');
6613 if (equal == string::npos) {
6614 user_map[*i] = string();
6615 (*erasure_code_profile_map)[*i] = string();
6616 } else {
6617 const string key = i->substr(0, equal);
6618 equal++;
6619 const string value = i->substr(equal);
6620 if (key.find("ruleset-") == 0) {
6621 *ss << "property '" << key << "' is no longer supported; try "
6622 << "'crush-" << key.substr(8) << "' instead";
6623 return -EINVAL;
6624 }
6625 user_map[key] = value;
6626 (*erasure_code_profile_map)[key] = value;
6627 }
6628 }
6629
6630 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
6631 (*erasure_code_profile_map) = user_map;
6632
6633 return 0;
6634 }
6635
6636 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
6637 const string &erasure_code_profile,
6638 uint8_t repl_size,
6639 unsigned *size, unsigned *min_size,
6640 ostream *ss)
6641 {
6642 int err = 0;
6643 switch (pool_type) {
6644 case pg_pool_t::TYPE_REPLICATED:
6645 if (repl_size == 0) {
6646 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
6647 }
6648 *size = repl_size;
6649 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
6650 break;
6651 case pg_pool_t::TYPE_ERASURE:
6652 {
6653 ErasureCodeInterfaceRef erasure_code;
6654 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6655 if (err == 0) {
6656 *size = erasure_code->get_chunk_count();
6657 *min_size =
6658 erasure_code->get_data_chunk_count() +
6659 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
6660 assert(*min_size <= *size);
6661 assert(*min_size >= erasure_code->get_data_chunk_count());
6662 }
6663 }
6664 break;
6665 default:
6666 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
6667 err = -EINVAL;
6668 break;
6669 }
6670 return err;
6671 }
6672
// Compute the stripe width for a new pool.
//
// Replicated pools do not stripe: *stripe_width is left untouched.
// For erasure pools the width is data_chunks * chunk_size, where the
// stripe unit is taken from the profile's "stripe_unit" entry when
// present, else from osd_pool_erasure_code_stripe_unit.
//
// @return 0 on success, negative errno on failure (unknown pool type,
//         or the erasure profile could not be loaded)
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	// a stored profile's stripe_unit is expected to have been
	// validated when the profile was created (normalize_profile),
	// hence a hard assert rather than an error return here
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
6711
/**
 * Resolve the CRUSH rule to use for a new pool.
 *
 * If *crush_rule is already >= 0 it is only validated for existence;
 * otherwise a rule is selected (replicated pools) or created (erasure
 * pools) according to the pool type.
 *
 * @param pool_type pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
 * @param erasure_code_profile profile used when creating an erasure rule
 * @param rule_name explicit rule name, or "" for the default
 * @param crush_rule in: rule id, or <0 to select one; out: resolved id
 * @param ss human-readable error message, if any
 * @return 0 on success; -EAGAIN when a newly created or still-pending
 *         rule must be proposed/committed first; negative errno on error
 */
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule is already committed, safe to use right away
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule id; just check it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
6773
6774 int OSDMonitor::get_crush_rule(const string &rule_name,
6775 int *crush_rule,
6776 ostream *ss)
6777 {
6778 int ret;
6779 ret = osdmap.crush->get_rule_id(rule_name);
6780 if (ret != -ENOENT) {
6781 // found it, use it
6782 *crush_rule = ret;
6783 } else {
6784 CrushWrapper newcrush;
6785 _get_pending_crush(newcrush);
6786
6787 ret = newcrush.get_rule_id(rule_name);
6788 if (ret != -ENOENT) {
6789 // found it, wait for it to be proposed
6790 dout(20) << __func__ << ": rule " << rule_name
6791 << " try again" << dendl;
6792 return -EAGAIN;
6793 } else {
6794 // Cannot find it , return error
6795 *ss << "specified rule " << rule_name << " doesn't exist";
6796 return ret;
6797 }
6798 }
6799 return 0;
6800 }
6801
6802 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6803 {
6804 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
6805 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6806 auto max_pgs = max_pgs_per_osd * num_osds;
6807 uint64_t projected = 0;
6808 if (pool < 0) {
6809 projected += pg_num * size;
6810 }
6811 for (const auto& i : osdmap.get_pools()) {
6812 if (i.first == pool) {
6813 projected += pg_num * size;
6814 } else {
6815 projected += i.second.get_pg_num_target() * i.second.get_size();
6816 }
6817 }
6818 if (projected > max_pgs) {
6819 if (pool >= 0) {
6820 *ss << "pool id " << pool;
6821 }
6822 *ss << " pg_num " << pg_num << " size " << size
6823 << " would mean " << projected
6824 << " total pgs, which exceeds max " << max_pgs
6825 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6826 << " * num_in_osds " << num_osds << ")";
6827 return -ERANGE;
6828 }
6829 return 0;
6830 }
6831
6832 /**
6833 * @param name The name of the new pool
6834 * @param crush_rule The crush rule to use. If <0, will use the system default
6835 * @param crush_rule_name The crush rule to use, if crush_rulset <0
6836 * @param pg_num The pg_num to use. If set to 0, will use the system default
6837 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6838 * @param repl_size Replication factor, or 0 for default
6839 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6840 * @param pool_type TYPE_ERASURE, or TYPE_REP
6841 * @param expected_num_objects expected number of objects on the pool
6842 * @param fast_read fast read type.
6843 * @param ss human readable error message, if any.
6844 *
6845 * @return 0 on success, negative errno on failure.
6846 */
int OSDMonitor::prepare_new_pool(string& name,
                                 int crush_rule,
                                 const string &crush_rule_name,
                                 unsigned pg_num, unsigned pgp_num,
                                 unsigned pg_num_min,
                                 const uint64_t repl_size,
                                 const uint64_t target_size_bytes,
                                 const float target_size_ratio,
                                 const string &erasure_code_profile,
                                 const unsigned pool_type,
                                 const uint64_t expected_num_objects,
                                 FastReadType fast_read,
                                 ostream *ss)
{
  // A pool must have a non-empty name.
  if (name.length() == 0)
    return -EINVAL;
  // Fill in configured defaults for any pg/pgp counts left at 0 by the
  // caller; pgp_num falls back to pg_num if still unset.
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  // Validate the requested counts against the configured ceiling and
  // against each other (pgp_num may never exceed pg_num).
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read is an erasure-coded-pool feature; reject it for replicated.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // Resolve (or create) the crush rule this pool will use; crush_rule is
  // updated in place when the rule is looked up by name.
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
                              crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  if (g_conf()->mon_osd_crush_smoke_test) {
    // Run a bounded mapping test (x in [0,50]) against the pending crush
    // map in a forked child to catch broken rules before committing.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
               << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  // Derive size/min_size from the replica count or the EC profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
                        &size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Make sure the cluster-wide PG budget allows this many new PGs.
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the tri-state fast_read request into a concrete bool for
  // erasure-coded pools (replicated pools always leave it off).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
        fread = false;
        break;
      case FAST_READ_ON:
        fread = true;
        break;
      case FAST_READ_DEFAULT:
        fread = g_conf()->osd_pool_default_ec_fast_read;
        break;
      default:
        *ss << "invalid fast_read setting: " << fast_read;
        return -EINVAL;
    }
  }

  // If a pool with this name is already pending creation in this
  // proposal, treat the request as idempotent and succeed.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate a new pool id past the current maximum.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  // Seed pool flags from the configured defaults.
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // CREATING blocks pg_num changes until the initial PGs exist.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;

  {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
    // Fall back to mode 0 if the configured name is unrecognized.
    pi->pg_autoscale_mode = m >= 0 ? m : 0;
  }
  // Start with at most mon_osd_max_initial_pgs actual PGs; the target
  // keeps the full requested pg_num and the mgr grows toward it later.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults; ratios are stored in micro units (1e6 = 100%).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
7041
7042 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7043 {
7044 op->mark_osdmon_event(__func__);
7045 ostringstream ss;
7046 if (pending_inc.new_flags < 0)
7047 pending_inc.new_flags = osdmap.get_flags();
7048 pending_inc.new_flags |= flag;
7049 ss << OSDMap::get_flag_string(flag) << " is set";
7050 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7051 get_last_committed() + 1));
7052 return true;
7053 }
7054
7055 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7056 {
7057 op->mark_osdmon_event(__func__);
7058 ostringstream ss;
7059 if (pending_inc.new_flags < 0)
7060 pending_inc.new_flags = osdmap.get_flags();
7061 pending_inc.new_flags &= ~flag;
7062 ss << OSDMap::get_flag_string(flag) << " is unset";
7063 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7064 get_last_committed() + 1));
7065 return true;
7066 }
7067
/**
 * Handle 'osd pool set <pool> <var> <val>': validate the requested
 * change against the (possibly already-pending) pool state and stage
 * the updated pg_pool_t in pending_inc.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", ...).
 * @param ss human readable result/error message.
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cct, cmdmap, "var", var);

  // Start from the committed pool, but prefer an already-pending copy so
  // multiple changes in one proposal compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor).  parse out int or float values from the
  // string as needed.  however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0;  // micro-f
  cmd_getval(cct, cmdmap, "val", val);

  // parse string as both int and float; different fields use different types.
  // interr/floaterr are non-empty when the respective parse failed.
  n = strict_strtoll(val.c_str(), 10, &interr);
  f = strict_strtod(val.c_str(), &floaterr);
  uf = llrintl(f * (double)1000000.0);

  // Cache-tier-only settings may not be changed on a pool that is not a
  // tier.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    // More replicas means more PG instances; re-check the PG budget.
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // Keep the invariant min_size <= size.
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For EC pools the lower bound is k, the number of data chunks.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // Directly adjust the actual pg_num (normally driven by the mgr).
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      // Splitting.
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // Merging: only stage pg_num_pending; the actual decrease happens
      // once the PGs are ready to merge.
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
	          g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      // Growing: check the PG budget and warn about cache-pool splits.
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cct,cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    // set targets; mgr will adjust pg_num_actual and pgp_num later.
    // make pgp_num track pg_num if it already matches.  if it is set
    // differently, leave it different and let the user control it
    // manually.
    if (p.get_pg_num_target() == p.get_pgp_num_target()) {
      p.set_pgp_num_target(n);
    }
    p.set_pg_num_target(n);
  } else if (var == "pgp_num_actual") {
    // Directly adjust the actual pgp_num (normally driven by the mgr).
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    p.set_pgp_num_target(n);
  } else if (var == "pg_autoscale_mode") {
    n = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (n < 0) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    p.pg_autoscale_mode = n;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    // The rule must be usable with this pool's type and size.
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // Simple boolean pool flags.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // Toggling hashpspool remaps every PG, so require explicit consent.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    // False-positive probability only applies to bloom hit sets.
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // One-way switch: once on GMT hit sets there is no going back.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      // One-way switch as well.
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // Stored in micro units (uf = f * 1e6).
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic pool options stored in p.opts; "unset" removes the option.
    // Each option first gets value validation, then is stored below
    // according to its declared type (STR/INT/DOUBLE).
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // Store (or remove) the validated option by its declared type.
    // Note: a value of 0 (or 0.0) unsets INT/DOUBLE options.
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // Stage the modified pool in the pending incremental.
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
7663
// Prepare-phase entry point for 'osd pool application ...' commands:
// applies the change to the pending pool state (modified flag not needed).
int OSDMonitor::prepare_command_pool_application(const string &prefix,
						 const cmdmap_t& cmdmap,
						 stringstream& ss)
{
  return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
}
7670
// Preprocess-phase entry point for 'osd pool application ...' commands:
// only validates and reports via *modified whether the command would
// change pool state; does not touch pending_inc.
int OSDMonitor::preprocess_command_pool_application(const string &prefix,
						    const cmdmap_t& cmdmap,
						    stringstream& ss,
						    bool *modified)
{
  return _command_pool_application(prefix, cmdmap, ss, modified, false);
}
7678
7679
7680 /**
7681 * Common logic for preprocess and prepare phases of pool application
7682 * tag commands. In preprocess mode we're only detecting invalid
7683 * commands, and determining whether it was a modification or a no-op.
7684 * In prepare mode we're actually updating the pending state.
7685 */
7686 int OSDMonitor::_command_pool_application(const string &prefix,
7687 const cmdmap_t& cmdmap,
7688 stringstream& ss,
7689 bool *modified,
7690 bool preparing)
7691 {
7692 string pool_name;
7693 cmd_getval(cct, cmdmap, "pool", pool_name);
7694 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
7695 if (pool < 0) {
7696 ss << "unrecognized pool '" << pool_name << "'";
7697 return -ENOENT;
7698 }
7699
7700 pg_pool_t p = *osdmap.get_pg_pool(pool);
7701 if (preparing) {
7702 if (pending_inc.new_pools.count(pool)) {
7703 p = pending_inc.new_pools[pool];
7704 }
7705 }
7706
7707 string app;
7708 cmd_getval(cct, cmdmap, "app", app);
7709 bool app_exists = (p.application_metadata.count(app) > 0);
7710
7711 string key;
7712 cmd_getval(cct, cmdmap, "key", key);
7713 if (key == "all") {
7714 ss << "key cannot be 'all'";
7715 return -EINVAL;
7716 }
7717
7718 string value;
7719 cmd_getval(cct, cmdmap, "value", value);
7720 if (value == "all") {
7721 ss << "value cannot be 'all'";
7722 return -EINVAL;
7723 }
7724
7725 if (boost::algorithm::ends_with(prefix, "enable")) {
7726 if (app.empty()) {
7727 ss << "application name must be provided";
7728 return -EINVAL;
7729 }
7730
7731 if (p.is_tier()) {
7732 ss << "application must be enabled on base tier";
7733 return -EINVAL;
7734 }
7735
7736 bool force = false;
7737 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7738
7739 if (!app_exists && !p.application_metadata.empty() && !force) {
7740 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
7741 << "application; pass --yes-i-really-mean-it to proceed anyway";
7742 return -EPERM;
7743 }
7744
7745 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
7746 ss << "too many enabled applications on pool '" << pool_name << "'; "
7747 << "max " << MAX_POOL_APPLICATIONS;
7748 return -EINVAL;
7749 }
7750
7751 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
7752 ss << "application name '" << app << "' too long; max length "
7753 << MAX_POOL_APPLICATION_LENGTH;
7754 return -EINVAL;
7755 }
7756
7757 if (!app_exists) {
7758 p.application_metadata[app] = {};
7759 }
7760 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
7761
7762 } else if (boost::algorithm::ends_with(prefix, "disable")) {
7763 bool force = false;
7764 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7765
7766 if (!force) {
7767 ss << "Are you SURE? Disabling an application within a pool might result "
7768 << "in loss of application functionality; pass "
7769 << "--yes-i-really-mean-it to proceed anyway";
7770 return -EPERM;
7771 }
7772
7773 if (!app_exists) {
7774 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7775 << "'";
7776 return 0; // idempotent
7777 }
7778
7779 p.application_metadata.erase(app);
7780 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
7781
7782 } else if (boost::algorithm::ends_with(prefix, "set")) {
7783 if (p.is_tier()) {
7784 ss << "application metadata must be set on base tier";
7785 return -EINVAL;
7786 }
7787
7788 if (!app_exists) {
7789 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7790 << "'";
7791 return -ENOENT;
7792 }
7793
7794 string key;
7795 cmd_getval(cct, cmdmap, "key", key);
7796
7797 if (key.empty()) {
7798 ss << "key must be provided";
7799 return -EINVAL;
7800 }
7801
7802 auto &app_keys = p.application_metadata[app];
7803 if (app_keys.count(key) == 0 &&
7804 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
7805 ss << "too many keys set for application '" << app << "' on pool '"
7806 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
7807 return -EINVAL;
7808 }
7809
7810 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
7811 ss << "key '" << app << "' too long; max length "
7812 << MAX_POOL_APPLICATION_LENGTH;
7813 return -EINVAL;
7814 }
7815
7816 string value;
7817 cmd_getval(cct, cmdmap, "value", value);
7818 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
7819 ss << "value '" << value << "' too long; max length "
7820 << MAX_POOL_APPLICATION_LENGTH;
7821 return -EINVAL;
7822 }
7823
7824 p.application_metadata[app][key] = value;
7825 ss << "set application '" << app << "' key '" << key << "' to '"
7826 << value << "' on pool '" << pool_name << "'";
7827 } else if (boost::algorithm::ends_with(prefix, "rm")) {
7828 if (!app_exists) {
7829 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7830 << "'";
7831 return -ENOENT;
7832 }
7833
7834 string key;
7835 cmd_getval(cct, cmdmap, "key", key);
7836 auto it = p.application_metadata[app].find(key);
7837 if (it == p.application_metadata[app].end()) {
7838 ss << "application '" << app << "' on pool '" << pool_name
7839 << "' does not have key '" << key << "'";
7840 return 0; // idempotent
7841 }
7842
7843 p.application_metadata[app].erase(it);
7844 ss << "removed application '" << app << "' key '" << key << "' on pool '"
7845 << pool_name << "'";
7846 } else {
7847 ceph_abort();
7848 }
7849
7850 if (preparing) {
7851 p.last_change = pending_inc.epoch;
7852 pending_inc.new_pools[pool] = p;
7853 }
7854
7855 // Because we fell through this far, we didn't hit no-op cases,
7856 // so pool was definitely modified
7857 if (modified != nullptr) {
7858 *modified = true;
7859 }
7860
7861 return 0;
7862 }
7863
7864 int OSDMonitor::_prepare_command_osd_crush_remove(
7865 CrushWrapper &newcrush,
7866 int32_t id,
7867 int32_t ancestor,
7868 bool has_ancestor,
7869 bool unlink_only)
7870 {
7871 int err = 0;
7872
7873 if (has_ancestor) {
7874 err = newcrush.remove_item_under(cct, id, ancestor,
7875 unlink_only);
7876 } else {
7877 err = newcrush.remove_item(cct, id, unlink_only);
7878 }
7879 return err;
7880 }
7881
7882 void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
7883 {
7884 pending_inc.crush.clear();
7885 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7886 }
7887
7888 int OSDMonitor::prepare_command_osd_crush_remove(
7889 CrushWrapper &newcrush,
7890 int32_t id,
7891 int32_t ancestor,
7892 bool has_ancestor,
7893 bool unlink_only)
7894 {
7895 int err = _prepare_command_osd_crush_remove(
7896 newcrush, id, ancestor,
7897 has_ancestor, unlink_only);
7898
7899 if (err < 0)
7900 return err;
7901
7902 ceph_assert(err == 0);
7903 do_osd_crush_remove(newcrush);
7904
7905 return 0;
7906 }
7907
7908 int OSDMonitor::prepare_command_osd_remove(int32_t id)
7909 {
7910 if (osdmap.is_up(id)) {
7911 return -EBUSY;
7912 }
7913
7914 pending_inc.new_state[id] = osdmap.get_state(id);
7915 pending_inc.new_uuid[id] = uuid_d();
7916 pending_metadata_rm.insert(id);
7917 pending_metadata.erase(id);
7918
7919 return 0;
7920 }
7921
7922 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
7923 {
7924 ceph_assert(existing_id);
7925 *existing_id = -1;
7926
7927 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
7928 if (!osdmap.exists(i) &&
7929 pending_inc.new_up_client.count(i) == 0 &&
7930 (pending_inc.new_state.count(i) == 0 ||
7931 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
7932 *existing_id = i;
7933 return -1;
7934 }
7935 }
7936
7937 if (pending_inc.new_max_osd < 0) {
7938 return osdmap.get_max_osd();
7939 }
7940 return pending_inc.new_max_osd;
7941 }
7942
// Stage the creation of an osd in the pending incremental.
//
// Preconditions: the (id, uuid) pair has already been vetted by
// validate_osd_create() or equivalent -- this function asserts rather
// than returning errors.
//
// @param id            requested osd id, or -1 to have one chosen
// @param uuid          osd uuid; may be zero for legacy `osd create`
// @param device_class  if non-empty, also set the osd's crush device class
// @param new_id [out]  the id actually used; always set to >= 0 on return
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; reuse its id (must match `id` if given)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a hole in the id space; exactly one of existing/allocated
    // may be valid at a time
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    // _allocate_osd_id must report a hole or a fresh id
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage a crush update that records the osd's device class
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8031
8032 int OSDMonitor::validate_osd_create(
8033 const int32_t id,
8034 const uuid_d& uuid,
8035 const bool check_osd_exists,
8036 int32_t* existing_id,
8037 stringstream& ss)
8038 {
8039
8040 dout(10) << __func__ << " id " << id << " uuid " << uuid
8041 << " check_osd_exists " << check_osd_exists << dendl;
8042
8043 ceph_assert(existing_id);
8044
8045 if (id < 0 && uuid.is_zero()) {
8046 // we have nothing to validate
8047 *existing_id = -1;
8048 return 0;
8049 } else if (uuid.is_zero()) {
8050 // we have an id but we will ignore it - because that's what
8051 // `osd create` does.
8052 return 0;
8053 }
8054
8055 /*
8056 * This function will be used to validate whether we are able to
8057 * create a new osd when the `uuid` is specified.
8058 *
8059 * It will be used by both `osd create` and `osd new`, as the checks
8060 * are basically the same when it pertains to osd id and uuid validation.
8061 * However, `osd create` presumes an `uuid` is optional, for legacy
8062 * reasons, while `osd new` requires the `uuid` to be provided. This
8063 * means that `osd create` will not be idempotent if an `uuid` is not
8064 * provided, but we will always guarantee the idempotency of `osd new`.
8065 */
8066
8067 ceph_assert(!uuid.is_zero());
8068 if (pending_inc.identify_osd(uuid) >= 0) {
8069 // osd is about to exist
8070 return -EAGAIN;
8071 }
8072
8073 int32_t i = osdmap.identify_osd(uuid);
8074 if (i >= 0) {
8075 // osd already exists
8076 if (id >= 0 && i != id) {
8077 ss << "uuid " << uuid << " already in use for different id " << i;
8078 return -EEXIST;
8079 }
8080 // return a positive errno to distinguish between a blocking error
8081 // and an error we consider to not be a problem (i.e., this would be
8082 // an idempotent operation).
8083 *existing_id = i;
8084 return EEXIST;
8085 }
8086 // i < 0
8087 if (id >= 0) {
8088 if (pending_inc.new_state.count(id)) {
8089 // osd is about to exist
8090 return -EAGAIN;
8091 }
8092 // we may not care if an osd exists if we are recreating a previously
8093 // destroyed osd.
8094 if (check_osd_exists && osdmap.exists(id)) {
8095 ss << "id " << id << " already in use and does not match uuid "
8096 << uuid;
8097 return -EINVAL;
8098 }
8099 }
8100 return 0;
8101 }
8102
8103 int OSDMonitor::prepare_command_osd_create(
8104 const int32_t id,
8105 const uuid_d& uuid,
8106 int32_t* existing_id,
8107 stringstream& ss)
8108 {
8109 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8110 ceph_assert(existing_id);
8111 if (osdmap.is_destroyed(id)) {
8112 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8113 "instead.";
8114 return -EINVAL;
8115 }
8116
8117 if (uuid.is_zero()) {
8118 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8119 }
8120
8121 return validate_osd_create(id, uuid, true, existing_id, ss);
8122 }
8123
// Handle `osd new`: create a brand-new osd, or recreate a previously
// destroyed one, staging osdmap, auth, and (optionally) config-key
// updates.
//
// @param op      the originating monitor op (paxos must be plugged)
// @param cmdmap  parsed command arguments (uuid required, id optional)
// @param params  optional secrets/attributes: cephx_secret,
//                cephx_lockbox_secret, dmcrypt_key, crush_device_class
// @param ss      human-readable output / error detail
// @param f       optional formatter for structured output
// @return 0 on staged creation, positive EEXIST when fully idempotent
//         (nothing staged), negative errno on error.
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cct, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // a hole in the id space was found instead of a fresh id
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    // positive EEXIST: caller knows nothing was staged
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the two was supplied; they must come together
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // incremental state bits toggle the current state when applied, so
    // OR-ing DESTROYED here clears the destroyed flag on this osd
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
8388
8389 bool OSDMonitor::prepare_command(MonOpRequestRef op)
8390 {
8391 op->mark_osdmon_event(__func__);
8392 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8393 stringstream ss;
8394 cmdmap_t cmdmap;
8395 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
8396 string rs = ss.str();
8397 mon->reply_command(op, -EINVAL, rs, get_last_committed());
8398 return true;
8399 }
8400
8401 MonSession *session = op->get_session();
8402 if (!session) {
8403 derr << __func__ << " no session" << dendl;
8404 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
8405 return true;
8406 }
8407
8408 return prepare_command_impl(op, cmdmap);
8409 }
8410
8411 static int parse_reweights(CephContext *cct,
8412 const cmdmap_t& cmdmap,
8413 const OSDMap& osdmap,
8414 map<int32_t, uint32_t>* weights)
8415 {
8416 string weights_str;
8417 if (!cmd_getval(cct, cmdmap, "weights", weights_str)) {
8418 return -EINVAL;
8419 }
8420 std::replace(begin(weights_str), end(weights_str), '\'', '"');
8421 json_spirit::mValue json_value;
8422 if (!json_spirit::read(weights_str, json_value)) {
8423 return -EINVAL;
8424 }
8425 if (json_value.type() != json_spirit::obj_type) {
8426 return -EINVAL;
8427 }
8428 const auto obj = json_value.get_obj();
8429 try {
8430 for (auto& osd_weight : obj) {
8431 auto osd_id = std::stoi(osd_weight.first);
8432 if (!osdmap.exists(osd_id)) {
8433 return -ENOENT;
8434 }
8435 if (osd_weight.second.type() != json_spirit::str_type) {
8436 return -EINVAL;
8437 }
8438 auto weight = std::stoul(osd_weight.second.get_str());
8439 weights->insert({osd_id, weight});
8440 }
8441 } catch (const std::logic_error& e) {
8442 return -EINVAL;
8443 }
8444 return 0;
8445 }
8446
// Stage destruction of osd `id`: revoke its auth entities, drop its
// config-key data, and mark it DESTROYED in the pending incremental.
// Idempotent per service: auth/config-key state that is already gone is
// tolerated.  The caller is responsible for proposing the change.
//
// @return 0 on success (or already destroyed), -ENOENT if the osd does
//         not exist at all, other negative errno on auth failure.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // track, per service, whether there is actually anything left to remove
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // no auth entities for this osd; nothing to revoke
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // -ENOENT is the only failure validate_osd_destroy can report here;
    // it means there is no config-key data to remove
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // mark destroyed and wipe the uuid in the pending osdmap
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
8518
// Purge osd `id`: remove it from crush, destroy it (auth + config keys),
// and remove it from the osdmap, all staged into the pending incremental.
// The osd must be down.  The caller is responsible for proposing.
//
// @return 0 on success, -ENOENT if there was nothing to purge at all,
//         other negative errno on failure (with nothing staged).
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: attempt the crush removal on a scratch copy; staging is
  // deferred to step 4 so a later failure cannot leave a partial update
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush; may already have been purged
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: revoke auth/config-key state and mark DESTROYED
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: now stage the crush update prepared in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
8587
8588 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
8589 const cmdmap_t& cmdmap)
8590 {
8591 op->mark_osdmon_event(__func__);
8592 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8593 bool ret = false;
8594 stringstream ss;
8595 string rs;
8596 bufferlist rdata;
8597 int err = 0;
8598
8599 string format;
8600 cmd_getval(cct, cmdmap, "format", format, string("plain"));
8601 boost::scoped_ptr<Formatter> f(Formatter::create(format));
8602
8603 string prefix;
8604 cmd_getval(cct, cmdmap, "prefix", prefix);
8605
8606 int64_t osdid;
8607 string osd_name;
8608 bool osdid_present = false;
8609 if (prefix != "osd pg-temp" &&
8610 prefix != "osd pg-upmap" &&
8611 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
8612 osdid_present = cmd_getval(cct, cmdmap, "id", osdid);
8613 }
8614 if (osdid_present) {
8615 ostringstream oss;
8616 oss << "osd." << osdid;
8617 osd_name = oss.str();
8618 }
8619
8620 // Even if there's a pending state with changes that could affect
8621 // a command, considering that said state isn't yet committed, we
8622 // just don't care about those changes if the command currently being
8623 // handled acts as a no-op against the current committed state.
8624 // In a nutshell, we assume this command happens *before*.
8625 //
8626 // Let me make this clearer:
8627 //
8628 // - If we have only one client, and that client issues some
8629 // operation that would conflict with this operation but is
8630 // still on the pending state, then we would be sure that said
8631 // operation wouldn't have returned yet, so the client wouldn't
8632 // issue this operation (unless the client didn't wait for the
8633 // operation to finish, and that would be the client's own fault).
8634 //
8635 // - If we have more than one client, each client will observe
8636 // whatever is the state at the moment of the commit. So, if we
8637 // have two clients, one issuing an unlink and another issuing a
8638 // link, and if the link happens while the unlink is still on the
8639 // pending state, from the link's point-of-view this is a no-op.
8640 // If different clients are issuing conflicting operations and
8641 // they care about that, then the clients should make sure they
8642 // enforce some kind of concurrency mechanism -- from our
8643 // perspective that's what Douglas Adams would call an SEP.
8644 //
8645 // This should be used as a general guideline for most commands handled
8646 // in this function. Adapt as you see fit, but please bear in mind that
8647 // this is the expected behavior.
8648
8649
8650 if (prefix == "osd setcrushmap" ||
8651 (prefix == "osd crush set" && !osdid_present)) {
8652 if (pending_inc.crush.length()) {
8653 dout(10) << __func__ << " waiting for pending crush update " << dendl;
8654 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8655 return true;
8656 }
8657 dout(10) << "prepare_command setting new crush map" << dendl;
8658 bufferlist data(m->get_data());
8659 CrushWrapper crush;
8660 try {
8661 auto bl = data.cbegin();
8662 crush.decode(bl);
8663 }
8664 catch (const std::exception &e) {
8665 err = -EINVAL;
8666 ss << "Failed to parse crushmap: " << e.what();
8667 goto reply;
8668 }
8669
8670 int64_t prior_version = 0;
8671 if (cmd_getval(cct, cmdmap, "prior_version", prior_version)) {
8672 if (prior_version == osdmap.get_crush_version() - 1) {
8673 // see if we are a resend of the last update. this is imperfect
8674 // (multiple racing updaters may not both get reliable success)
8675 // but we expect crush updaters (via this interface) to be rare-ish.
8676 bufferlist current, proposed;
8677 osdmap.crush->encode(current, mon->get_quorum_con_features());
8678 crush.encode(proposed, mon->get_quorum_con_features());
8679 if (current.contents_equal(proposed)) {
8680 dout(10) << __func__
8681 << " proposed matches current and version equals previous"
8682 << dendl;
8683 err = 0;
8684 ss << osdmap.get_crush_version();
8685 goto reply;
8686 }
8687 }
8688 if (prior_version != osdmap.get_crush_version()) {
8689 err = -EPERM;
8690 ss << "prior_version " << prior_version << " != crush version "
8691 << osdmap.get_crush_version();
8692 goto reply;
8693 }
8694 }
8695
8696 if (crush.has_legacy_rule_ids()) {
8697 err = -EINVAL;
8698 ss << "crush maps with ruleset != ruleid are no longer allowed";
8699 goto reply;
8700 }
8701 if (!validate_crush_against_features(&crush, ss)) {
8702 err = -EINVAL;
8703 goto reply;
8704 }
8705
8706 err = osdmap.validate_crush_rules(&crush, &ss);
8707 if (err < 0) {
8708 goto reply;
8709 }
8710
8711 if (g_conf()->mon_osd_crush_smoke_test) {
8712 // sanity check: test some inputs to make sure this map isn't
8713 // totally broken
8714 dout(10) << " testing map" << dendl;
8715 stringstream ess;
8716 CrushTester tester(crush, ess);
8717 tester.set_min_x(0);
8718 tester.set_max_x(50);
8719 auto start = ceph::coarse_mono_clock::now();
8720 int r = tester.test_with_fork(g_conf()->mon_lease);
8721 auto duration = ceph::coarse_mono_clock::now() - start;
8722 if (r < 0) {
8723 dout(10) << " tester.test_with_fork returns " << r
8724 << ": " << ess.str() << dendl;
8725 ss << "crush smoke test failed with " << r << ": " << ess.str();
8726 err = r;
8727 goto reply;
8728 }
8729 dout(10) << __func__ << " crush somke test duration: "
8730 << duration << ", result: " << ess.str() << dendl;
8731 }
8732
8733 pending_inc.crush = data;
8734 ss << osdmap.get_crush_version() + 1;
8735 goto update;
8736
8737 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
8738 CrushWrapper newcrush;
8739 _get_pending_crush(newcrush);
8740 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
8741 int bid = -1 - b;
8742 if (newcrush.bucket_exists(bid) &&
8743 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
8744 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
8745 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
8746 }
8747 }
8748 if (!validate_crush_against_features(&newcrush, ss)) {
8749 err = -EINVAL;
8750 goto reply;
8751 }
8752 pending_inc.crush.clear();
8753 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8754 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8755 get_last_committed() + 1));
8756 return true;
8757 } else if (prefix == "osd crush set-device-class") {
8758 string device_class;
8759 if (!cmd_getval(cct, cmdmap, "class", device_class)) {
8760 err = -EINVAL; // no value!
8761 goto reply;
8762 }
8763
8764 bool stop = false;
8765 vector<string> idvec;
8766 cmd_getval(cct, cmdmap, "ids", idvec);
8767 CrushWrapper newcrush;
8768 _get_pending_crush(newcrush);
8769 set<int> updated;
8770 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8771 set<int> osds;
8772 // wildcard?
8773 if (j == 0 &&
8774 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8775 osdmap.get_all_osds(osds);
8776 stop = true;
8777 } else {
8778 // try traditional single osd way
8779 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8780 if (osd < 0) {
8781 // ss has reason for failure
8782 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8783 err = -EINVAL;
8784 continue;
8785 }
8786 osds.insert(osd);
8787 }
8788
8789 for (auto &osd : osds) {
8790 if (!osdmap.exists(osd)) {
8791 ss << "osd." << osd << " does not exist. ";
8792 continue;
8793 }
8794
8795 ostringstream oss;
8796 oss << "osd." << osd;
8797 string name = oss.str();
8798
8799 if (newcrush.get_max_devices() < osd + 1) {
8800 newcrush.set_max_devices(osd + 1);
8801 }
8802 string action;
8803 if (newcrush.item_exists(osd)) {
8804 action = "updating";
8805 } else {
8806 action = "creating";
8807 newcrush.set_item_name(osd, name);
8808 }
8809
8810 dout(5) << action << " crush item id " << osd << " name '" << name
8811 << "' device_class '" << device_class << "'"
8812 << dendl;
8813 err = newcrush.update_device_class(osd, device_class, name, &ss);
8814 if (err < 0) {
8815 goto reply;
8816 }
8817 if (err == 0 && !_have_pending_crush()) {
8818 if (!stop) {
8819 // for single osd only, wildcard makes too much noise
8820 ss << "set-device-class item id " << osd << " name '" << name
8821 << "' device_class '" << device_class << "': no change. ";
8822 }
8823 } else {
8824 updated.insert(osd);
8825 }
8826 }
8827 }
8828
8829 if (!updated.empty()) {
8830 pending_inc.crush.clear();
8831 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8832 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
8833 getline(ss, rs);
8834 wait_for_finished_proposal(op,
8835 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8836 return true;
8837 }
8838
8839 } else if (prefix == "osd crush rm-device-class") {
8840 bool stop = false;
8841 vector<string> idvec;
8842 cmd_getval(cct, cmdmap, "ids", idvec);
8843 CrushWrapper newcrush;
8844 _get_pending_crush(newcrush);
8845 set<int> updated;
8846
8847 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8848 set<int> osds;
8849
8850 // wildcard?
8851 if (j == 0 &&
8852 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8853 osdmap.get_all_osds(osds);
8854 stop = true;
8855 } else {
8856 // try traditional single osd way
8857 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8858 if (osd < 0) {
8859 // ss has reason for failure
8860 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8861 err = -EINVAL;
8862 goto reply;
8863 }
8864 osds.insert(osd);
8865 }
8866
8867 for (auto &osd : osds) {
8868 if (!osdmap.exists(osd)) {
8869 ss << "osd." << osd << " does not exist. ";
8870 continue;
8871 }
8872
8873 auto class_name = newcrush.get_item_class(osd);
8874 if (!class_name) {
8875 ss << "osd." << osd << " belongs to no class, ";
8876 continue;
8877 }
8878 // note that we deliberately do not verify class_is_in_use here,
8879 // in case the device is misclassified and the user wants
8880 // to forcibly reset its class...
8881
8882 err = newcrush.remove_device_class(cct, osd, &ss);
8883 if (err < 0) {
8884 // ss has reason for failure
8885 goto reply;
8886 }
8887 updated.insert(osd);
8888 }
8889 }
8890
8891 if (!updated.empty()) {
8892 pending_inc.crush.clear();
8893 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8894 ss << "done removing class of osd(s): " << updated;
8895 getline(ss, rs);
8896 wait_for_finished_proposal(op,
8897 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8898 return true;
8899 }
8900 } else if (prefix == "osd crush class create") {
8901 string device_class;
8902 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8903 err = -EINVAL; // no value!
8904 goto reply;
8905 }
8906 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8907 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8908 << "luminous' before using crush device classes";
8909 err = -EPERM;
8910 goto reply;
8911 }
8912 if (!_have_pending_crush() &&
8913 _get_stable_crush().class_exists(device_class)) {
8914 ss << "class '" << device_class << "' already exists";
8915 goto reply;
8916 }
8917 CrushWrapper newcrush;
8918 _get_pending_crush(newcrush);
8919 if (newcrush.class_exists(device_class)) {
8920 ss << "class '" << device_class << "' already exists";
8921 goto update;
8922 }
8923 int class_id = newcrush.get_or_create_class_id(device_class);
8924 pending_inc.crush.clear();
8925 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8926 ss << "created class " << device_class << " with id " << class_id
8927 << " to crush map";
8928 goto update;
8929 } else if (prefix == "osd crush class rm") {
8930 string device_class;
8931 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8932 err = -EINVAL; // no value!
8933 goto reply;
8934 }
8935 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8936 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8937 << "luminous' before using crush device classes";
8938 err = -EPERM;
8939 goto reply;
8940 }
8941
8942 if (!osdmap.crush->class_exists(device_class)) {
8943 err = 0;
8944 goto reply;
8945 }
8946
8947 CrushWrapper newcrush;
8948 _get_pending_crush(newcrush);
8949 if (!newcrush.class_exists(device_class)) {
8950 err = 0; // make command idempotent
8951 goto wait;
8952 }
8953 int class_id = newcrush.get_class_id(device_class);
8954 stringstream ts;
8955 if (newcrush.class_is_in_use(class_id, &ts)) {
8956 err = -EBUSY;
8957 ss << "class '" << device_class << "' " << ts.str();
8958 goto reply;
8959 }
8960
8961 // check if class is used by any erasure-code-profiles
8962 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
8963 osdmap.get_erasure_code_profiles();
8964 auto ec_profiles = pending_inc.get_erasure_code_profiles();
8965 #ifdef HAVE_STDLIB_MAP_SPLICING
8966 ec_profiles.merge(old_ec_profiles);
8967 #else
8968 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
8969 make_move_iterator(end(old_ec_profiles)));
8970 #endif
8971 list<string> referenced_by;
8972 for (auto &i: ec_profiles) {
8973 for (auto &j: i.second) {
8974 if ("crush-device-class" == j.first && device_class == j.second) {
8975 referenced_by.push_back(i.first);
8976 }
8977 }
8978 }
8979 if (!referenced_by.empty()) {
8980 err = -EBUSY;
8981 ss << "class '" << device_class
8982 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
8983 goto reply;
8984 }
8985
8986 set<int> osds;
8987 newcrush.get_devices_by_class(device_class, &osds);
8988 for (auto& p: osds) {
8989 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
8990 if (err < 0) {
8991 // ss has reason for failure
8992 goto reply;
8993 }
8994 }
8995
8996 if (osds.empty()) {
8997 // empty class, remove directly
8998 err = newcrush.remove_class_name(device_class);
8999 if (err < 0) {
9000 ss << "class '" << device_class << "' cannot be removed '"
9001 << cpp_strerror(err) << "'";
9002 goto reply;
9003 }
9004 }
9005
9006 pending_inc.crush.clear();
9007 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9008 ss << "removed class " << device_class << " with id " << class_id
9009 << " from crush map";
9010 goto update;
9011 } else if (prefix == "osd crush class rename") {
9012 string srcname, dstname;
9013 if (!cmd_getval(cct, cmdmap, "srcname", srcname)) {
9014 err = -EINVAL;
9015 goto reply;
9016 }
9017 if (!cmd_getval(cct, cmdmap, "dstname", dstname)) {
9018 err = -EINVAL;
9019 goto reply;
9020 }
9021
9022 CrushWrapper newcrush;
9023 _get_pending_crush(newcrush);
9024 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9025 // suppose this is a replay and return success
9026 // so command is idempotent
9027 ss << "already renamed to '" << dstname << "'";
9028 err = 0;
9029 goto reply;
9030 }
9031
9032 err = newcrush.rename_class(srcname, dstname);
9033 if (err < 0) {
9034 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9035 << cpp_strerror(err);
9036 goto reply;
9037 }
9038
9039 pending_inc.crush.clear();
9040 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9041 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9042 goto update;
9043 } else if (prefix == "osd crush add-bucket") {
9044 // osd crush add-bucket <name> <type>
9045 string name, typestr;
9046 vector<string> argvec;
9047 cmd_getval(cct, cmdmap, "name", name);
9048 cmd_getval(cct, cmdmap, "type", typestr);
9049 cmd_getval(cct, cmdmap, "args", argvec);
9050 map<string,string> loc;
9051 if (!argvec.empty()) {
9052 CrushWrapper::parse_loc_map(argvec, &loc);
9053 dout(0) << "will create and move bucket '" << name
9054 << "' to location " << loc << dendl;
9055 }
9056
9057 if (!_have_pending_crush() &&
9058 _get_stable_crush().name_exists(name)) {
9059 ss << "bucket '" << name << "' already exists";
9060 goto reply;
9061 }
9062
9063 CrushWrapper newcrush;
9064 _get_pending_crush(newcrush);
9065
9066 if (newcrush.name_exists(name)) {
9067 ss << "bucket '" << name << "' already exists";
9068 goto update;
9069 }
9070 int type = newcrush.get_type_id(typestr);
9071 if (type < 0) {
9072 ss << "type '" << typestr << "' does not exist";
9073 err = -EINVAL;
9074 goto reply;
9075 }
9076 if (type == 0) {
9077 ss << "type '" << typestr << "' is for devices, not buckets";
9078 err = -EINVAL;
9079 goto reply;
9080 }
9081 int bucketno;
9082 err = newcrush.add_bucket(0, 0,
9083 CRUSH_HASH_DEFAULT, type, 0, NULL,
9084 NULL, &bucketno);
9085 if (err < 0) {
9086 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
9087 goto reply;
9088 }
9089 err = newcrush.set_item_name(bucketno, name);
9090 if (err < 0) {
9091 ss << "error setting bucket name to '" << name << "'";
9092 goto reply;
9093 }
9094
9095 if (!loc.empty()) {
9096 if (!newcrush.check_item_loc(cct, bucketno, loc,
9097 (int *)NULL)) {
9098 err = newcrush.move_bucket(cct, bucketno, loc);
9099 if (err < 0) {
9100 ss << "error moving bucket '" << name << "' to location " << loc;
9101 goto reply;
9102 }
9103 } else {
9104 ss << "no need to move item id " << bucketno << " name '" << name
9105 << "' to location " << loc << " in crush map";
9106 }
9107 }
9108
9109 pending_inc.crush.clear();
9110 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9111 if (loc.empty()) {
9112 ss << "added bucket " << name << " type " << typestr
9113 << " to crush map";
9114 } else {
9115 ss << "added bucket " << name << " type " << typestr
9116 << " to location " << loc;
9117 }
9118 goto update;
9119 } else if (prefix == "osd crush rename-bucket") {
9120 string srcname, dstname;
9121 cmd_getval(cct, cmdmap, "srcname", srcname);
9122 cmd_getval(cct, cmdmap, "dstname", dstname);
9123
9124 err = crush_rename_bucket(srcname, dstname, &ss);
9125 if (err == -EALREADY) // equivalent to success for idempotency
9126 err = 0;
9127 if (err)
9128 goto reply;
9129 else
9130 goto update;
9131 } else if (prefix == "osd crush weight-set create" ||
9132 prefix == "osd crush weight-set create-compat") {
9133 CrushWrapper newcrush;
9134 _get_pending_crush(newcrush);
9135 int64_t pool;
9136 int positions;
9137 if (newcrush.has_non_straw2_buckets()) {
9138 ss << "crush map contains one or more bucket(s) that are not straw2";
9139 err = -EPERM;
9140 goto reply;
9141 }
9142 if (prefix == "osd crush weight-set create") {
9143 if (osdmap.require_min_compat_client > 0 &&
9144 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9145 ss << "require_min_compat_client "
9146 << ceph_release_name(osdmap.require_min_compat_client)
9147 << " < luminous, which is required for per-pool weight-sets. "
9148 << "Try 'ceph osd set-require-min-compat-client luminous' "
9149 << "before using the new interface";
9150 err = -EPERM;
9151 goto reply;
9152 }
9153 string poolname, mode;
9154 cmd_getval(cct, cmdmap, "pool", poolname);
9155 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9156 if (pool < 0) {
9157 ss << "pool '" << poolname << "' not found";
9158 err = -ENOENT;
9159 goto reply;
9160 }
9161 cmd_getval(cct, cmdmap, "mode", mode);
9162 if (mode != "flat" && mode != "positional") {
9163 ss << "unrecognized weight-set mode '" << mode << "'";
9164 err = -EINVAL;
9165 goto reply;
9166 }
9167 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
9168 } else {
9169 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9170 positions = 1;
9171 }
9172 if (!newcrush.create_choose_args(pool, positions)) {
9173 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
9174 ss << "compat weight-set already created";
9175 } else {
9176 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
9177 << "' already created";
9178 }
9179 goto reply;
9180 }
9181 pending_inc.crush.clear();
9182 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9183 goto update;
9184
9185 } else if (prefix == "osd crush weight-set rm" ||
9186 prefix == "osd crush weight-set rm-compat") {
9187 CrushWrapper newcrush;
9188 _get_pending_crush(newcrush);
9189 int64_t pool;
9190 if (prefix == "osd crush weight-set rm") {
9191 string poolname;
9192 cmd_getval(cct, cmdmap, "pool", poolname);
9193 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9194 if (pool < 0) {
9195 ss << "pool '" << poolname << "' not found";
9196 err = -ENOENT;
9197 goto reply;
9198 }
9199 } else {
9200 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9201 }
9202 newcrush.rm_choose_args(pool);
9203 pending_inc.crush.clear();
9204 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9205 goto update;
9206
9207 } else if (prefix == "osd crush weight-set reweight" ||
9208 prefix == "osd crush weight-set reweight-compat") {
9209 string poolname, item;
9210 vector<double> weight;
9211 cmd_getval(cct, cmdmap, "pool", poolname);
9212 cmd_getval(cct, cmdmap, "item", item);
9213 cmd_getval(cct, cmdmap, "weight", weight);
9214 CrushWrapper newcrush;
9215 _get_pending_crush(newcrush);
9216 int64_t pool;
9217 if (prefix == "osd crush weight-set reweight") {
9218 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9219 if (pool < 0) {
9220 ss << "pool '" << poolname << "' not found";
9221 err = -ENOENT;
9222 goto reply;
9223 }
9224 if (!newcrush.have_choose_args(pool)) {
9225 ss << "no weight-set for pool '" << poolname << "'";
9226 err = -ENOENT;
9227 goto reply;
9228 }
9229 auto arg_map = newcrush.choose_args_get(pool);
9230 int positions = newcrush.get_choose_args_positions(arg_map);
9231 if (weight.size() != (size_t)positions) {
9232 ss << "must specify exact " << positions << " weight values";
9233 err = -EINVAL;
9234 goto reply;
9235 }
9236 } else {
9237 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9238 if (!newcrush.have_choose_args(pool)) {
9239 ss << "no backward-compatible weight-set";
9240 err = -ENOENT;
9241 goto reply;
9242 }
9243 }
9244 if (!newcrush.name_exists(item)) {
9245 ss << "item '" << item << "' does not exist";
9246 err = -ENOENT;
9247 goto reply;
9248 }
9249 err = newcrush.choose_args_adjust_item_weightf(
9250 cct,
9251 newcrush.choose_args_get(pool),
9252 newcrush.get_item_id(item),
9253 weight,
9254 &ss);
9255 if (err < 0) {
9256 goto reply;
9257 }
9258 err = 0;
9259 pending_inc.crush.clear();
9260 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9261 goto update;
9262 } else if (osdid_present &&
9263 (prefix == "osd crush set" || prefix == "osd crush add")) {
9264 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
9265 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
9266 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
9267
9268 if (!osdmap.exists(osdid)) {
9269 err = -ENOENT;
9270 ss << osd_name
9271 << " does not exist. Create it before updating the crush map";
9272 goto reply;
9273 }
9274
9275 double weight;
9276 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9277 ss << "unable to parse weight value '"
9278 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9279 err = -EINVAL;
9280 goto reply;
9281 }
9282
9283 string args;
9284 vector<string> argvec;
9285 cmd_getval(cct, cmdmap, "args", argvec);
9286 map<string,string> loc;
9287 CrushWrapper::parse_loc_map(argvec, &loc);
9288
9289 if (prefix == "osd crush set"
9290 && !_get_stable_crush().item_exists(osdid)) {
9291 err = -ENOENT;
9292 ss << "unable to set item id " << osdid << " name '" << osd_name
9293 << "' weight " << weight << " at location " << loc
9294 << ": does not exist";
9295 goto reply;
9296 }
9297
9298 dout(5) << "adding/updating crush item id " << osdid << " name '"
9299 << osd_name << "' weight " << weight << " at location "
9300 << loc << dendl;
9301 CrushWrapper newcrush;
9302 _get_pending_crush(newcrush);
9303
9304 string action;
9305 if (prefix == "osd crush set" ||
9306 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
9307 action = "set";
9308 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
9309 } else {
9310 action = "add";
9311 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
9312 if (err == 0)
9313 err = 1;
9314 }
9315
9316 if (err < 0)
9317 goto reply;
9318
9319 if (err == 0 && !_have_pending_crush()) {
9320 ss << action << " item id " << osdid << " name '" << osd_name
9321 << "' weight " << weight << " at location " << loc << ": no change";
9322 goto reply;
9323 }
9324
9325 pending_inc.crush.clear();
9326 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9327 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
9328 << weight << " at location " << loc << " to crush map";
9329 getline(ss, rs);
9330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9331 get_last_committed() + 1));
9332 return true;
9333
9334 } else if (prefix == "osd crush create-or-move") {
9335 do {
9336 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
9337 if (!osdmap.exists(osdid)) {
9338 err = -ENOENT;
9339 ss << osd_name
9340 << " does not exist. create it before updating the crush map";
9341 goto reply;
9342 }
9343
9344 double weight;
9345 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9346 ss << "unable to parse weight value '"
9347 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9348 err = -EINVAL;
9349 goto reply;
9350 }
9351
9352 string args;
9353 vector<string> argvec;
9354 cmd_getval(cct, cmdmap, "args", argvec);
9355 map<string,string> loc;
9356 CrushWrapper::parse_loc_map(argvec, &loc);
9357
9358 dout(0) << "create-or-move crush item name '" << osd_name
9359 << "' initial_weight " << weight << " at location " << loc
9360 << dendl;
9361
9362 CrushWrapper newcrush;
9363 _get_pending_crush(newcrush);
9364
9365 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
9366 g_conf()->osd_crush_update_weight_set);
9367 if (err == 0) {
9368 ss << "create-or-move updated item name '" << osd_name
9369 << "' weight " << weight
9370 << " at location " << loc << " to crush map";
9371 break;
9372 }
9373 if (err > 0) {
9374 pending_inc.crush.clear();
9375 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9376 ss << "create-or-move updating item name '" << osd_name
9377 << "' weight " << weight
9378 << " at location " << loc << " to crush map";
9379 getline(ss, rs);
9380 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9381 get_last_committed() + 1));
9382 return true;
9383 }
9384 } while (false);
9385
9386 } else if (prefix == "osd crush move") {
9387 do {
9388 // osd crush move <name> <loc1> [<loc2> ...]
9389 string name;
9390 vector<string> argvec;
9391 cmd_getval(cct, cmdmap, "name", name);
9392 cmd_getval(cct, cmdmap, "args", argvec);
9393 map<string,string> loc;
9394 CrushWrapper::parse_loc_map(argvec, &loc);
9395
9396 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
9397 CrushWrapper newcrush;
9398 _get_pending_crush(newcrush);
9399
9400 if (!newcrush.name_exists(name)) {
9401 err = -ENOENT;
9402 ss << "item " << name << " does not exist";
9403 break;
9404 }
9405 int id = newcrush.get_item_id(name);
9406
9407 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9408 if (id >= 0) {
9409 err = newcrush.create_or_move_item(
9410 cct, id, 0, name, loc,
9411 g_conf()->osd_crush_update_weight_set);
9412 } else {
9413 err = newcrush.move_bucket(cct, id, loc);
9414 }
9415 if (err >= 0) {
9416 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9417 pending_inc.crush.clear();
9418 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9419 getline(ss, rs);
9420 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9421 get_last_committed() + 1));
9422 return true;
9423 }
9424 } else {
9425 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9426 err = 0;
9427 }
9428 } while (false);
9429 } else if (prefix == "osd crush swap-bucket") {
9430 string source, dest;
9431 cmd_getval(cct, cmdmap, "source", source);
9432 cmd_getval(cct, cmdmap, "dest", dest);
9433
9434 bool force = false;
9435 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
9436
9437 CrushWrapper newcrush;
9438 _get_pending_crush(newcrush);
9439 if (!newcrush.name_exists(source)) {
9440 ss << "source item " << source << " does not exist";
9441 err = -ENOENT;
9442 goto reply;
9443 }
9444 if (!newcrush.name_exists(dest)) {
9445 ss << "dest item " << dest << " does not exist";
9446 err = -ENOENT;
9447 goto reply;
9448 }
9449 int sid = newcrush.get_item_id(source);
9450 int did = newcrush.get_item_id(dest);
9451 int sparent;
9452 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
9453 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
9454 err = -EPERM;
9455 goto reply;
9456 }
9457 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
9458 !force) {
9459 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
9460 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
9461 << "; pass --yes-i-really-mean-it to proceed anyway";
9462 err = -EPERM;
9463 goto reply;
9464 }
9465 int r = newcrush.swap_bucket(cct, sid, did);
9466 if (r < 0) {
9467 ss << "failed to swap bucket contents: " << cpp_strerror(r);
9468 err = r;
9469 goto reply;
9470 }
9471 ss << "swapped bucket of " << source << " to " << dest;
9472 pending_inc.crush.clear();
9473 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9474 wait_for_finished_proposal(op,
9475 new Monitor::C_Command(mon, op, err, ss.str(),
9476 get_last_committed() + 1));
9477 return true;
9478 } else if (prefix == "osd crush link") {
9479 // osd crush link <name> <loc1> [<loc2> ...]
9480 string name;
9481 cmd_getval(cct, cmdmap, "name", name);
9482 vector<string> argvec;
9483 cmd_getval(cct, cmdmap, "args", argvec);
9484 map<string,string> loc;
9485 CrushWrapper::parse_loc_map(argvec, &loc);
9486
9487 // Need an explicit name_exists check because get_item_id returns
9488 // 0 when the name is not found.
9489 int id = osdmap.crush->get_item_id(name);
9490 if (!osdmap.crush->name_exists(name)) {
9491 err = -ENOENT;
9492 ss << "item " << name << " does not exist";
9493 goto reply;
9494 } else {
9495 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
9496 }
9497 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
9498 ss << "no need to move item id " << id << " name '" << name
9499 << "' to location " << loc << " in crush map";
9500 err = 0;
9501 goto reply;
9502 }
9503
9504 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
9505 CrushWrapper newcrush;
9506 _get_pending_crush(newcrush);
9507
9508 if (!newcrush.name_exists(name)) {
9509 err = -ENOENT;
9510 ss << "item " << name << " does not exist";
9511 goto reply;
9512 } else {
9513 int id = newcrush.get_item_id(name);
9514 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9515 err = newcrush.link_bucket(cct, id, loc);
9516 if (err >= 0) {
9517 ss << "linked item id " << id << " name '" << name
9518 << "' to location " << loc << " in crush map";
9519 pending_inc.crush.clear();
9520 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9521 } else {
9522 ss << "cannot link item id " << id << " name '" << name
9523 << "' to location " << loc;
9524 goto reply;
9525 }
9526 } else {
9527 ss << "no need to move item id " << id << " name '" << name
9528 << "' to location " << loc << " in crush map";
9529 err = 0;
9530 }
9531 }
9532 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
9533 get_last_committed() + 1));
9534 return true;
9535 } else if (prefix == "osd crush rm" ||
9536 prefix == "osd crush remove" ||
9537 prefix == "osd crush unlink") {
9538 do {
9539 // osd crush rm <name> [ancestor]
9540 CrushWrapper newcrush;
9541 _get_pending_crush(newcrush);
9542
9543 string name;
9544 cmd_getval(cct, cmdmap, "name", name);
9545
9546 if (!osdmap.crush->name_exists(name)) {
9547 err = 0;
9548 ss << "device '" << name << "' does not appear in the crush map";
9549 break;
9550 }
9551 if (!newcrush.name_exists(name)) {
9552 err = 0;
9553 ss << "device '" << name << "' does not appear in the crush map";
9554 getline(ss, rs);
9555 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9556 get_last_committed() + 1));
9557 return true;
9558 }
9559 int id = newcrush.get_item_id(name);
9560 int ancestor = 0;
9561
9562 bool unlink_only = prefix == "osd crush unlink";
9563 string ancestor_str;
9564 if (cmd_getval(cct, cmdmap, "ancestor", ancestor_str)) {
9565 if (!newcrush.name_exists(ancestor_str)) {
9566 err = -ENOENT;
9567 ss << "ancestor item '" << ancestor_str
9568 << "' does not appear in the crush map";
9569 break;
9570 }
9571 ancestor = newcrush.get_item_id(ancestor_str);
9572 }
9573
9574 err = prepare_command_osd_crush_remove(
9575 newcrush,
9576 id, ancestor,
9577 (ancestor < 0), unlink_only);
9578
9579 if (err == -ENOENT) {
9580 ss << "item " << id << " does not appear in that position";
9581 err = 0;
9582 break;
9583 }
9584 if (err == 0) {
9585 if (!unlink_only)
9586 pending_inc.new_crush_node_flags[id] = 0;
9587 ss << "removed item id " << id << " name '" << name << "' from crush map";
9588 getline(ss, rs);
9589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9590 get_last_committed() + 1));
9591 return true;
9592 }
9593 } while (false);
9594
9595 } else if (prefix == "osd crush reweight-all") {
9596 CrushWrapper newcrush;
9597 _get_pending_crush(newcrush);
9598
9599 newcrush.reweight(cct);
9600 pending_inc.crush.clear();
9601 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9602 ss << "reweighted crush hierarchy";
9603 getline(ss, rs);
9604 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9605 get_last_committed() + 1));
9606 return true;
9607 } else if (prefix == "osd crush reweight") {
9608 // osd crush reweight <name> <weight>
9609 CrushWrapper newcrush;
9610 _get_pending_crush(newcrush);
9611
9612 string name;
9613 cmd_getval(cct, cmdmap, "name", name);
9614 if (!newcrush.name_exists(name)) {
9615 err = -ENOENT;
9616 ss << "device '" << name << "' does not appear in the crush map";
9617 goto reply;
9618 }
9619
9620 int id = newcrush.get_item_id(name);
9621 if (id < 0) {
9622 ss << "device '" << name << "' is not a leaf in the crush map";
9623 err = -EINVAL;
9624 goto reply;
9625 }
9626 double w;
9627 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9628 ss << "unable to parse weight value '"
9629 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9630 err = -EINVAL;
9631 goto reply;
9632 }
9633
9634 err = newcrush.adjust_item_weightf(cct, id, w,
9635 g_conf()->osd_crush_update_weight_set);
9636 if (err < 0)
9637 goto reply;
9638 pending_inc.crush.clear();
9639 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9640 ss << "reweighted item id " << id << " name '" << name << "' to " << w
9641 << " in crush map";
9642 getline(ss, rs);
9643 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9644 get_last_committed() + 1));
9645 return true;
9646 } else if (prefix == "osd crush reweight-subtree") {
9647 // osd crush reweight-subtree <name> <weight>
9648 CrushWrapper newcrush;
9649 _get_pending_crush(newcrush);
9650
9651 string name;
9652 cmd_getval(cct, cmdmap, "name", name);
9653 if (!newcrush.name_exists(name)) {
9654 err = -ENOENT;
9655 ss << "device '" << name << "' does not appear in the crush map";
9656 goto reply;
9657 }
9658
9659 int id = newcrush.get_item_id(name);
9660 if (id >= 0) {
9661 ss << "device '" << name << "' is not a subtree in the crush map";
9662 err = -EINVAL;
9663 goto reply;
9664 }
9665 double w;
9666 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9667 ss << "unable to parse weight value '"
9668 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9669 err = -EINVAL;
9670 goto reply;
9671 }
9672
9673 err = newcrush.adjust_subtree_weightf(cct, id, w,
9674 g_conf()->osd_crush_update_weight_set);
9675 if (err < 0)
9676 goto reply;
9677 pending_inc.crush.clear();
9678 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9679 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
9680 << " in crush map";
9681 getline(ss, rs);
9682 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9683 get_last_committed() + 1));
9684 return true;
9685 } else if (prefix == "osd crush tunables") {
9686 CrushWrapper newcrush;
9687 _get_pending_crush(newcrush);
9688
9689 err = 0;
9690 string profile;
9691 cmd_getval(cct, cmdmap, "profile", profile);
9692 if (profile == "legacy" || profile == "argonaut") {
9693 newcrush.set_tunables_legacy();
9694 } else if (profile == "bobtail") {
9695 newcrush.set_tunables_bobtail();
9696 } else if (profile == "firefly") {
9697 newcrush.set_tunables_firefly();
9698 } else if (profile == "hammer") {
9699 newcrush.set_tunables_hammer();
9700 } else if (profile == "jewel") {
9701 newcrush.set_tunables_jewel();
9702 } else if (profile == "optimal") {
9703 newcrush.set_tunables_optimal();
9704 } else if (profile == "default") {
9705 newcrush.set_tunables_default();
9706 } else {
9707 ss << "unrecognized profile '" << profile << "'";
9708 err = -EINVAL;
9709 goto reply;
9710 }
9711
9712 if (!validate_crush_against_features(&newcrush, ss)) {
9713 err = -EINVAL;
9714 goto reply;
9715 }
9716
9717 pending_inc.crush.clear();
9718 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9719 ss << "adjusted tunables profile to " << profile;
9720 getline(ss, rs);
9721 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9722 get_last_committed() + 1));
9723 return true;
9724 } else if (prefix == "osd crush set-tunable") {
9725 CrushWrapper newcrush;
9726 _get_pending_crush(newcrush);
9727
9728 err = 0;
9729 string tunable;
9730 cmd_getval(cct, cmdmap, "tunable", tunable);
9731
9732 int64_t value = -1;
9733 if (!cmd_getval(cct, cmdmap, "value", value)) {
9734 err = -EINVAL;
9735 ss << "failed to parse integer value "
9736 << cmd_vartype_stringify(cmdmap.at("value"));
9737 goto reply;
9738 }
9739
9740 if (tunable == "straw_calc_version") {
9741 if (value != 0 && value != 1) {
9742 ss << "value must be 0 or 1; got " << value;
9743 err = -EINVAL;
9744 goto reply;
9745 }
9746 newcrush.set_straw_calc_version(value);
9747 } else {
9748 ss << "unrecognized tunable '" << tunable << "'";
9749 err = -EINVAL;
9750 goto reply;
9751 }
9752
9753 if (!validate_crush_against_features(&newcrush, ss)) {
9754 err = -EINVAL;
9755 goto reply;
9756 }
9757
9758 pending_inc.crush.clear();
9759 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9760 ss << "adjusted tunable " << tunable << " to " << value;
9761 getline(ss, rs);
9762 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9763 get_last_committed() + 1));
9764 return true;
9765
9766 } else if (prefix == "osd crush rule create-simple") {
9767 string name, root, type, mode;
9768 cmd_getval(cct, cmdmap, "name", name);
9769 cmd_getval(cct, cmdmap, "root", root);
9770 cmd_getval(cct, cmdmap, "type", type);
9771 cmd_getval(cct, cmdmap, "mode", mode);
9772 if (mode == "")
9773 mode = "firstn";
9774
9775 if (osdmap.crush->rule_exists(name)) {
9776 // The name is uniquely associated to a ruleid and the rule it contains
9777 // From the user point of view, the rule is more meaningfull.
9778 ss << "rule " << name << " already exists";
9779 err = 0;
9780 goto reply;
9781 }
9782
9783 CrushWrapper newcrush;
9784 _get_pending_crush(newcrush);
9785
9786 if (newcrush.rule_exists(name)) {
9787 // The name is uniquely associated to a ruleid and the rule it contains.
9788 // From the user's point of view, the rule name is the more meaningful handle.
9789 ss << "rule " << name << " already exists";
9790 err = 0;
9791 } else {
9792 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
9793 pg_pool_t::TYPE_REPLICATED, &ss);
9794 if (ruleno < 0) {
9795 err = ruleno;
9796 goto reply;
9797 }
9798
9799 pending_inc.crush.clear();
9800 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9801 }
9802 getline(ss, rs);
9803 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9804 get_last_committed() + 1));
9805 return true;
9806
9807 } else if (prefix == "osd crush rule create-replicated") {
9808 string name, root, type, device_class;
9809 cmd_getval(cct, cmdmap, "name", name);
9810 cmd_getval(cct, cmdmap, "root", root);
9811 cmd_getval(cct, cmdmap, "type", type);
9812 cmd_getval(cct, cmdmap, "class", device_class);
9813
9814 if (osdmap.crush->rule_exists(name)) {
9815 // The name is uniquely associated to a ruleid and the rule it contains.
9816 // From the user's point of view, the rule name is the more meaningful handle.
9817 ss << "rule " << name << " already exists";
9818 err = 0;
9819 goto reply;
9820 }
9821
9822 CrushWrapper newcrush;
9823 _get_pending_crush(newcrush);
9824
9825 if (newcrush.rule_exists(name)) {
9826 // The name is uniquely associated to a ruleid and the rule it contains
9827 // From the user point of view, the rule is more meaningful.
9828 ss << "rule " << name << " already exists";
9829 err = 0;
9830 } else {
9831 int ruleno = newcrush.add_simple_rule(
9832 name, root, type, device_class,
9833 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
9834 if (ruleno < 0) {
9835 err = ruleno;
9836 goto reply;
9837 }
9838
9839 pending_inc.crush.clear();
9840 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9841 }
9842 getline(ss, rs);
9843 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9844 get_last_committed() + 1));
9845 return true;
9846
9847 } else if (prefix == "osd erasure-code-profile rm") {
9848 string name;
9849 cmd_getval(cct, cmdmap, "name", name);
9850
9851 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
9852 goto wait;
9853
9854 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
9855 err = -EBUSY;
9856 goto reply;
9857 }
9858
9859 if (osdmap.has_erasure_code_profile(name) ||
9860 pending_inc.new_erasure_code_profiles.count(name)) {
9861 if (osdmap.has_erasure_code_profile(name)) {
9862 pending_inc.old_erasure_code_profiles.push_back(name);
9863 } else {
9864 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
9865 pending_inc.new_erasure_code_profiles.erase(name);
9866 }
9867
9868 getline(ss, rs);
9869 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9870 get_last_committed() + 1));
9871 return true;
9872 } else {
9873 ss << "erasure-code-profile " << name << " does not exist";
9874 err = 0;
9875 goto reply;
9876 }
9877
9878 } else if (prefix == "osd erasure-code-profile set") {
9879 string name;
9880 cmd_getval(cct, cmdmap, "name", name);
9881 vector<string> profile;
9882 cmd_getval(cct, cmdmap, "profile", profile);
9883
9884 bool force = false;
9885 cmd_getval(cct, cmdmap, "force", force);
9886
9887 map<string,string> profile_map;
9888 err = parse_erasure_code_profile(profile, &profile_map, &ss);
9889 if (err)
9890 goto reply;
9891 if (profile_map.find("plugin") == profile_map.end()) {
9892 ss << "erasure-code-profile " << profile_map
9893 << " must contain a plugin entry" << std::endl;
9894 err = -EINVAL;
9895 goto reply;
9896 }
9897 string plugin = profile_map["plugin"];
9898
9899 if (pending_inc.has_erasure_code_profile(name)) {
9900 dout(20) << "erasure code profile " << name << " try again" << dendl;
9901 goto wait;
9902 } else {
9903 err = normalize_profile(name, profile_map, force, &ss);
9904 if (err)
9905 goto reply;
9906
9907 if (osdmap.has_erasure_code_profile(name)) {
9908 ErasureCodeProfile existing_profile_map =
9909 osdmap.get_erasure_code_profile(name);
9910 err = normalize_profile(name, existing_profile_map, force, &ss);
9911 if (err)
9912 goto reply;
9913
9914 if (existing_profile_map == profile_map) {
9915 err = 0;
9916 goto reply;
9917 }
9918 if (!force) {
9919 err = -EPERM;
9920 ss << "will not override erasure code profile " << name
9921 << " because the existing profile "
9922 << existing_profile_map
9923 << " is different from the proposed profile "
9924 << profile_map;
9925 goto reply;
9926 }
9927 }
9928
9929 dout(20) << "erasure code profile set " << name << "="
9930 << profile_map << dendl;
9931 pending_inc.set_erasure_code_profile(name, profile_map);
9932 }
9933
9934 getline(ss, rs);
9935 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9936 get_last_committed() + 1));
9937 return true;
9938
9939 } else if (prefix == "osd crush rule create-erasure") {
9940 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
9941 if (err == -EAGAIN)
9942 goto wait;
9943 if (err)
9944 goto reply;
9945 string name, poolstr;
9946 cmd_getval(cct, cmdmap, "name", name);
9947 string profile;
9948 cmd_getval(cct, cmdmap, "profile", profile);
9949 if (profile == "")
9950 profile = "default";
9951 if (profile == "default") {
9952 if (!osdmap.has_erasure_code_profile(profile)) {
9953 if (pending_inc.has_erasure_code_profile(profile)) {
9954 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
9955 goto wait;
9956 }
9957
9958 map<string,string> profile_map;
9959 err = osdmap.get_erasure_code_profile_default(cct,
9960 profile_map,
9961 &ss);
9962 if (err)
9963 goto reply;
9964 err = normalize_profile(name, profile_map, true, &ss);
9965 if (err)
9966 goto reply;
9967 dout(20) << "erasure code profile set " << profile << "="
9968 << profile_map << dendl;
9969 pending_inc.set_erasure_code_profile(profile, profile_map);
9970 goto wait;
9971 }
9972 }
9973
9974 int rule;
9975 err = crush_rule_create_erasure(name, profile, &rule, &ss);
9976 if (err < 0) {
9977 switch(err) {
9978 case -EEXIST: // return immediately
9979 ss << "rule " << name << " already exists";
9980 err = 0;
9981 goto reply;
9982 break;
9983 case -EALREADY: // wait for pending to be proposed
9984 ss << "rule " << name << " already exists";
9985 err = 0;
9986 break;
9987 default: // non recoverable error
9988 goto reply;
9989 break;
9990 }
9991 } else {
9992 ss << "created rule " << name << " at " << rule;
9993 }
9994
9995 getline(ss, rs);
9996 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9997 get_last_committed() + 1));
9998 return true;
9999
10000 } else if (prefix == "osd crush rule rm") {
10001 string name;
10002 cmd_getval(cct, cmdmap, "name", name);
10003
10004 if (!osdmap.crush->rule_exists(name)) {
10005 ss << "rule " << name << " does not exist";
10006 err = 0;
10007 goto reply;
10008 }
10009
10010 CrushWrapper newcrush;
10011 _get_pending_crush(newcrush);
10012
10013 if (!newcrush.rule_exists(name)) {
10014 ss << "rule " << name << " does not exist";
10015 err = 0;
10016 } else {
10017 int ruleno = newcrush.get_rule_id(name);
10018 ceph_assert(ruleno >= 0);
10019
10020 // make sure it is not in use.
10021 // FIXME: this is ok in some situations, but let's not bother with that
10022 // complexity now.
10023 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
10024 if (osdmap.crush_rule_in_use(ruleset)) {
10025 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10026 err = -EBUSY;
10027 goto reply;
10028 }
10029
10030 err = newcrush.remove_rule(ruleno);
10031 if (err < 0) {
10032 goto reply;
10033 }
10034
10035 pending_inc.crush.clear();
10036 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10037 }
10038 getline(ss, rs);
10039 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10040 get_last_committed() + 1));
10041 return true;
10042
10043 } else if (prefix == "osd crush rule rename") {
10044 string srcname;
10045 string dstname;
10046 cmd_getval(cct, cmdmap, "srcname", srcname);
10047 cmd_getval(cct, cmdmap, "dstname", dstname);
10048 if (srcname.empty() || dstname.empty()) {
10049 ss << "must specify both source rule name and destination rule name";
10050 err = -EINVAL;
10051 goto reply;
10052 }
10053 if (srcname == dstname) {
10054 ss << "destination rule name is equal to source rule name";
10055 err = 0;
10056 goto reply;
10057 }
10058
10059 CrushWrapper newcrush;
10060 _get_pending_crush(newcrush);
10061 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
10062 // srcname does not exist and dstname already exists
10063 // suppose this is a replay and return success
10064 // (so this command is idempotent)
10065 ss << "already renamed to '" << dstname << "'";
10066 err = 0;
10067 goto reply;
10068 }
10069
10070 err = newcrush.rename_rule(srcname, dstname, &ss);
10071 if (err < 0) {
10072 // ss has reason for failure
10073 goto reply;
10074 }
10075 pending_inc.crush.clear();
10076 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10077 getline(ss, rs);
10078 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10079 get_last_committed() + 1));
10080 return true;
10081
10082 } else if (prefix == "osd setmaxosd") {
10083 int64_t newmax;
10084 if (!cmd_getval(cct, cmdmap, "newmax", newmax)) {
10085 ss << "unable to parse 'newmax' value '"
10086 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
10087 err = -EINVAL;
10088 goto reply;
10089 }
10090
10091 if (newmax > g_conf()->mon_max_osd) {
10092 err = -ERANGE;
10093 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
10094 << g_conf()->mon_max_osd << ")";
10095 goto reply;
10096 }
10097
10098 // Don't allow shrinking OSD number as this will cause data loss
10099 // and may cause kernel crashes.
10100 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10101 if (newmax < osdmap.get_max_osd()) {
10102 // Check if the OSDs exist between current max and new value.
10103 // If any OSDs exist in that range, don't allow shrinking the number
10104 // of OSDs.
10105 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
10106 if (osdmap.exists(i)) {
10107 err = -EBUSY;
10108 ss << "cannot shrink max_osd to " << newmax
10109 << " because osd." << i << " (and possibly others) still in use";
10110 goto reply;
10111 }
10112 }
10113 }
10114
10115 pending_inc.new_max_osd = newmax;
10116 ss << "set new max_osd = " << pending_inc.new_max_osd;
10117 getline(ss, rs);
10118 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10119 get_last_committed() + 1));
10120 return true;
10121
10122 } else if (prefix == "osd set-full-ratio" ||
10123 prefix == "osd set-backfillfull-ratio" ||
10124 prefix == "osd set-nearfull-ratio") {
10125 double n;
10126 if (!cmd_getval(cct, cmdmap, "ratio", n)) {
10127 ss << "unable to parse 'ratio' value '"
10128 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
10129 err = -EINVAL;
10130 goto reply;
10131 }
10132 if (prefix == "osd set-full-ratio")
10133 pending_inc.new_full_ratio = n;
10134 else if (prefix == "osd set-backfillfull-ratio")
10135 pending_inc.new_backfillfull_ratio = n;
10136 else if (prefix == "osd set-nearfull-ratio")
10137 pending_inc.new_nearfull_ratio = n;
10138 ss << prefix << " " << n;
10139 getline(ss, rs);
10140 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10141 get_last_committed() + 1));
10142 return true;
10143 } else if (prefix == "osd set-require-min-compat-client") {
10144 string v;
10145 cmd_getval(cct, cmdmap, "version", v);
10146 int vno = ceph_release_from_name(v.c_str());
10147 if (vno <= 0) {
10148 ss << "version " << v << " is not recognized";
10149 err = -EINVAL;
10150 goto reply;
10151 }
10152 OSDMap newmap;
10153 newmap.deepish_copy_from(osdmap);
10154 newmap.apply_incremental(pending_inc);
10155 newmap.require_min_compat_client = vno;
10156 auto mvno = newmap.get_min_compat_client();
10157 if (vno < mvno) {
10158 ss << "osdmap current utilizes features that require "
10159 << ceph_release_name(mvno)
10160 << "; cannot set require_min_compat_client below that to "
10161 << ceph_release_name(vno);
10162 err = -EPERM;
10163 goto reply;
10164 }
10165 bool sure = false;
10166 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10167 if (!sure) {
10168 FeatureMap m;
10169 mon->get_combined_feature_map(&m);
10170 uint64_t features = ceph_release_features(vno);
10171 bool first = true;
10172 bool ok = true;
10173 for (int type : {
10174 CEPH_ENTITY_TYPE_CLIENT,
10175 CEPH_ENTITY_TYPE_MDS,
10176 CEPH_ENTITY_TYPE_MGR }) {
10177 auto p = m.m.find(type);
10178 if (p == m.m.end()) {
10179 continue;
10180 }
10181 for (auto& q : p->second) {
10182 uint64_t missing = ~q.first & features;
10183 if (missing) {
10184 if (first) {
10185 ss << "cannot set require_min_compat_client to " << v << ": ";
10186 } else {
10187 ss << "; ";
10188 }
10189 first = false;
10190 ss << q.second << " connected " << ceph_entity_type_name(type)
10191 << "(s) look like " << ceph_release_name(
10192 ceph_release_from_features(q.first))
10193 << " (missing 0x" << std::hex << missing << std::dec << ")";
10194 ok = false;
10195 }
10196 }
10197 }
10198 if (!ok) {
10199 ss << "; add --yes-i-really-mean-it to do it anyway";
10200 err = -EPERM;
10201 goto reply;
10202 }
10203 }
10204 ss << "set require_min_compat_client to " << ceph_release_name(vno);
10205 pending_inc.new_require_min_compat_client = vno;
10206 getline(ss, rs);
10207 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10208 get_last_committed() + 1));
10209 return true;
10210 } else if (prefix == "osd pause") {
10211 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10212
10213 } else if (prefix == "osd unpause") {
10214 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10215
10216 } else if (prefix == "osd set") {
10217 bool sure = false;
10218 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
10219
10220 string key;
10221 cmd_getval(cct, cmdmap, "key", key);
10222 if (key == "full")
10223 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
10224 else if (key == "pause")
10225 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10226 else if (key == "noup")
10227 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
10228 else if (key == "nodown")
10229 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
10230 else if (key == "noout")
10231 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
10232 else if (key == "noin")
10233 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
10234 else if (key == "nobackfill")
10235 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
10236 else if (key == "norebalance")
10237 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
10238 else if (key == "norecover")
10239 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
10240 else if (key == "noscrub")
10241 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
10242 else if (key == "nodeep-scrub")
10243 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10244 else if (key == "notieragent")
10245 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10246 else if (key == "nosnaptrim")
10247 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10248 else if (key == "pglog_hardlimit") {
10249 if (!osdmap.get_num_up_osds() && !sure) {
10250 ss << "Not advisable to continue since no OSDs are up. Pass "
10251 << "--yes-i-really-mean-it if you really wish to continue.";
10252 err = -EPERM;
10253 goto reply;
10254 }
10255 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
10256 // we are reusing a jewel feature bit that was retired in luminous.
10257 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
10258 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
10259 || sure)) {
10260 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
10261 } else {
10262 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
10263 err = -EPERM;
10264 goto reply;
10265 }
10266 } else {
10267 ss << "unrecognized flag '" << key << "'";
10268 err = -EINVAL;
10269 }
10270
10271 } else if (prefix == "osd unset") {
10272 string key;
10273 cmd_getval(cct, cmdmap, "key", key);
10274 if (key == "full")
10275 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
10276 else if (key == "pause")
10277 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10278 else if (key == "noup")
10279 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
10280 else if (key == "nodown")
10281 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
10282 else if (key == "noout")
10283 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
10284 else if (key == "noin")
10285 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
10286 else if (key == "nobackfill")
10287 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
10288 else if (key == "norebalance")
10289 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
10290 else if (key == "norecover")
10291 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
10292 else if (key == "noscrub")
10293 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
10294 else if (key == "nodeep-scrub")
10295 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10296 else if (key == "notieragent")
10297 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10298 else if (key == "nosnaptrim")
10299 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10300 else {
10301 ss << "unrecognized flag '" << key << "'";
10302 err = -EINVAL;
10303 }
10304
10305 } else if (prefix == "osd require-osd-release") {
10306 string release;
10307 cmd_getval(cct, cmdmap, "release", release);
10308 bool sure = false;
10309 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10310 int rel = ceph_release_from_name(release.c_str());
10311 if (rel <= 0) {
10312 ss << "unrecognized release " << release;
10313 err = -EINVAL;
10314 goto reply;
10315 }
10316 if (rel == osdmap.require_osd_release) {
10317 // idempotent
10318 err = 0;
10319 goto reply;
10320 }
10321 ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
10322 if (!osdmap.get_num_up_osds() && !sure) {
10323 ss << "Not advisable to continue since no OSDs are up. Pass "
10324 << "--yes-i-really-mean-it if you really wish to continue.";
10325 err = -EPERM;
10326 goto reply;
10327 }
10328 if (rel == CEPH_RELEASE_MIMIC) {
10329 if (!mon->monmap->get_required_features().contains_all(
10330 ceph::features::mon::FEATURE_MIMIC)) {
10331 ss << "not all mons are mimic";
10332 err = -EPERM;
10333 goto reply;
10334 }
10335 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
10336 && !sure) {
10337 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
10338 err = -EPERM;
10339 goto reply;
10340 }
10341 } else if (rel == CEPH_RELEASE_NAUTILUS) {
10342 if (!mon->monmap->get_required_features().contains_all(
10343 ceph::features::mon::FEATURE_NAUTILUS)) {
10344 ss << "not all mons are nautilus";
10345 err = -EPERM;
10346 goto reply;
10347 }
10348 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
10349 && !sure) {
10350 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
10351 err = -EPERM;
10352 goto reply;
10353 }
10354 } else {
10355 ss << "not supported for this release yet";
10356 err = -EPERM;
10357 goto reply;
10358 }
10359 if (rel < osdmap.require_osd_release) {
10360 ss << "require_osd_release cannot be lowered once it has been set";
10361 err = -EPERM;
10362 goto reply;
10363 }
10364 pending_inc.new_require_osd_release = rel;
10365 goto update;
10366 } else if (prefix == "osd down" ||
10367 prefix == "osd out" ||
10368 prefix == "osd in" ||
10369 prefix == "osd rm") {
10370
10371 bool any = false;
10372 bool stop = false;
10373 bool verbose = true;
10374
10375 vector<string> idvec;
10376 cmd_getval(cct, cmdmap, "ids", idvec);
10377 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10378 set<int> osds;
10379
10380 // wildcard?
10381 if (j == 0 &&
10382 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10383 if (prefix == "osd in") {
10384 // touch out osds only
10385 osdmap.get_out_existing_osds(osds);
10386 } else {
10387 osdmap.get_all_osds(osds);
10388 }
10389 stop = true;
10390 verbose = false; // so the output is less noisy.
10391 } else {
10392 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10393 if (osd < 0) {
10394 ss << "invalid osd id" << osd;
10395 err = -EINVAL;
10396 continue;
10397 } else if (!osdmap.exists(osd)) {
10398 ss << "osd." << osd << " does not exist. ";
10399 continue;
10400 }
10401
10402 osds.insert(osd);
10403 }
10404
10405 for (auto &osd : osds) {
10406 if (prefix == "osd down") {
10407 if (osdmap.is_down(osd)) {
10408 if (verbose)
10409 ss << "osd." << osd << " is already down. ";
10410 } else {
10411 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
10412 ss << "marked down osd." << osd << ". ";
10413 any = true;
10414 }
10415 } else if (prefix == "osd out") {
10416 if (osdmap.is_out(osd)) {
10417 if (verbose)
10418 ss << "osd." << osd << " is already out. ";
10419 } else {
10420 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
10421 if (osdmap.osd_weight[osd]) {
10422 if (pending_inc.new_xinfo.count(osd) == 0) {
10423 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10424 }
10425 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
10426 }
10427 ss << "marked out osd." << osd << ". ";
10428 std::ostringstream msg;
10429 msg << "Client " << op->get_session()->entity_name
10430 << " marked osd." << osd << " out";
10431 if (osdmap.is_up(osd)) {
10432 msg << ", while it was still marked up";
10433 } else {
10434 auto period = ceph_clock_now() - down_pending_out[osd];
10435 msg << ", after it was down for " << int(period.sec())
10436 << " seconds";
10437 }
10438
10439 mon->clog->info() << msg.str();
10440 any = true;
10441 }
10442 } else if (prefix == "osd in") {
10443 if (osdmap.is_in(osd)) {
10444 if (verbose)
10445 ss << "osd." << osd << " is already in. ";
10446 } else {
10447 if (osdmap.osd_xinfo[osd].old_weight > 0) {
10448 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
10449 if (pending_inc.new_xinfo.count(osd) == 0) {
10450 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10451 }
10452 pending_inc.new_xinfo[osd].old_weight = 0;
10453 } else {
10454 pending_inc.new_weight[osd] = CEPH_OSD_IN;
10455 }
10456 ss << "marked in osd." << osd << ". ";
10457 any = true;
10458 }
10459 } else if (prefix == "osd rm") {
10460 err = prepare_command_osd_remove(osd);
10461
10462 if (err == -EBUSY) {
10463 if (any)
10464 ss << ", ";
10465 ss << "osd." << osd << " is still up; must be down before removal. ";
10466 } else {
10467 ceph_assert(err == 0);
10468 if (any) {
10469 ss << ", osd." << osd;
10470 } else {
10471 ss << "removed osd." << osd;
10472 }
10473 any = true;
10474 }
10475 }
10476 }
10477 }
10478 if (any) {
10479 getline(ss, rs);
10480 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10481 get_last_committed() + 1));
10482 return true;
10483 }
10484 } else if (prefix == "osd set-group" ||
10485 prefix == "osd unset-group" ||
10486 prefix == "osd add-noup" ||
10487 prefix == "osd add-nodown" ||
10488 prefix == "osd add-noin" ||
10489 prefix == "osd add-noout" ||
10490 prefix == "osd rm-noup" ||
10491 prefix == "osd rm-nodown" ||
10492 prefix == "osd rm-noin" ||
10493 prefix == "osd rm-noout") {
10494 bool do_set = prefix == "osd set-group" ||
10495 prefix.find("add") != string::npos;
10496 string flag_str;
10497 unsigned flags = 0;
10498 vector<string> who;
10499 if (prefix == "osd set-group" || prefix == "osd unset-group") {
10500 cmd_getval(cct, cmdmap, "flags", flag_str);
10501 cmd_getval(cct, cmdmap, "who", who);
10502 vector<string> raw_flags;
10503 boost::split(raw_flags, flag_str, boost::is_any_of(","));
10504 for (auto& f : raw_flags) {
10505 if (f == "noup")
10506 flags |= CEPH_OSD_NOUP;
10507 else if (f == "nodown")
10508 flags |= CEPH_OSD_NODOWN;
10509 else if (f == "noin")
10510 flags |= CEPH_OSD_NOIN;
10511 else if (f == "noout")
10512 flags |= CEPH_OSD_NOOUT;
10513 else {
10514 ss << "unrecognized flag '" << f << "', must be one of "
10515 << "{noup,nodown,noin,noout}";
10516 err = -EINVAL;
10517 goto reply;
10518 }
10519 }
10520 } else {
10521 cmd_getval(cct, cmdmap, "ids", who);
10522 if (prefix.find("noup") != string::npos)
10523 flags = CEPH_OSD_NOUP;
10524 else if (prefix.find("nodown") != string::npos)
10525 flags = CEPH_OSD_NODOWN;
10526 else if (prefix.find("noin") != string::npos)
10527 flags = CEPH_OSD_NOIN;
10528 else if (prefix.find("noout") != string::npos)
10529 flags = CEPH_OSD_NOOUT;
10530 else
10531 ceph_assert(0 == "Unreachable!");
10532 }
10533 if (flags == 0) {
10534 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
10535 err = -EINVAL;
10536 goto reply;
10537 }
10538 if (who.empty()) {
10539 ss << "must specify at least one or more targets to set/unset";
10540 err = -EINVAL;
10541 goto reply;
10542 }
10543 set<int> osds;
10544 set<int> crush_nodes;
10545 set<int> device_classes;
10546 for (auto& w : who) {
10547 if (w == "any" || w == "all" || w == "*") {
10548 osdmap.get_all_osds(osds);
10549 break;
10550 }
10551 std::stringstream ts;
10552 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
10553 osds.insert(osd);
10554 } else if (osdmap.crush->name_exists(w)) {
10555 crush_nodes.insert(osdmap.crush->get_item_id(w));
10556 } else if (osdmap.crush->class_exists(w)) {
10557 device_classes.insert(osdmap.crush->get_class_id(w));
10558 } else {
10559 ss << "unable to parse osd id or crush node or device class: "
10560 << "\"" << w << "\". ";
10561 }
10562 }
10563 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
10564 // ss has reason for failure
10565 err = -EINVAL;
10566 goto reply;
10567 }
10568 bool any = false;
10569 for (auto osd : osds) {
10570 if (!osdmap.exists(osd)) {
10571 ss << "osd." << osd << " does not exist. ";
10572 continue;
10573 }
10574 if (do_set) {
10575 if (flags & CEPH_OSD_NOUP) {
10576 any |= osdmap.is_noup_by_osd(osd) ?
10577 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
10578 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
10579 }
10580 if (flags & CEPH_OSD_NODOWN) {
10581 any |= osdmap.is_nodown_by_osd(osd) ?
10582 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
10583 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
10584 }
10585 if (flags & CEPH_OSD_NOIN) {
10586 any |= osdmap.is_noin_by_osd(osd) ?
10587 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
10588 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
10589 }
10590 if (flags & CEPH_OSD_NOOUT) {
10591 any |= osdmap.is_noout_by_osd(osd) ?
10592 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
10593 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
10594 }
10595 } else {
10596 if (flags & CEPH_OSD_NOUP) {
10597 any |= osdmap.is_noup_by_osd(osd) ?
10598 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
10599 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
10600 }
10601 if (flags & CEPH_OSD_NODOWN) {
10602 any |= osdmap.is_nodown_by_osd(osd) ?
10603 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
10604 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
10605 }
10606 if (flags & CEPH_OSD_NOIN) {
10607 any |= osdmap.is_noin_by_osd(osd) ?
10608 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
10609 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
10610 }
10611 if (flags & CEPH_OSD_NOOUT) {
10612 any |= osdmap.is_noout_by_osd(osd) ?
10613 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
10614 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
10615 }
10616 }
10617 }
10618 for (auto& id : crush_nodes) {
10619 auto old_flags = osdmap.get_crush_node_flags(id);
10620 auto& pending_flags = pending_inc.new_crush_node_flags[id];
10621 pending_flags |= old_flags; // adopt existing flags first!
10622 if (do_set) {
10623 pending_flags |= flags;
10624 } else {
10625 pending_flags &= ~flags;
10626 }
10627 any = true;
10628 }
10629 for (auto& id : device_classes) {
10630 auto old_flags = osdmap.get_device_class_flags(id);
10631 auto& pending_flags = pending_inc.new_device_class_flags[id];
10632 pending_flags |= old_flags;
10633 if (do_set) {
10634 pending_flags |= flags;
10635 } else {
10636 pending_flags &= ~flags;
10637 }
10638 any = true;
10639 }
10640 if (any) {
10641 getline(ss, rs);
10642 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10643 get_last_committed() + 1));
10644 return true;
10645 }
10646 } else if (prefix == "osd pg-temp") {
10647 string pgidstr;
10648 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10649 ss << "unable to parse 'pgid' value '"
10650 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10651 err = -EINVAL;
10652 goto reply;
10653 }
10654 pg_t pgid;
10655 if (!pgid.parse(pgidstr.c_str())) {
10656 ss << "invalid pgid '" << pgidstr << "'";
10657 err = -EINVAL;
10658 goto reply;
10659 }
10660 if (!osdmap.pg_exists(pgid)) {
10661 ss << "pg " << pgid << " does not exist";
10662 err = -ENOENT;
10663 goto reply;
10664 }
10665 if (pending_inc.new_pg_temp.count(pgid)) {
10666 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
10667 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10668 return true;
10669 }
10670
10671 vector<int64_t> id_vec;
10672 vector<int32_t> new_pg_temp;
10673 cmd_getval(cct, cmdmap, "id", id_vec);
10674 if (id_vec.empty()) {
10675 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
10676 ss << "done cleaning up pg_temp of " << pgid;
10677 goto update;
10678 }
10679 for (auto osd : id_vec) {
10680 if (!osdmap.exists(osd)) {
10681 ss << "osd." << osd << " does not exist";
10682 err = -ENOENT;
10683 goto reply;
10684 }
10685 new_pg_temp.push_back(osd);
10686 }
10687
10688 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10689 if ((int)new_pg_temp.size() < pool_min_size) {
10690 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
10691 << pool_min_size << ")";
10692 err = -EINVAL;
10693 goto reply;
10694 }
10695
10696 int pool_size = osdmap.get_pg_pool_size(pgid);
10697 if ((int)new_pg_temp.size() > pool_size) {
10698 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
10699 << pool_size << ")";
10700 err = -EINVAL;
10701 goto reply;
10702 }
10703
10704 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
10705 new_pg_temp.begin(), new_pg_temp.end());
10706 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
10707 goto update;
10708 } else if (prefix == "osd primary-temp") {
10709 string pgidstr;
10710 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10711 ss << "unable to parse 'pgid' value '"
10712 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10713 err = -EINVAL;
10714 goto reply;
10715 }
10716 pg_t pgid;
10717 if (!pgid.parse(pgidstr.c_str())) {
10718 ss << "invalid pgid '" << pgidstr << "'";
10719 err = -EINVAL;
10720 goto reply;
10721 }
10722 if (!osdmap.pg_exists(pgid)) {
10723 ss << "pg " << pgid << " does not exist";
10724 err = -ENOENT;
10725 goto reply;
10726 }
10727
10728 int64_t osd;
10729 if (!cmd_getval(cct, cmdmap, "id", osd)) {
10730 ss << "unable to parse 'id' value '"
10731 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
10732 err = -EINVAL;
10733 goto reply;
10734 }
10735 if (osd != -1 && !osdmap.exists(osd)) {
10736 ss << "osd." << osd << " does not exist";
10737 err = -ENOENT;
10738 goto reply;
10739 }
10740
10741 if (osdmap.require_min_compat_client > 0 &&
10742 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10743 ss << "require_min_compat_client "
10744 << ceph_release_name(osdmap.require_min_compat_client)
10745 << " < firefly, which is required for primary-temp";
10746 err = -EPERM;
10747 goto reply;
10748 }
10749
10750 pending_inc.new_primary_temp[pgid] = osd;
10751 ss << "set " << pgid << " primary_temp mapping to " << osd;
10752 goto update;
} else if (prefix == "pg repeer") {
  // Force a PG to re-peer by installing a transient pg_temp mapping that
  // differs from its current acting set; the OSDs will peer and then
  // request the pg_temp be removed again.
  pg_t pgid;
  string pgidstr;
  cmd_getval(cct, cmdmap, "pgid", pgidstr);
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    err = -EINVAL;
    goto reply;
  }
  if (!osdmap.pg_exists(pgid)) {
    ss << "pg '" << pgidstr << "' does not exist";
    err = -ENOENT;
    goto reply;
  }
  vector<int> acting;
  int primary;
  osdmap.pg_to_acting_osds(pgid, &acting, &primary);
  if (primary < 0) {
    err = -EAGAIN;
    ss << "pg currently has no primary";
    goto reply;
  }
  if (acting.size() > 1) {
    // map to just primary; it will map back to what it wants
    pending_inc.new_pg_temp[pgid] = { primary };
  } else {
    // hmm, pick another arbitrary osd to induce a change.  Note
    // that this won't work if there is only one suitable OSD in the cluster.
    bool induced = false;
    for (int candidate = 0; candidate < osdmap.get_max_osd(); ++candidate) {
      if (candidate == primary ||
          !osdmap.is_up(candidate) ||
          !osdmap.exists(candidate)) {
        continue;
      }
      pending_inc.new_pg_temp[pgid] = { primary, candidate };
      induced = true;
      break;
    }
    if (!induced) {
      err = -EAGAIN;
      ss << "not enough up OSDs in the cluster to force repeer";
      goto reply;
    }
  }
  goto update;
} else if (prefix == "osd pg-upmap" ||
           prefix == "osd rm-pg-upmap" ||
           prefix == "osd pg-upmap-items" ||
           prefix == "osd rm-pg-upmap-items") {
  // Explicit pg-upmap management: pin a PG's raw mapping (pg-upmap) or
  // remap individual OSDs within it (pg-upmap-items), and the matching
  // removal commands.  Requires all clients to understand pg-upmap.
  if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
    ss << "min_compat_client "
       << ceph_release_name(osdmap.require_min_compat_client)
       << " < luminous, which is required for pg-upmap. "
       << "Try 'ceph osd set-require-min-compat-client luminous' "
       << "before using the new interface";
    err = -EPERM;
    goto reply;
  }
  err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
  if (err == -EAGAIN)
    goto wait;
  if (err < 0)
    goto reply;
  string pgidstr;
  if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
    ss << "unable to parse 'pgid' value '"
       << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
    err = -EINVAL;
    goto reply;
  }
  pg_t pgid;
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    err = -EINVAL;
    goto reply;
  }
  if (!osdmap.pg_exists(pgid)) {
    ss << "pg " << pgid << " does not exist";
    err = -ENOENT;
    goto reply;
  }
  if (pending_inc.old_pools.count(pgid.pool())) {
    // note: replies with the (non-zero) err as the command return code
    ss << "pool of " << pgid << " is pending removal";
    err = -ENOENT;
    getline(ss, rs);
    wait_for_finished_proposal(op,
      new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
    return true;
  }

  // Normalize the four command prefixes into one enum so the two
  // switches below can share structure.
  enum {
    OP_PG_UPMAP,
    OP_RM_PG_UPMAP,
    OP_PG_UPMAP_ITEMS,
    OP_RM_PG_UPMAP_ITEMS,
  } option;

  if (prefix == "osd pg-upmap") {
    option = OP_PG_UPMAP;
  } else if (prefix == "osd rm-pg-upmap") {
    option = OP_RM_PG_UPMAP;
  } else if (prefix == "osd pg-upmap-items") {
    option = OP_PG_UPMAP_ITEMS;
  } else {
    option = OP_RM_PG_UPMAP_ITEMS;
  }

  // check pending upmap changes: if this pgid already has an un-committed
  // upmap change in pending_inc, retry after the current proposal lands
  // rather than clobbering it.
  switch (option) {
  case OP_PG_UPMAP: // fall through
  case OP_RM_PG_UPMAP:
    if (pending_inc.new_pg_upmap.count(pgid) ||
        pending_inc.old_pg_upmap.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on "
               << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    break;

  case OP_PG_UPMAP_ITEMS: // fall through
  case OP_RM_PG_UPMAP_ITEMS:
    if (pending_inc.new_pg_upmap_items.count(pgid) ||
        pending_inc.old_pg_upmap_items.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on "
               << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    break;

  default:
    ceph_abort_msg("invalid option");
  }

  switch (option) {
  case OP_PG_UPMAP:
    {
      // full mapping: the supplied osd list replaces the PG's raw mapping
      vector<int64_t> id_vec;
      if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
        ss << "unable to parse 'id' value(s) '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "'";
        err = -EINVAL;
        goto reply;
      }

      // mapping must satisfy min_size <= |osds| <= size for the pool
      int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
      if ((int)id_vec.size() < pool_min_size) {
        ss << "num of osds (" << id_vec.size() <<") < pool min size ("
           << pool_min_size << ")";
        err = -EINVAL;
        goto reply;
      }

      int pool_size = osdmap.get_pg_pool_size(pgid);
      if ((int)id_vec.size() > pool_size) {
        ss << "num of osds (" << id_vec.size() <<") > pool size ("
           << pool_size << ")";
        err = -EINVAL;
        goto reply;
      }

      vector<int32_t> new_pg_upmap;
      for (auto osd : id_vec) {
        if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist";
          err = -ENOENT;
          goto reply;
        }
        // silently drop duplicates (with a note in the reply)
        auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
        if (it != new_pg_upmap.end()) {
          ss << "osd." << osd << " already exists, ";
          continue;
        }
        new_pg_upmap.push_back(osd);
      }

      if (new_pg_upmap.empty()) {
        ss << "no valid upmap items(pairs) is specified";
        err = -EINVAL;
        goto reply;
      }

      pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
        new_pg_upmap.begin(), new_pg_upmap.end());
      ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
    }
    break;

  case OP_RM_PG_UPMAP:
    {
      pending_inc.old_pg_upmap.insert(pgid);
      ss << "clear " << pgid << " pg_upmap mapping";
    }
    break;

  case OP_PG_UPMAP_ITEMS:
    {
      // pairwise remap: id_vec is a flat list of (from, to) osd id pairs
      vector<int64_t> id_vec;
      if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
        ss << "unable to parse 'id' value(s) '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "'";
        err = -EINVAL;
        goto reply;
      }

      if (id_vec.size() % 2) {
        ss << "you must specify pairs of osd ids to be remapped";
        err = -EINVAL;
        goto reply;
      }

      int pool_size = osdmap.get_pg_pool_size(pgid);
      if ((int)(id_vec.size() / 2) > pool_size) {
        ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
           << pool_size << ")";
        err = -EINVAL;
        goto reply;
      }

      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      ostringstream items;
      items << "[";
      // note: 'p' advances twice per iteration -- *p++ reads 'from' and
      // the loop's ++p then steps past 'to'; safe because size is even
      for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
        int from = *p++;
        int to = *p;
        if (from == to) {
          ss << "from osd." << from << " == to osd." << to << ", ";
          continue;
        }
        if (!osdmap.exists(from)) {
          ss << "osd." << from << " does not exist";
          err = -ENOENT;
          goto reply;
        }
        if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
          ss << "osd." << to << " does not exist";
          err = -ENOENT;
          goto reply;
        }
        pair<int32_t,int32_t> entry = make_pair(from, to);
        auto it = std::find(new_pg_upmap_items.begin(),
          new_pg_upmap_items.end(), entry);
        if (it != new_pg_upmap_items.end()) {
          ss << "osd." << from << " -> osd." << to << " already exists, ";
          continue;
        }
        new_pg_upmap_items.push_back(entry);
        items << from << "->" << to << ",";
      }
      string out(items.str());
      out.resize(out.size() - 1); // drop last ','
      out += "]";

      if (new_pg_upmap_items.empty()) {
        ss << "no valid upmap items(pairs) is specified";
        err = -EINVAL;
        goto reply;
      }

      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      ss << "set " << pgid << " pg_upmap_items mapping to " << out;
    }
    break;

  case OP_RM_PG_UPMAP_ITEMS:
    {
      pending_inc.old_pg_upmap_items.insert(pgid);
      ss << "clear " << pgid << " pg_upmap_items mapping";
    }
    break;

  default:
    ceph_abort_msg("invalid option");
  }

  goto update;
} else if (prefix == "osd primary-affinity") {
  // Set an OSD's primary-affinity: a [0,1] weight biasing primary
  // selection, stored scaled to CEPH_OSD_MAX_PRIMARY_AFFINITY.
  int64_t id;
  if (!cmd_getval(cct, cmdmap, "id", id)) {
    ss << "invalid osd id value '"
       << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    err = -EINVAL;
    goto reply;
  }
  double w;
  if (!cmd_getval(cct, cmdmap, "weight", w)) {
    ss << "unable to parse 'weight' value '"
       << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
    err = -EINVAL;
    goto reply;
  }
  long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
  if (ww < 0L) {
    ss << "weight must be >= 0";
    err = -EINVAL;
    goto reply;
  }
  // primary-affinity needs clients that understand it (firefly+)
  if (osdmap.require_min_compat_client > 0 &&
      osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
    ss << "require_min_compat_client "
       << ceph_release_name(osdmap.require_min_compat_client)
       << " < firefly, which is required for primary-affinity";
    err = -EPERM;
    goto reply;
  }
  if (osdmap.exists(id)) {
    pending_inc.new_primary_affinity[id] = ww;
    // Use the std::hex/std::dec manipulators here: streaming the
    // ios::hex/ios::dec fmtflags *constants* (as this used to do) prints
    // their integer values instead of switching the output base.  This
    // now matches the "osd reweight" reply formatting.
    ss << "set osd." << id << " primary-affinity to " << w << " (" << std::hex << ww << std::dec << ")";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
    return true;
  } else {
    ss << "osd." << id << " does not exist";
    err = -ENOENT;
    goto reply;
  }
} else if (prefix == "osd reweight") {
  // Set an OSD's "in" weight (0.0..1.0), stored scaled to CEPH_OSD_IN.
  int64_t osd_id;
  if (!cmd_getval(cct, cmdmap, "id", osd_id)) {
    ss << "unable to parse osd id value '"
       << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    err = -EINVAL;
    goto reply;
  }
  double weight;
  if (!cmd_getval(cct, cmdmap, "weight", weight)) {
    ss << "unable to parse weight value '"
       << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
    err = -EINVAL;
    goto reply;
  }
  long scaled = (int)((double)CEPH_OSD_IN*weight);
  if (scaled < 0L) {
    ss << "weight must be >= 0";
    err = -EINVAL;
    goto reply;
  }
  if (!osdmap.exists(osd_id)) {
    ss << "osd." << osd_id << " does not exist";
    err = -ENOENT;
    goto reply;
  }
  pending_inc.new_weight[osd_id] = scaled;
  ss << "reweighted osd." << osd_id << " to " << weight << " (" << std::hex << scaled << std::dec << ")";
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd reweightn") {
  // Batch form of "osd reweight": apply a map of osd id -> scaled weight
  // parsed from the 'weights' argument in one proposal.
  map<int32_t, uint32_t> weights;
  err = parse_reweights(cct, cmdmap, osdmap, &weights);
  if (err) {
    ss << "unable to parse 'weights' value '"
       << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
    goto reply;
  }
  pending_inc.new_weight.insert(weights.begin(), weights.end());
  // NOTE(review): unlike "osd reweight", no getline(ss, rs) here, so any
  // message accumulated in 'ss' is not copied into the reply string --
  // confirm whether that omission is intentional.
  wait_for_finished_proposal(
    op,
    new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
  return true;
} else if (prefix == "osd lost") {
  // Mark a down OSD as permanently lost at the epoch it went down,
  // allowing recovery to proceed without it.  Destructive; requires
  // explicit confirmation.
  int64_t osd_id;
  if (!cmd_getval(cct, cmdmap, "id", osd_id)) {
    ss << "unable to parse osd id value '"
       << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    err = -EINVAL;
    goto reply;
  }
  bool sure = false;
  cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
  if (!sure) {
    ss << "are you SURE? this might mean real, permanent data loss. pass "
       "--yes-i-really-mean-it if you really do.";
    err = -EPERM;
    goto reply;
  }
  if (!osdmap.exists(osd_id)) {
    ss << "osd." << osd_id << " does not exist";
    err = -ENOENT;
    goto reply;
  }
  if (!osdmap.is_down(osd_id)) {
    ss << "osd." << osd_id << " is not down";
    err = -EBUSY;
    goto reply;
  }
  const epoch_t lost_at = osdmap.get_info(osd_id).down_at;
  pending_inc.new_lost[osd_id] = lost_at;
  ss << "marked osd lost in epoch " << lost_at;
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;

} else if (prefix == "osd destroy-actual" ||
           prefix == "osd purge-actual" ||
           prefix == "osd purge-new") {
  /* Destroying an OSD means that we don't expect to further make use of
   * the OSDs data (which may even become unreadable after this operation),
   * and that we are okay with scrubbing all its cephx keys and config-key
   * data (which may include lockbox keys, thus rendering the osd's data
   * unreadable).
   *
   * The OSD will not be removed. Instead, we will mark it as destroyed,
   * such that a subsequent call to `create` will not reuse the osd id.
   * This will play into being able to recreate the OSD, at the same
   * crush location, with minimal data movement.
   */

  // make sure authmon is writeable.
  if (!mon->authmon()->is_writeable()) {
    dout(10) << __func__ << " waiting for auth mon to be writeable for "
             << "osd destroy" << dendl;
    mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
    return false;
  }

  int64_t id;
  if (!cmd_getval(cct, cmdmap, "id", id)) {
    auto p = cmdmap.find("id");
    if (p == cmdmap.end()) {
      ss << "no osd id specified";
    } else {
      // FIX: close the opening quote around the unparseable value; this
      // previously appended an empty literal ("") leaving the quote open.
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    }
    err = -EINVAL;
    goto reply;
  }

  bool is_destroy = (prefix == "osd destroy-actual");
  if (!is_destroy) {
    ceph_assert("osd purge-actual" == prefix ||
                "osd purge-new" == prefix);
  }

  bool sure = false;
  cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
  if (!sure) {
    ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
       << "This will mean real, permanent data loss, as well "
       << "as deletion of cephx and lockbox keys. "
       << "Pass --yes-i-really-mean-it if you really do.";
    err = -EPERM;
    goto reply;
  } else if (!osdmap.exists(id)) {
    ss << "osd." << id << " does not exist";
    err = 0; // idempotent
    goto reply;
  } else if (osdmap.is_up(id)) {
    ss << "osd." << id << " is not `down`.";
    err = -EBUSY;
    goto reply;
  } else if (is_destroy && osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    err = 0;
    goto reply;
  }

  // purge-new only applies to OSDs still in the NEW (pre-boot) state
  if (prefix == "osd purge-new" &&
      (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
    ss << "osd." << id << " is not new";
    err = -EPERM;
    goto reply;
  }

  bool goto_reply = false;

  // plug paxos so the osdmap and authmon changes land in one proposal
  paxos->plug();
  if (is_destroy) {
    err = prepare_command_osd_destroy(id, ss);
    // we checked above that it should exist.
    ceph_assert(err != -ENOENT);
  } else {
    err = prepare_command_osd_purge(id, ss);
    if (err == -ENOENT) {
      err = 0;
      ss << "osd." << id << " does not exist.";
      goto_reply = true;
    }
  }
  paxos->unplug();

  if (err < 0 || goto_reply) {
    goto reply;
  }

  if (is_destroy) {
    ss << "destroyed osd." << id;
  } else {
    ss << "purged osd." << id;
  }

  getline(ss, rs);
  wait_for_finished_proposal(op,
    new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
  force_immediate_propose();
  return true;

} else if (prefix == "osd new") {

  // Create (or idempotently re-validate) an OSD with optional parameters
  // (uuid, cephx/lockbox keys, ...) supplied as a JSON blob in the
  // message payload.

  // make sure authmon is writeable.
  if (!mon->authmon()->is_writeable()) {
    dout(10) << __func__ << " waiting for auth mon to be writeable for "
             << "osd new" << dendl;
    mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
    return false;
  }

  map<string,string> param_map;

  bufferlist bl = m->get_data();
  string param_json = bl.to_str();
  dout(20) << __func__ << " osd new json = " << param_json << dendl;

  err = get_json_str_map(param_json, ss, &param_map);
  if (err < 0)
    goto reply;

  dout(20) << __func__ << " osd new params " << param_map << dendl;

  // plug paxos so osdmap and authmon updates commit together
  paxos->plug();
  err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
  paxos->unplug();

  if (err < 0) {
    goto reply;
  }

  if (f) {
    f->flush(rdata);
  } else {
    rdata.append(ss);
  }

  // note: *positive* EEXIST (not -EEXIST) is the "already exists"
  // signal from prepare_command_osd_new; negative errors returned above
  if (err == EEXIST) {
    // idempotent operation
    err = 0;
    goto reply;
  }

  wait_for_finished_proposal(op,
    new Monitor::C_Command(mon, op, 0, rs, rdata,
                           get_last_committed() + 1));
  force_immediate_propose();
  return true;

} else if (prefix == "osd create") {
  // Legacy OSD creation: optionally honor a requested id, but only when a
  // uuid is also supplied (so retries of the same create are idempotent).

  // optional id provided?
  int64_t id = -1, cmd_id = -1;
  if (cmd_getval(cct, cmdmap, "id", cmd_id)) {
    if (cmd_id < 0) {
      ss << "invalid osd id value '" << cmd_id << "'";
      err = -EINVAL;
      goto reply;
    }
    dout(10) << " osd create got id " << cmd_id << dendl;
  }

  uuid_d uuid;
  string uuidstr;
  if (cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
    if (!uuid.parse(uuidstr.c_str())) {
      ss << "invalid uuid value '" << uuidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    // we only care about the id if we also have the uuid, to
    // ensure the operation's idempotency.
    id = cmd_id;
  }

  int32_t new_id = -1;
  err = prepare_command_osd_create(id, uuid, &new_id, ss);
  if (err < 0) {
    if (err == -EAGAIN) {
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    // a check has failed; reply to the user.
    goto reply;
  }

  // emit the allocated id, formatted or plain (used on both paths below)
  auto emit_new_id = [&] {
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", new_id);
      f->close_section();
      f->flush(rdata);
    } else {
      ss << new_id;
      rdata.append(ss);
    }
  };

  if (err == EEXIST) {
    // this is an idempotent operation; we can go ahead and reply.
    emit_new_id();
    err = 0;
    goto reply;
  }

  string empty_device_class;
  do_osd_create(id, uuid, empty_device_class, &new_id);

  emit_new_id();
  wait_for_finished_proposal(op,
    new Monitor::C_Command(mon, op, 0, rs, rdata,
                           get_last_committed() + 1));
  return true;

} else if (prefix == "osd blacklist clear") {
  // Drop every committed blacklist entry and any entries queued in the
  // current pending increment.
  pending_inc.new_blacklist.clear();
  std::list<std::pair<entity_addr_t,utime_t > > current;
  osdmap.get_blacklist(&current);
  for (const auto& [blacklisted_addr, expire] : current) {
    pending_inc.old_blacklist.push_back(blacklisted_addr);
  }
  ss << " removed all blacklist entries";
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd blacklist") {
  // Add or remove a client address on the OSD blacklist.
  string addrstr;
  cmd_getval(cct, cmdmap, "addr", addrstr);
  entity_addr_t addr;
  if (!addr.parse(addrstr.c_str(), 0)) {
    ss << "unable to parse address " << addrstr;
    err = -EINVAL;
    goto reply;
  }
  else {
    if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
      // always blacklist type ANY
      addr.set_type(entity_addr_t::TYPE_ANY);
    } else {
      addr.set_type(entity_addr_t::TYPE_LEGACY);
    }

    string blacklistop;
    cmd_getval(cct, cmdmap, "blacklistop", blacklistop);
    if (blacklistop == "add") {
      utime_t expires = ceph_clock_now();
      double d;
      // default one hour
      cmd_getval(cct, cmdmap, "expire", d,
        g_conf()->mon_osd_blacklist_default_expire);
      expires += d;

      pending_inc.new_blacklist[addr] = expires;

      {
        // cancel any pending un-blacklisting request too
        auto it = std::find(pending_inc.old_blacklist.begin(),
          pending_inc.old_blacklist.end(), addr);
        if (it != pending_inc.old_blacklist.end()) {
          pending_inc.old_blacklist.erase(it);
        }
      }

      ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else if (blacklistop == "rm") {
      // remove from the committed map (via old_blacklist) or cancel a
      // pending add, whichever applies
      if (osdmap.is_blacklisted(addr) ||
          pending_inc.new_blacklist.count(addr)) {
        if (osdmap.is_blacklisted(addr))
          pending_inc.old_blacklist.push_back(addr);
        else
          pending_inc.new_blacklist.erase(addr);
        ss << "un-blacklisting " << addr;
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
      ss << addr << " isn't blacklisted";
      err = 0;
      goto reply;
    }
    // NOTE(review): an unrecognized blacklistop falls through here without
    // setting err or replying -- presumably unreachable because the
    // command schema restricts the value; confirm against the MonCommands
    // definition.
  }
} else if (prefix == "osd pool mksnap") {
  // Create a pool (managed) snapshot; rejected for pools already using
  // unmanaged (librados self-managed) snapshots or acting as cache tiers.
  string pool_name;
  cmd_getval(cct, cmdmap, "pool", pool_name);
  int64_t pool_id = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool_id < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    err = -ENOENT;
    goto reply;
  }
  string snap_name;
  cmd_getval(cct, cmdmap, "snap", snap_name);
  const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
  if (p->is_unmanaged_snaps_mode()) {
    ss << "pool " << pool_name << " is in unmanaged snaps mode";
    err = -EINVAL;
    goto reply;
  }
  if (p->snap_exists(snap_name.c_str())) {
    ss << "pool " << pool_name << " snap " << snap_name << " already exists";
    err = 0; // idempotent
    goto reply;
  }
  if (p->is_tier()) {
    ss << "pool " << pool_name << " is a cache tier";
    err = -EINVAL;
    goto reply;
  }
  // Work against the pending copy of the pool, seeding it from the
  // committed state if this proposal has not touched the pool yet.
  pg_pool_t *pending = nullptr;
  if (pending_inc.new_pools.count(pool_id)) {
    pending = &pending_inc.new_pools[pool_id];
  } else {
    pending = &pending_inc.new_pools[pool_id];
    *pending = *p;
  }
  if (pending->snap_exists(snap_name.c_str())) {
    // an identical snap is already queued in this pending increment
    ss << "pool " << pool_name << " snap " << snap_name << " already exists";
  } else {
    pending->add_snap(snap_name.c_str(), ceph_clock_now());
    pending->set_snap_epoch(pending_inc.epoch);
    ss << "created pool " << pool_name << " snap " << snap_name;
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd pool rmsnap") {
  // Remove a managed pool snapshot (idempotent if it is already gone).
  string pool_name;
  cmd_getval(cct, cmdmap, "pool", pool_name);
  int64_t pool_id = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool_id < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    err = -ENOENT;
    goto reply;
  }
  string snap_name;
  cmd_getval(cct, cmdmap, "snap", snap_name);
  const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
  if (p->is_unmanaged_snaps_mode()) {
    ss << "pool " << pool_name << " is in unmanaged snaps mode";
    err = -EINVAL;
    goto reply;
  }
  if (!p->snap_exists(snap_name.c_str())) {
    ss << "pool " << pool_name << " snap " << snap_name << " does not exist";
    err = 0; // idempotent
    goto reply;
  }
  // Mutate the pending copy of the pool, seeding it from the committed
  // state if this is the first change to the pool in this proposal.
  pg_pool_t *pending = nullptr;
  if (pending_inc.new_pools.count(pool_id)) {
    pending = &pending_inc.new_pools[pool_id];
  } else {
    pending = &pending_inc.new_pools[pool_id];
    *pending = *p;
  }
  snapid_t sn = pending->snap_exists(snap_name.c_str());
  if (sn) {
    pending->remove_snap(sn);
    pending->set_snap_epoch(pending_inc.epoch);
    ss << "removed pool " << pool_name << " snap " << snap_name;
  } else {
    ss << "already removed pool " << pool_name << " snap " << snap_name;
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd pool create") {
  // Create a new replicated or erasure-coded pool, resolving the crush
  // rule / EC profile / expected object count plumbing before delegating
  // to prepare_new_pool().
  int64_t pg_num, pg_num_min;
  int64_t pgp_num;
  cmd_getval(cct, cmdmap, "pg_num", pg_num, int64_t(0));
  cmd_getval(cct, cmdmap, "pgp_num", pgp_num, pg_num);
  cmd_getval(cct, cmdmap, "pg_num_min", pg_num_min, int64_t(0));

  string pool_type_str;
  cmd_getval(cct, cmdmap, "pool_type", pool_type_str);
  if (pool_type_str.empty())
    pool_type_str = g_conf().get_val<string>("osd_pool_default_type");

  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
  if (pool_id >= 0) {
    // pool already exists: succeed idempotently if the type matches,
    // reject otherwise (a pool cannot change type)
    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
    if (pool_type_str != p->get_type_name()) {
      ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
      err = -EINVAL;
    } else {
      ss << "pool '" << poolstr << "' already exists";
      err = 0;
    }
    goto reply;
  }

  int pool_type;
  if (pool_type_str == "replicated") {
    pool_type = pg_pool_t::TYPE_REPLICATED;
  } else if (pool_type_str == "erasure") {
    pool_type = pg_pool_t::TYPE_ERASURE;
  } else {
    ss << "unknown pool type '" << pool_type_str << "'";
    err = -EINVAL;
    goto reply;
  }

  bool implicit_rule_creation = false;
  int64_t expected_num_objects = 0;
  string rule_name;
  cmd_getval(cct, cmdmap, "rule", rule_name);
  string erasure_code_profile;
  cmd_getval(cct, cmdmap, "erasure_code_profile", erasure_code_profile);

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    if (erasure_code_profile == "")
      erasure_code_profile = "default";
    //handle the erasure code profile
    if (erasure_code_profile == "default") {
      if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
        if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
          dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
          goto wait;
        }

        // queue creation of the default profile and retry this command
        // once the proposal carrying it has committed
        map<string,string> profile_map;
        err = osdmap.get_erasure_code_profile_default(cct,
                                                      profile_map,
                                                      &ss);
        if (err)
          goto reply;
        dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
        pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
        goto wait;
      }
    }
    if (rule_name == "") {
      implicit_rule_creation = true;
      if (erasure_code_profile == "default") {
        rule_name = "erasure-code";
      } else {
        dout(1) << "implicitly use rule named after the pool: "
                << poolstr << dendl;
        rule_name = poolstr;
      }
    }
    cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
               expected_num_objects, int64_t(0));
  } else {
    //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
    //     and put expected_num_objects to rule field
    if (erasure_code_profile != "") { // cmd is from CLI
      if (rule_name != "") {
        string interr;
        expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
        if (interr.length()) {
          ss << "error parsing integer value '" << rule_name << "': " << interr;
          err = -EINVAL;
          goto reply;
        }
      }
      rule_name = erasure_code_profile;
    } else { // cmd is well-formed
      cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
                 expected_num_objects, int64_t(0));
    }
  }

  // an explicitly named rule must already exist (or be about to, -EAGAIN)
  if (!implicit_rule_creation && rule_name != "") {
    int rule;
    err = get_crush_rule(rule_name, &rule, &ss);
    if (err == -EAGAIN) {
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    if (err)
      goto reply;
  }

  if (expected_num_objects < 0) {
    ss << "'expected_num_objects' must be non-negative";
    err = -EINVAL;
    goto reply;
  }

  // filestore-specific sanity checks around directory merge/split tuning
  if (expected_num_objects > 0 &&
      cct->_conf->osd_objectstore == "filestore" &&
      cct->_conf->filestore_merge_threshold > 0) {
    ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
    err = -EINVAL;
    goto reply;
  }

  if (expected_num_objects == 0 &&
      cct->_conf->osd_objectstore == "filestore" &&
      cct->_conf->filestore_merge_threshold < 0) {
    // advisory only; does not block creation
    int osds = osdmap.get_num_osds();
    if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
      ss << "For better initial performance on pools expected to store a "
         << "large number of objects, consider supplying the "
         << "expected_num_objects parameter when creating the pool.\n";
    }
  }

  // fast_read: -1 = pool default, 0 = off, >0 = on
  int64_t fast_read_param;
  cmd_getval(cct, cmdmap, "fast_read", fast_read_param, int64_t(-1));
  FastReadType fast_read = FAST_READ_DEFAULT;
  if (fast_read_param == 0)
    fast_read = FAST_READ_OFF;
  else if (fast_read_param > 0)
    fast_read = FAST_READ_ON;

  int64_t repl_size = 0;
  cmd_getval(cct, cmdmap, "size", repl_size);
  int64_t target_size_bytes = 0;
  double target_size_ratio = 0.0;
  cmd_getval(cct, cmdmap, "target_size_bytes", target_size_bytes);
  cmd_getval(cct, cmdmap, "target_size_ratio", target_size_ratio);

  err = prepare_new_pool(poolstr,
                         -1, // default crush rule
                         rule_name,
                         pg_num, pgp_num, pg_num_min,
                         repl_size, target_size_bytes, target_size_ratio,
                         erasure_code_profile, pool_type,
                         (uint64_t)expected_num_objects,
                         fast_read,
                         &ss);
  if (err < 0) {
    switch(err) {
    case -EEXIST:
      // falls out of the switch and replies via the proposal below with
      // retcode 0 -- pool creation is idempotent
      ss << "pool '" << poolstr << "' already exists";
      break;
    case -EAGAIN:
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    case -ERANGE:
      goto reply;
    default:
      goto reply;
      break;
    }
  } else {
    ss << "pool '" << poolstr << "' created";
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;

} else if (prefix == "osd pool delete" ||
           prefix == "osd pool rm") {
  // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
  string pool_name, pool_name_again;
  cmd_getval(cct, cmdmap, "pool", pool_name);
  cmd_getval(cct, cmdmap, "pool2", pool_name_again);
  int64_t pool_id = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool_id < 0) {
    ss << "pool '" << pool_name << "' does not exist";
    err = 0; // idempotent
    goto reply;
  }

  bool really_mean_it = false;
  cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it", really_mean_it);
  bool not_faking = false;
  cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it_not_faking", not_faking);
  // deletion requires the pool name twice plus one of the confirmation flags
  const bool confirmed =
    (pool_name_again == pool_name) && (really_mean_it || not_faking);
  if (!confirmed) {
    ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << pool_name
       << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
       << "followed by --yes-i-really-really-mean-it.";
    err = -EPERM;
    goto reply;
  }
  err = _prepare_remove_pool(pool_id, &ss, really_mean_it);
  if (err == -EAGAIN) {
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }
  if (err < 0)
    goto reply;
  goto update;
} else if (prefix == "osd pool rename") {
  // Rename a pool; idempotent when the source is gone but the destination
  // already exists.
  string src_name, dest_name;
  cmd_getval(cct, cmdmap, "srcpool", src_name);
  cmd_getval(cct, cmdmap, "destpool", dest_name);
  int64_t pool_src = osdmap.lookup_pg_pool_name(src_name.c_str());
  int64_t pool_dst = osdmap.lookup_pg_pool_name(dest_name.c_str());

  if (pool_src < 0 && pool_dst >= 0) {
    // src pool doesn't exist, dst pool does exist: to ensure idempotency
    // of operations, assume this rename succeeded, as it is not changing
    // the current state.  Make sure we output something understandable
    // for whoever is issuing the command, if they are paying attention,
    // in case it was not intentional; or to avoid a "wtf?" and a bug
    // report in case it was intentional, while expecting a failure.
    ss << "pool '" << src_name << "' does not exist; pool '"
       << dest_name << "' does -- assuming successful rename";
    err = 0;
    goto reply;
  }
  if (pool_src < 0) {
    ss << "unrecognized pool '" << src_name << "'";
    err = -ENOENT;
    goto reply;
  }
  if (pool_dst >= 0) {
    // source pool exists and so does the destination pool
    ss << "pool '" << dest_name << "' already exists";
    err = -EEXIST;
    goto reply;
  }

  int ret = _prepare_rename_pool(pool_src, dest_name);
  if (ret == 0) {
    ss << "pool '" << src_name << "' renamed to '" << dest_name << "'";
  } else {
    ss << "failed to rename pool '" << src_name << "' to '" << dest_name << "': "
       << cpp_strerror(ret);
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
                                                get_last_committed() + 1));
  return true;

11785 } else if (prefix == "osd pool set") {
11786 err = prepare_command_pool_set(cmdmap, ss);
11787 if (err == -EAGAIN)
11788 goto wait;
11789 if (err < 0)
11790 goto reply;
11791
11792 getline(ss, rs);
11793 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11794 get_last_committed() + 1));
11795 return true;
11796 } else if (prefix == "osd tier add") {
11797 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11798 if (err == -EAGAIN)
11799 goto wait;
11800 if (err)
11801 goto reply;
11802 string poolstr;
11803 cmd_getval(cct, cmdmap, "pool", poolstr);
11804 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11805 if (pool_id < 0) {
11806 ss << "unrecognized pool '" << poolstr << "'";
11807 err = -ENOENT;
11808 goto reply;
11809 }
11810 string tierpoolstr;
11811 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11812 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11813 if (tierpool_id < 0) {
11814 ss << "unrecognized pool '" << tierpoolstr << "'";
11815 err = -ENOENT;
11816 goto reply;
11817 }
11818 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11819 ceph_assert(p);
11820 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11821 ceph_assert(tp);
11822
11823 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11824 goto reply;
11825 }
11826
11827 // make sure new tier is empty
11828 string force_nonempty;
11829 cmd_getval(cct, cmdmap, "force_nonempty", force_nonempty);
11830 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
11831 if (pstats && pstats->stats.sum.num_objects != 0 &&
11832 force_nonempty != "--force-nonempty") {
11833 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
11834 err = -ENOTEMPTY;
11835 goto reply;
11836 }
11837 if (tp->is_erasure()) {
11838 ss << "tier pool '" << tierpoolstr
11839 << "' is an ec pool, which cannot be a tier";
11840 err = -ENOTSUP;
11841 goto reply;
11842 }
11843 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
11844 ((force_nonempty != "--force-nonempty") ||
11845 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
11846 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
11847 err = -ENOTEMPTY;
11848 goto reply;
11849 }
11850 // go
11851 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11852 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11853 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11854 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11855 return true;
11856 }
11857 np->tiers.insert(tierpool_id);
11858 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11859 ntp->tier_of = pool_id;
11860 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
11861 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11862 get_last_committed() + 1));
11863 return true;
11864 } else if (prefix == "osd tier remove" ||
11865 prefix == "osd tier rm") {
11866 string poolstr;
11867 cmd_getval(cct, cmdmap, "pool", poolstr);
11868 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11869 if (pool_id < 0) {
11870 ss << "unrecognized pool '" << poolstr << "'";
11871 err = -ENOENT;
11872 goto reply;
11873 }
11874 string tierpoolstr;
11875 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11876 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11877 if (tierpool_id < 0) {
11878 ss << "unrecognized pool '" << tierpoolstr << "'";
11879 err = -ENOENT;
11880 goto reply;
11881 }
11882 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11883 ceph_assert(p);
11884 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11885 ceph_assert(tp);
11886
11887 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
11888 goto reply;
11889 }
11890
11891 if (p->tiers.count(tierpool_id) == 0) {
11892 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11893 err = 0;
11894 goto reply;
11895 }
11896 if (tp->tier_of != pool_id) {
11897 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
11898 << osdmap.get_pool_name(tp->tier_of) << "': "
11899 // be scary about it; this is an inconsistency and bells must go off
11900 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11901 err = -EINVAL;
11902 goto reply;
11903 }
11904 if (p->read_tier == tierpool_id) {
11905 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
11906 err = -EBUSY;
11907 goto reply;
11908 }
11909 // go
11910 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11911 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11912 if (np->tiers.count(tierpool_id) == 0 ||
11913 ntp->tier_of != pool_id ||
11914 np->read_tier == tierpool_id) {
11915 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11916 return true;
11917 }
11918 np->tiers.erase(tierpool_id);
11919 ntp->clear_tier();
11920 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11921 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11922 get_last_committed() + 1));
11923 return true;
11924 } else if (prefix == "osd tier set-overlay") {
11925 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11926 if (err == -EAGAIN)
11927 goto wait;
11928 if (err)
11929 goto reply;
11930 string poolstr;
11931 cmd_getval(cct, cmdmap, "pool", poolstr);
11932 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11933 if (pool_id < 0) {
11934 ss << "unrecognized pool '" << poolstr << "'";
11935 err = -ENOENT;
11936 goto reply;
11937 }
11938 string overlaypoolstr;
11939 cmd_getval(cct, cmdmap, "overlaypool", overlaypoolstr);
11940 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
11941 if (overlaypool_id < 0) {
11942 ss << "unrecognized pool '" << overlaypoolstr << "'";
11943 err = -ENOENT;
11944 goto reply;
11945 }
11946 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11947 ceph_assert(p);
11948 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11949 ceph_assert(overlay_p);
11950 if (p->tiers.count(overlaypool_id) == 0) {
11951 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
11952 err = -EINVAL;
11953 goto reply;
11954 }
11955 if (p->read_tier == overlaypool_id) {
11956 err = 0;
11957 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11958 goto reply;
11959 }
11960 if (p->has_read_tier()) {
11961 ss << "pool '" << poolstr << "' has overlay '"
11962 << osdmap.get_pool_name(p->read_tier)
11963 << "'; please remove-overlay first";
11964 err = -EINVAL;
11965 goto reply;
11966 }
11967
11968 // go
11969 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11970 np->read_tier = overlaypool_id;
11971 np->write_tier = overlaypool_id;
11972 np->set_last_force_op_resend(pending_inc.epoch);
11973 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
11974 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
11975 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11976 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
11977 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
11978 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11979 get_last_committed() + 1));
11980 return true;
11981 } else if (prefix == "osd tier remove-overlay" ||
11982 prefix == "osd tier rm-overlay") {
11983 string poolstr;
11984 cmd_getval(cct, cmdmap, "pool", poolstr);
11985 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11986 if (pool_id < 0) {
11987 ss << "unrecognized pool '" << poolstr << "'";
11988 err = -ENOENT;
11989 goto reply;
11990 }
11991 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11992 ceph_assert(p);
11993 if (!p->has_read_tier()) {
11994 err = 0;
11995 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11996 goto reply;
11997 }
11998
11999 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12000 goto reply;
12001 }
12002
12003 // go
12004 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12005 if (np->has_read_tier()) {
12006 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12007 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12008 nop->set_last_force_op_resend(pending_inc.epoch);
12009 }
12010 if (np->has_write_tier()) {
12011 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12012 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12013 nop->set_last_force_op_resend(pending_inc.epoch);
12014 }
12015 np->clear_read_tier();
12016 np->clear_write_tier();
12017 np->set_last_force_op_resend(pending_inc.epoch);
12018 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12019 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12020 get_last_committed() + 1));
12021 return true;
12022 } else if (prefix == "osd tier cache-mode") {
12023 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12024 if (err == -EAGAIN)
12025 goto wait;
12026 if (err)
12027 goto reply;
12028 string poolstr;
12029 cmd_getval(cct, cmdmap, "pool", poolstr);
12030 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12031 if (pool_id < 0) {
12032 ss << "unrecognized pool '" << poolstr << "'";
12033 err = -ENOENT;
12034 goto reply;
12035 }
12036 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12037 ceph_assert(p);
12038 if (!p->is_tier()) {
12039 ss << "pool '" << poolstr << "' is not a tier";
12040 err = -EINVAL;
12041 goto reply;
12042 }
12043 string modestr;
12044 cmd_getval(cct, cmdmap, "mode", modestr);
12045 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12046 if (mode < 0) {
12047 ss << "'" << modestr << "' is not a valid cache mode";
12048 err = -EINVAL;
12049 goto reply;
12050 }
12051
12052 bool sure = false;
12053 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12054
12055 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12056 mode != pg_pool_t::CACHEMODE_NONE &&
12057 mode != pg_pool_t::CACHEMODE_PROXY &&
12058 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12059 !sure) {
12060 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12061 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12062 err = -EPERM;
12063 goto reply;
12064 }
12065
12066 // pool already has this cache-mode set and there are no pending changes
12067 if (p->cache_mode == mode &&
12068 (pending_inc.new_pools.count(pool_id) == 0 ||
12069 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12070 ss << "set cache-mode for pool '" << poolstr << "'"
12071 << " to " << pg_pool_t::get_cache_mode_name(mode);
12072 err = 0;
12073 goto reply;
12074 }
12075
12076 /* Mode description:
12077 *
12078 * none: No cache-mode defined
12079 * forward: Forward all reads and writes to base pool
12080 * writeback: Cache writes, promote reads from base pool
12081 * readonly: Forward writes to base pool
12082 * readforward: Writes are in writeback mode, Reads are in forward mode
12083 * proxy: Proxy all reads and writes to base pool
12084 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12085 *
12086 * Hence, these are the allowed transitions:
12087 *
12088 * none -> any
12089 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12090 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12091 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12092 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
12093 * writeback -> readforward || readproxy || forward || proxy
12094 * readonly -> any
12095 */
12096
12097 // We check if the transition is valid against the current pool mode, as
12098 // it is the only committed state thus far. We will blantly squash
12099 // whatever mode is on the pending state.
12100
12101 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
12102 (mode != pg_pool_t::CACHEMODE_FORWARD &&
12103 mode != pg_pool_t::CACHEMODE_PROXY &&
12104 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12105 mode != pg_pool_t::CACHEMODE_READPROXY)) {
12106 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
12107 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
12108 << "' pool; only '"
12109 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
12110 << "','"
12111 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
12112 << "','"
12113 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
12114 << "','"
12115 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
12116 << "' allowed.";
12117 err = -EINVAL;
12118 goto reply;
12119 }
12120 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
12121 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12122 mode != pg_pool_t::CACHEMODE_FORWARD &&
12123 mode != pg_pool_t::CACHEMODE_PROXY &&
12124 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12125
12126 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
12127 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12128 mode != pg_pool_t::CACHEMODE_FORWARD &&
12129 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12130 mode != pg_pool_t::CACHEMODE_PROXY)) ||
12131
12132 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
12133 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12134 mode != pg_pool_t::CACHEMODE_FORWARD &&
12135 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12136 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12137
12138 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
12139 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12140 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12141 mode != pg_pool_t::CACHEMODE_PROXY &&
12142 mode != pg_pool_t::CACHEMODE_READPROXY))) {
12143
12144 const pool_stat_t* pstats =
12145 mon->mgrstatmon()->get_pool_stat(pool_id);
12146
12147 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
12148 ss << "unable to set cache-mode '"
12149 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
12150 << "': dirty objects found";
12151 err = -EBUSY;
12152 goto reply;
12153 }
12154 }
12155 // go
12156 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12157 np->cache_mode = mode;
12158 // set this both when moving to and from cache_mode NONE. this is to
12159 // capture legacy pools that were set up before this flag existed.
12160 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
12161 ss << "set cache-mode for pool '" << poolstr
12162 << "' to " << pg_pool_t::get_cache_mode_name(mode);
12163 if (mode == pg_pool_t::CACHEMODE_NONE) {
12164 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
12165 ceph_assert(base_pool);
12166 if (base_pool->read_tier == pool_id ||
12167 base_pool->write_tier == pool_id)
12168 ss <<" (WARNING: pool is still configured as read or write tier)";
12169 }
12170 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12171 get_last_committed() + 1));
12172 return true;
12173 } else if (prefix == "osd tier add-cache") {
12174 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12175 if (err == -EAGAIN)
12176 goto wait;
12177 if (err)
12178 goto reply;
12179 string poolstr;
12180 cmd_getval(cct, cmdmap, "pool", poolstr);
12181 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12182 if (pool_id < 0) {
12183 ss << "unrecognized pool '" << poolstr << "'";
12184 err = -ENOENT;
12185 goto reply;
12186 }
12187 string tierpoolstr;
12188 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
12189 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12190 if (tierpool_id < 0) {
12191 ss << "unrecognized pool '" << tierpoolstr << "'";
12192 err = -ENOENT;
12193 goto reply;
12194 }
12195 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12196 ceph_assert(p);
12197 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12198 ceph_assert(tp);
12199
12200 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12201 goto reply;
12202 }
12203
12204 int64_t size = 0;
12205 if (!cmd_getval(cct, cmdmap, "size", size)) {
12206 ss << "unable to parse 'size' value '"
12207 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
12208 err = -EINVAL;
12209 goto reply;
12210 }
12211 // make sure new tier is empty
12212 const pool_stat_t *pstats =
12213 mon->mgrstatmon()->get_pool_stat(tierpool_id);
12214 if (pstats && pstats->stats.sum.num_objects != 0) {
12215 ss << "tier pool '" << tierpoolstr << "' is not empty";
12216 err = -ENOTEMPTY;
12217 goto reply;
12218 }
12219 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
12220 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12221 if (mode < 0) {
12222 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
12223 err = -EINVAL;
12224 goto reply;
12225 }
12226 HitSet::Params hsp;
12227 auto& cache_hit_set_type =
12228 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
12229 if (cache_hit_set_type == "bloom") {
12230 BloomHitSet::Params *bsp = new BloomHitSet::Params;
12231 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
12232 hsp = HitSet::Params(bsp);
12233 } else if (cache_hit_set_type == "explicit_hash") {
12234 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
12235 } else if (cache_hit_set_type == "explicit_object") {
12236 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
12237 } else {
12238 ss << "osd tier cache default hit set type '"
12239 << cache_hit_set_type << "' is not a known type";
12240 err = -EINVAL;
12241 goto reply;
12242 }
12243 // go
12244 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12245 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12246 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12247 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12248 return true;
12249 }
12250 np->tiers.insert(tierpool_id);
12251 np->read_tier = np->write_tier = tierpool_id;
12252 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12253 np->set_last_force_op_resend(pending_inc.epoch);
12254 ntp->set_last_force_op_resend(pending_inc.epoch);
12255 ntp->tier_of = pool_id;
12256 ntp->cache_mode = mode;
12257 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
12258 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
12259 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
12260 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
12261 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
12262 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
12263 ntp->hit_set_params = hsp;
12264 ntp->target_max_bytes = size;
12265 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
12266 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12267 get_last_committed() + 1));
12268 return true;
12269 } else if (prefix == "osd pool set-quota") {
12270 string poolstr;
12271 cmd_getval(cct, cmdmap, "pool", poolstr);
12272 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12273 if (pool_id < 0) {
12274 ss << "unrecognized pool '" << poolstr << "'";
12275 err = -ENOENT;
12276 goto reply;
12277 }
12278
12279 string field;
12280 cmd_getval(cct, cmdmap, "field", field);
12281 if (field != "max_objects" && field != "max_bytes") {
12282 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
12283 err = -EINVAL;
12284 goto reply;
12285 }
12286
12287 // val could contain unit designations, so we treat as a string
12288 string val;
12289 cmd_getval(cct, cmdmap, "val", val);
12290 string tss;
12291 int64_t value;
12292 if (field == "max_objects") {
12293 value = strict_sistrtoll(val.c_str(), &tss);
12294 } else if (field == "max_bytes") {
12295 value = strict_iecstrtoll(val.c_str(), &tss);
12296 } else {
12297 ceph_abort_msg("unrecognized option");
12298 }
12299 if (!tss.empty()) {
12300 ss << "error parsing value '" << val << "': " << tss;
12301 err = -EINVAL;
12302 goto reply;
12303 }
12304
12305 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
12306 if (field == "max_objects") {
12307 pi->quota_max_objects = value;
12308 } else if (field == "max_bytes") {
12309 pi->quota_max_bytes = value;
12310 } else {
12311 ceph_abort_msg("unrecognized option");
12312 }
12313 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
12314 rs = ss.str();
12315 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12316 get_last_committed() + 1));
12317 return true;
12318 } else if (prefix == "osd pool application enable" ||
12319 prefix == "osd pool application disable" ||
12320 prefix == "osd pool application set" ||
12321 prefix == "osd pool application rm") {
12322 err = prepare_command_pool_application(prefix, cmdmap, ss);
12323 if (err == -EAGAIN) {
12324 goto wait;
12325 } else if (err < 0) {
12326 goto reply;
12327 } else {
12328 goto update;
12329 }
12330 } else if (prefix == "osd force-create-pg") {
12331 pg_t pgid;
12332 string pgidstr;
12333 cmd_getval(cct, cmdmap, "pgid", pgidstr);
12334 if (!pgid.parse(pgidstr.c_str())) {
12335 ss << "invalid pgid '" << pgidstr << "'";
12336 err = -EINVAL;
12337 goto reply;
12338 }
12339 if (!osdmap.pg_exists(pgid)) {
12340 ss << "pg " << pgid << " should not exist";
12341 err = -ENOENT;
12342 goto reply;
12343 }
12344 bool sure = false;
12345 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12346 if (!sure) {
12347 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
12348 << "that the cluster will give up ever trying to recover the lost data. Do this "
12349 << "only if you are certain that all copies of the PG are in fact lost and you are "
12350 << "willing to accept that the data is permanently destroyed. Pass "
12351 << "--yes-i-really-mean-it to proceed.";
12352 err = -EPERM;
12353 goto reply;
12354 }
12355 bool creating_now;
12356 {
12357 std::lock_guard<std::mutex> l(creating_pgs_lock);
12358 auto emplaced = creating_pgs.pgs.emplace(pgid,
12359 make_pair(osdmap.get_epoch(),
12360 ceph_clock_now()));
12361 creating_now = emplaced.second;
12362 }
12363 if (creating_now) {
12364 ss << "pg " << pgidstr << " now creating, ok";
12365 // set the pool's CREATING flag so that (1) the osd won't ignore our
12366 // create message and (2) we won't propose any future pg_num changes
12367 // until after the PG has been instantiated.
12368 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
12369 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
12370 }
12371 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
12372 err = 0;
12373 goto update;
12374 } else {
12375 ss << "pg " << pgid << " already creating";
12376 err = 0;
12377 goto reply;
12378 }
12379 } else {
12380 err = -EINVAL;
12381 }
12382
12383 reply:
12384 getline(ss, rs);
12385 if (err < 0 && rs.length() == 0)
12386 rs = cpp_strerror(err);
12387 mon->reply_command(op, err, rs, rdata, get_last_committed());
12388 return ret;
12389
12390 update:
12391 getline(ss, rs);
12392 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12393 get_last_committed() + 1));
12394 return true;
12395
12396 wait:
12397 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12398 return true;
12399 }
12400
12401 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
12402 {
12403 op->mark_osdmon_event(__func__);
12404
12405 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12406 MonSession *session = op->get_session();
12407 if (!session) {
12408 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12409 return true;
12410 }
12411
12412 switch (m->op) {
12413 case POOL_OP_CREATE_UNMANAGED_SNAP:
12414 case POOL_OP_DELETE_UNMANAGED_SNAP:
12415 {
12416 const std::string* pool_name = nullptr;
12417 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
12418 if (pg_pool != nullptr) {
12419 pool_name = &osdmap.get_pool_name(m->pool);
12420 }
12421
12422 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
12423 session->entity_name, session->caps,
12424 session->get_peer_socket_addr(),
12425 pool_name)) {
12426 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
12427 << "privileges. message: " << *m << std::endl
12428 << "caps: " << session->caps << dendl;
12429 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12430 return true;
12431 }
12432 }
12433 break;
12434 default:
12435 if (!session->is_capable("osd", MON_CAP_W)) {
12436 dout(0) << "got pool op from entity with insufficient privileges. "
12437 << "message: " << *m << std::endl
12438 << "caps: " << session->caps << dendl;
12439 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12440 return true;
12441 }
12442 break;
12443 }
12444
12445 return false;
12446 }
12447
12448 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
12449 {
12450 op->mark_osdmon_event(__func__);
12451 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12452
12453 if (enforce_pool_op_caps(op)) {
12454 return true;
12455 }
12456
12457 if (m->fsid != mon->monmap->fsid) {
12458 dout(0) << __func__ << " drop message on fsid " << m->fsid
12459 << " != " << mon->monmap->fsid << " for " << *m << dendl;
12460 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12461 return true;
12462 }
12463
12464 if (m->op == POOL_OP_CREATE)
12465 return preprocess_pool_op_create(op);
12466
12467 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
12468 if (p == nullptr) {
12469 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
12470 if (m->op == POOL_OP_DELETE) {
12471 _pool_op_reply(op, 0, osdmap.get_epoch());
12472 } else {
12473 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
12474 }
12475 return true;
12476 }
12477
12478 // check if the snap and snapname exist
12479 bool snap_exists = false;
12480 if (p->snap_exists(m->name.c_str()))
12481 snap_exists = true;
12482
12483 switch (m->op) {
12484 case POOL_OP_CREATE_SNAP:
12485 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
12486 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12487 return true;
12488 }
12489 if (snap_exists) {
12490 _pool_op_reply(op, 0, osdmap.get_epoch());
12491 return true;
12492 }
12493 return false;
12494 case POOL_OP_CREATE_UNMANAGED_SNAP:
12495 if (p->is_pool_snaps_mode()) {
12496 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12497 return true;
12498 }
12499 return false;
12500 case POOL_OP_DELETE_SNAP:
12501 if (p->is_unmanaged_snaps_mode()) {
12502 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12503 return true;
12504 }
12505 if (!snap_exists) {
12506 _pool_op_reply(op, 0, osdmap.get_epoch());
12507 return true;
12508 }
12509 return false;
12510 case POOL_OP_DELETE_UNMANAGED_SNAP:
12511 if (p->is_pool_snaps_mode()) {
12512 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12513 return true;
12514 }
12515 if (p->is_removed_snap(m->snapid)) {
12516 _pool_op_reply(op, 0, osdmap.get_epoch());
12517 return true;
12518 }
12519 return false;
12520 case POOL_OP_DELETE:
12521 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
12522 _pool_op_reply(op, 0, osdmap.get_epoch());
12523 return true;
12524 }
12525 return false;
12526 case POOL_OP_AUID_CHANGE:
12527 return false;
12528 default:
12529 ceph_abort();
12530 break;
12531 }
12532
12533 return false;
12534 }
12535
12536 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
12537 {
12538 op->mark_osdmon_event(__func__);
12539 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12540 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
12541 if (pool >= 0) {
12542 _pool_op_reply(op, 0, osdmap.get_epoch());
12543 return true;
12544 }
12545
12546 return false;
12547 }
12548
// Prepare phase for pool ops: validate the request against the committed
// pool state, then apply snapshot changes to a projected copy of the pool
// and stage it in pending_inc.  Returns true when a proposal has been
// queued (reply sent after commit), false when an immediate reply was
// already sent.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  // pool create/delete have their own dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  // committed (not pending) view of the pool, used for validation below
  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    // pool-managed snaps: answer immediately when the op is an idempotent
    // no-op (snap already exists / already gone) or the mode is wrong
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;  // real work to do; fall out to the projected-state pass
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending copy if one is already
  // staged this epoch, otherwise from the committed map
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked here against the *projected* state, which may differ
  // from the committed state validated above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // apply the actual change to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
               << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        // record the removal so OSDs can trim the snap's objects
        pending_inc.new_removed_snaps[m->pool].insert(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      uint64_t snapid;  // out-param, filled in by add_unmanaged_snap()
      pp.add_unmanaged_snap(snapid);
      // send the newly allocated snapid back to the client
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      // a snapid beyond the pool's sequence was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(m->snapid);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; reject explicitly
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool for the next map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with ret and any reply_data) once the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
12696
12697 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
12698 {
12699 op->mark_osdmon_event(__func__);
12700 int err = prepare_new_pool(op);
12701 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
12702 return true;
12703 }
12704
12705 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
12706 ostream *ss)
12707 {
12708 const string& poolstr = osdmap.get_pool_name(pool_id);
12709
12710 // If the Pool is in use by CephFS, refuse to delete it
12711 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12712 if (pending_fsmap.pool_in_use(pool_id)) {
12713 *ss << "pool '" << poolstr << "' is in use by CephFS";
12714 return -EBUSY;
12715 }
12716
12717 if (pool.tier_of >= 0) {
12718 *ss << "pool '" << poolstr << "' is a tier of '"
12719 << osdmap.get_pool_name(pool.tier_of) << "'";
12720 return -EBUSY;
12721 }
12722 if (!pool.tiers.empty()) {
12723 *ss << "pool '" << poolstr << "' has tiers";
12724 for(auto tier : pool.tiers) {
12725 *ss << " " << osdmap.get_pool_name(tier);
12726 }
12727 return -EBUSY;
12728 }
12729
12730 if (!g_conf()->mon_allow_pool_delete) {
12731 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
12732 return -EPERM;
12733 }
12734
12735 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
12736 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
12737 return -EPERM;
12738 }
12739
12740 *ss << "pool '" << poolstr << "' removed";
12741 return 0;
12742 }
12743
12744 /**
12745 * Check if it is safe to add a tier to a base pool
12746 *
12747 * @return
12748 * True if the operation should proceed, false if we should abort here
12749 * (abort doesn't necessarily mean error, could be idempotency)
12750 */
12751 bool OSDMonitor::_check_become_tier(
12752 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
12753 const int64_t base_pool_id, const pg_pool_t *base_pool,
12754 int *err,
12755 ostream *ss) const
12756 {
12757 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
12758 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12759
12760 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12761 if (pending_fsmap.pool_in_use(tier_pool_id)) {
12762 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
12763 *err = -EBUSY;
12764 return false;
12765 }
12766
12767 if (base_pool->tiers.count(tier_pool_id)) {
12768 ceph_assert(tier_pool->tier_of == base_pool_id);
12769 *err = 0;
12770 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
12771 << base_pool_name << "'";
12772 return false;
12773 }
12774
12775 if (base_pool->is_tier()) {
12776 *ss << "pool '" << base_pool_name << "' is already a tier of '"
12777 << osdmap.get_pool_name(base_pool->tier_of) << "', "
12778 << "multiple tiers are not yet supported.";
12779 *err = -EINVAL;
12780 return false;
12781 }
12782
12783 if (tier_pool->has_tiers()) {
12784 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
12785 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
12786 it != tier_pool->tiers.end(); ++it)
12787 *ss << "'" << osdmap.get_pool_name(*it) << "',";
12788 *ss << " multiple tiers are not yet supported.";
12789 *err = -EINVAL;
12790 return false;
12791 }
12792
12793 if (tier_pool->is_tier()) {
12794 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
12795 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
12796 *err = -EINVAL;
12797 return false;
12798 }
12799
12800 *err = 0;
12801 return true;
12802 }
12803
12804
12805 /**
12806 * Check if it is safe to remove a tier from this base pool
12807 *
12808 * @return
12809 * True if the operation should proceed, false if we should abort here
12810 * (abort doesn't necessarily mean error, could be idempotency)
12811 */
12812 bool OSDMonitor::_check_remove_tier(
12813 const int64_t base_pool_id, const pg_pool_t *base_pool,
12814 const pg_pool_t *tier_pool,
12815 int *err, ostream *ss) const
12816 {
12817 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12818
12819 // Apply CephFS-specific checks
12820 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12821 if (pending_fsmap.pool_in_use(base_pool_id)) {
12822 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
12823 // If the underlying pool is erasure coded and does not allow EC
12824 // overwrites, we can't permit the removal of the replicated tier that
12825 // CephFS relies on to access it
12826 *ss << "pool '" << base_pool_name <<
12827 "' does not allow EC overwrites and is in use by CephFS"
12828 " via its tier";
12829 *err = -EBUSY;
12830 return false;
12831 }
12832
12833 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
12834 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
12835 "tier is still in use as a writeback cache. Change the cache "
12836 "mode and flush the cache before removing it";
12837 *err = -EBUSY;
12838 return false;
12839 }
12840 }
12841
12842 *err = 0;
12843 return true;
12844 }
12845
/**
 * Queue deletion of a pool in pending_inc, along with cleanup of all
 * auxiliary OSDMap state that references it (pg_temp, primary_temp,
 * pg_upmap, pg_upmap_items and crush choose_args).
 *
 * @param pool     id of the pool to remove
 * @param ss       receives a human-readable success/failure message
 * @param no_fake  if true, really delete even when mon_fake_pool_delete
 *                 is set (that option otherwise turns deletion into a
 *                 rename to "<name>.<id>.DELETED")
 * @return 0 on success (including "already pending removal"), -EAGAIN if
 *         the caller should retry after the pending map commits, or a
 *         negative errno from _check_remove_pool
 */
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  // check the committed pool state first
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // idempotent: removal may already be queued in this increment
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // mon_fake_pool_delete turns deletion into a rename (a soft delete),
  // unless the caller explicitly asked for a real one via no_fake
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
12961
12962 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
12963 {
12964 dout(10) << "_prepare_rename_pool " << pool << dendl;
12965 if (pending_inc.old_pools.count(pool)) {
12966 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
12967 return -ENOENT;
12968 }
12969 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
12970 p != pending_inc.new_pool_names.end();
12971 ++p) {
12972 if (p->second == newname && p->first != pool) {
12973 return -EEXIST;
12974 }
12975 }
12976
12977 pending_inc.new_pool_names[pool] = newname;
12978 return 0;
12979 }
12980
12981 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
12982 {
12983 op->mark_osdmon_event(__func__);
12984 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12985 ostringstream ss;
12986 int ret = _prepare_remove_pool(m->pool, &ss, false);
12987 if (ret == -EAGAIN) {
12988 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12989 return true;
12990 }
12991 if (ret < 0)
12992 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
12993 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
12994 pending_inc.epoch));
12995 return true;
12996 }
12997
12998 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
12999 int ret, epoch_t epoch, bufferlist *blp)
13000 {
13001 op->mark_osdmon_event(__func__);
13002 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13003 dout(20) << "_pool_op_reply " << ret << dendl;
13004 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13005 ret, epoch, get_last_committed(), blp);
13006 mon->send_reply(op, reply);
13007 }
13008
13009 void OSDMonitor::convert_pool_priorities(void)
13010 {
13011 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
13012 int64_t max_prio = 0;
13013 int64_t min_prio = 0;
13014 for (const auto &i : osdmap.get_pools()) {
13015 const auto &pool = i.second;
13016
13017 if (pool.opts.is_set(key)) {
13018 int64_t prio;
13019 pool.opts.get(key, &prio);
13020 if (prio > max_prio)
13021 max_prio = prio;
13022 if (prio < min_prio)
13023 min_prio = prio;
13024 }
13025 }
13026 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
13027 dout(20) << __func__ << " nothing to fix" << dendl;
13028 return;
13029 }
13030 // Current pool priorities exceeds new maximum
13031 for (const auto &i : osdmap.get_pools()) {
13032 const auto pool_id = i.first;
13033 pg_pool_t pool = i.second;
13034
13035 int64_t prio = 0;
13036 pool.opts.get(key, &prio);
13037 int64_t n;
13038
13039 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
13040 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13041 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
13042 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
13043 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13044 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
13045 } else {
13046 continue;
13047 }
13048 if (n == 0) {
13049 pool.opts.unset(key);
13050 } else {
13051 pool.opts.set(key, static_cast<int64_t>(n));
13052 }
13053 dout(10) << __func__ << " pool " << pool_id
13054 << " recovery_priority adjusted "
13055 << prio << " to " << n << dendl;
13056 pool.last_change = pending_inc.epoch;
13057 pending_inc.new_pools[pool_id] = pool;
13058 }
13059 }