]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
3acafbb82e682b812e468749815ba786ad9ea2e4
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MRoute.h"
57 #include "messages/MMonGetPurgedSnaps.h"
58 #include "messages/MMonGetPurgedSnapsReply.h"
59
60 #include "common/TextTable.h"
61 #include "common/Timer.h"
62 #include "common/ceph_argparse.h"
63 #include "common/perf_counters.h"
64 #include "common/PriorityCache.h"
65 #include "common/strtol.h"
66 #include "common/numa.h"
67
68 #include "common/config.h"
69 #include "common/errno.h"
70
71 #include "erasure-code/ErasureCodePlugin.h"
72 #include "compressor/Compressor.h"
73 #include "common/Checksummer.h"
74
75 #include "include/compat.h"
76 #include "include/ceph_assert.h"
77 #include "include/stringify.h"
78 #include "include/util.h"
79 #include "common/cmdparse.h"
80 #include "include/str_list.h"
81 #include "include/str_map.h"
82 #include "include/scope_guard.h"
83 #include "perfglue/heap_profiler.h"
84
85 #include "auth/cephx/CephxKeyServer.h"
86 #include "osd/OSDCap.h"
87
88 #include "json_spirit/json_spirit_reader.h"
89
90 #include <boost/algorithm/string/predicate.hpp>
91
92 using std::dec;
93 using std::hex;
94 using std::list;
95 using std::map;
96 using std::make_pair;
97 using std::ostringstream;
98 using std::pair;
99 using std::set;
100 using std::string;
101 using std::stringstream;
102 using std::to_string;
103 using std::vector;
104
105 using ceph::bufferlist;
106 using ceph::decode;
107 using ceph::encode;
108 using ceph::ErasureCodeInterfaceRef;
109 using ceph::ErasureCodePluginRegistry;
110 using ceph::ErasureCodeProfile;
111 using ceph::Formatter;
112 using ceph::JSONFormatter;
113 using ceph::make_message;
114
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes used by this service (see get_store_prefixes()):
// pg-creation state, per-OSD metadata blobs, and snapshot bookkeeping.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
119
120 /*
121
122 OSD snapshot metadata
123 ---------------------
124
125 -- starting with mimic, removed in octopus --
126
127 "removed_epoch_%llu_%08lx" % (pool, epoch)
128 -> interval_set<snapid_t>
129
130 "removed_snap_%llu_%016llx" % (pool, last_snap)
131 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
132
133
134 -- starting with mimic --
135
136 "purged_snap_%llu_%016llx" % (pool, last_snap)
137 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
138
139 - note that the {removed,purged}_snap put the last snap in they key so
140 that we can use forward iteration only to search for an epoch in an
141 interval. e.g., to test if epoch N is removed/purged, we'll find a key
142 >= N that either does or doesn't contain the given snap.
143
144
145 -- starting with octopus --
146
147 "purged_epoch_%08lx" % epoch
148 -> map<int64_t,interval_set<snapid_t>>
149
150 */
151 using namespace TOPNSPC::common;
152 namespace {
153
// Base adapter exposing an OSDMonitor-owned LRU cache to the PriorityCache
// manager.  It tracks the bytes assigned and committed to this cache per
// priority level; subclasses report their actual usage via _get_used_bytes()
// and a display name via get_cache_name().
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;  // owning monitor service; outlives this shim
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};  // assigned per priority
  int64_t committed_bytes = 0;  // chunk-aligned total granted by the manager
  double cache_ratio = 0;       // our share of the total cache budget

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes actually in use by the underlying LRU (subclass-specific).
  virtual uint64_t _get_used_bytes() const = 0;

  // Request enough additional bytes at priority `pri` to cover current
  // usage.  Only PRI1 is handled; other priorities yield -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // ask only for the shortfall, never a negative amount
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assigned bytes across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the current assignment to the manager's chunk size and record it.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Age-bin bookkeeping is not used by the osdmon caches; no-op stubs.
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  // Human-readable name used in cache-manager logging (subclass-specific).
  virtual string get_cache_name() const = 0;
};
227
228 struct IncCache : public OSDMemCache {
229 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
230
231 virtual uint64_t _get_used_bytes() const {
232 return osdmon->inc_osd_cache.get_bytes();
233 }
234
235 virtual string get_cache_name() const {
236 return "OSDMap Inc Cache";
237 }
238
239 uint64_t _get_num_osdmaps() const {
240 return osdmon->inc_osd_cache.get_size();
241 }
242 };
243
244 struct FullCache : public OSDMemCache {
245 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
246
247 virtual uint64_t _get_used_bytes() const {
248 return osdmon->full_osd_cache.get_bytes();
249 }
250
251 virtual string get_cache_name() const {
252 return "OSDMap Full Cache";
253 }
254
255 uint64_t _get_num_osdmaps() const {
256 return osdmon->full_osd_cache.get_size();
257 }
258 };
259
// Shared handles to the priority-cache shims; created by the OSDMonitor
// constructor and registered with the PriorityCache manager when autotuning
// is enabled (see register_cache_with_pcm()).
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits on per-pool application metadata (count of applications, keys per
// application, and key/value length) — presumably enforced by the pool
// application command handlers; confirm at call sites.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
266
267 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
268 // Note: this doesn't include support for the application tag match
269 if ((grant.spec.allow & OSD_CAP_W) != 0) {
270 auto& match = grant.match;
271 if (match.is_match_all()) {
272 return true;
273 } else if (pool_name != nullptr &&
274 !match.pool_namespace.pool_name.empty() &&
275 match.pool_namespace.pool_name == *pool_name) {
276 return true;
277 }
278 }
279 return false;
280 }
281
// Decide whether a client (entity_name) may issue unmanaged-snapshot pool
// ops.  Permission is granted if either (a) its mon caps allow the
// "osd pool op unmanaged-snap" command for this pool, or (b) its OSD caps
// from the auth db grant write access to the pool (or to all pools).  When
// pool_name is nullptr the pool no longer exists, so an unrestricted mon
// cap is required.  Returns false on any auth-db lookup/parse failure.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // first: explicit mon-cap command grant
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
         CommandArgs{} /* pool DNE, require unrestricted cap */ :
         CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // otherwise: fall back to the entity's OSD caps from the auth db
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into concrete grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
349
350 } // anonymous namespace
351
// Record that PG `ps` of this pool reported `last_epoch_clean`, keeping two
// invariants up to date: `floor` (minimum lec over all reporting PGs) and
// `next_missing` (first PG that has never reported, i.e. whose entry is 0).
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
                                 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  // grow (or shrink) the per-PG table to the pool's current pg_num;
  // new entries start at 0 ("never reported")
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the minimum; rescan for the new minimum
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this was the first unreported PG; advance past any contiguous run of
  // PGs that have already reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
385
386 void LastEpochClean::remove_pool(uint64_t pool)
387 {
388 report_by_pool.erase(pool);
389 }
390
391 void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
392 epoch_t last_epoch_clean)
393 {
394 auto& lec = report_by_pool[pg.pool()];
395 return lec.report(pg_num, pg.ps(), last_epoch_clean);
396 }
397
398 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
399 {
400 auto floor = latest.get_epoch();
401 for (auto& pool : latest.get_pools()) {
402 auto reported = report_by_pool.find(pool.first);
403 if (reported == report_by_pool.end()) {
404 return 0;
405 }
406 if (reported->second.next_missing < pool.second.get_pg_num()) {
407 return 0;
408 }
409 if (reported->second.floor < floor) {
410 floor = reported->second.floor;
411 }
412 }
413 return floor;
414 }
415
416 void LastEpochClean::dump(Formatter *f) const
417 {
418 f->open_array_section("per_pool");
419
420 for (auto& [pool, lec] : report_by_pool) {
421 f->open_object_section("pool");
422 f->dump_unsigned("poolid", pool);
423 f->dump_unsigned("floor", lec.floor);
424 f->close_section();
425 }
426
427 f->close_section();
428 }
429
// Completion for an asynchronous osdmap mapping job: once the mapping for
// `epoch` has been computed, refresh the creating-PGs state and notify
// pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the mapping job was kicked off (for log timing)
  epoch_t epoch;  // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted; skip the update in that case
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
447
448 #undef dout_prefix
449 #define dout_prefix _prefix(_dout, mon, osdmap)
450 static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
451 return *_dout << "mon." << mon.name << "@" << mon.rank
452 << "(" << mon.get_state_name()
453 << ").osd e" << osdmap.get_epoch() << " ";
454 }
455
// Construct the OSD monitor service: size the inc/full osdmap LRU caches
// from mon_osd_cache_size, create the priority-cache shims, register as a
// config observer, and read the initial cache-tuning settings.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // invalid autotune sizes: keep the fixed-size LRU caches instead
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
479
480 const char **OSDMonitor::get_tracked_conf_keys() const
481 {
482 static const char* KEYS[] = {
483 "mon_memory_target",
484 "mon_memory_autotune",
485 "rocksdb_cache_size",
486 NULL
487 };
488 return KEYS;
489 }
490
// Config observer callback: react to runtime changes of the memory-tuning
// options listed in get_tracked_conf_keys().
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      // sizes rejected; previous cache settings remain in effect
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
512
// Bring priority-cache autotuning in line with the current value of
// mon_memory_autotune: disabling drops the manager; enabling (re-)registers
// the osdmon caches with a fresh manager.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
534
// Apply new mon_memory_target / rocksdb_cache_size values at runtime:
// recompute the kv/inc/full cache ratios and, when autotuning is active,
// push the new min/max/target levels into the priority-cache manager.
// Returns 0 on success, -EINVAL on invalid sizes (previous values restored).
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation when computing the cap
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
            << " target: " << target
            << " min: " << min
            << " max: " << max
            << dendl;
  }
  return 0;
}
594
// Read the initial cache-sizing config.  When autotuning is enabled,
// validate mon_memory_target/min and seed the inc/full LRUs with the
// minimum byte budget (pcm registration happens later, on first use).
// Returns 0 on success or when autotune is off, -EINVAL on bad sizes.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
619
620 bool OSDMonitor::_have_pending_crush()
621 {
622 return pending_inc.crush.length() > 0;
623 }
624
// Direct reference to the committed osdmap's crush map (no pending
// changes applied).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
629
630 CrushWrapper OSDMonitor::_get_pending_crush()
631 {
632 bufferlist bl;
633 if (pending_inc.crush.length())
634 bl = pending_inc.crush;
635 else
636 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
637
638 auto p = bl.cbegin();
639 CrushWrapper crush;
640 crush.decode(p);
641 return crush;
642 }
643
// Build osdmap epoch 1 for a brand-new cluster: start from an mkfs-provided
// seed map if one was stashed, set default flags and full/nearfull ratios,
// pick the required OSD release and min-compat client, and stage the encoded
// map in pending_inc for the first paxos commit.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // mkfs supplied a seed osdmap; adopt it but force our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_reef")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
      derr << __func__ << " mon_debug_no_require_reef and quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    } else {
      derr << __func__ << " mon_debug_no_require_reef=true" << dendl;
      newmap.require_osd_release = ceph_release_t::quincy;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::reef;
  }

  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
702
703 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
704 {
705 s.insert(service_name);
706 s.insert(OSD_PG_CREATING_PREFIX);
707 s.insert(OSD_METADATA_PREFIX);
708 s.insert(OSD_SNAP_PREFIX);
709 }
710
// Bring the in-memory osdmap up to date with the paxos-committed state:
// locate/load the newest stashed full map, then replay the remaining
// incrementals, persisting full maps along the way, and finally react to
// up/down transitions and stretch-mode changes.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // remember the up count so the stretch-mode checks at the bottom can tell
  // whether this update brought OSDs up
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // jump ahead to the newest stashed full map instead of replaying
    // incrementals from our current (older) epoch
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // dump the locally-built (divergent) map for debugging
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        // dump the canonical map we just adopted
        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long catch-up doesn't build one huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd);
      }
    }
    for (auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
        // manually marked out, so drop it
        osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // reconcile the down->out timeout tracking with the new map
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
        marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
        mon.set_degraded_stretch_mode();
        dout(20) << "prev_num_up_osd: " << prev_num_up_osd << dendl;
        dout(20) << "osdmap.num_up_osd: " << osdmap.num_up_osd << dendl;
        dout(20) << "osdmap.num_osd: " << osdmap.num_osd << dendl;
        dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") << dendl;
        // enter recovery only when OSDs came up, enough of them are up,
        // and no mon bucket is dead
        if (prev_num_up_osd < osdmap.num_up_osd &&
            (osdmap.num_up_osd / (double)osdmap.num_osd) >
            cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") &&
            mon.dead_mon_buckets.size() == 0) {
          // TODO: This works for 2-site clusters when the OSD maps are appropriately
          // trimmed and everything is "normal" but not if you have a lot of out OSDs
          // you're ignoring or in some really degenerate failure cases

          dout(10) << "Enabling recovery stretch mode in this map" << dendl;
          mon.go_recovery_stretch_mode();
        }
      } else {
        mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
        (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
988
// Create the PriorityCache manager and register the rocksdb kv cache plus
// the inc/full osdmap caches with it, computing min/max/target memory from
// the configured sizes.  Returns 0 on success, -EINVAL when sizes are bad,
// rocksdb exposes no priority cache, or the ratios cannot be set.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
1038
1039 int OSDMonitor::_set_cache_ratios()
1040 {
1041 double old_cache_kv_ratio = cache_kv_ratio;
1042
1043 // Set the cache ratios for kv(rocksdb), inc and full caches
1044 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
1045 if (cache_kv_ratio >= 1.0) {
1046 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
1047 << ") must be in range [0,<1.0]."
1048 << dendl;
1049 cache_kv_ratio = old_cache_kv_ratio;
1050 return -EINVAL;
1051 }
1052 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
1053 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
1054 inc_cache->set_cache_ratio(cache_inc_ratio);
1055 full_cache->set_cache_ratio(cache_full_ratio);
1056
1057 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
1058 << " inc ratio " << cache_inc_ratio
1059 << " full ratio " << cache_full_ratio
1060 << dendl;
1061 return 0;
1062 }
1063
1064 void OSDMonitor::start_mapping()
1065 {
1066 // initiate mapping job
1067 if (mapping_job) {
1068 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1069 << dendl;
1070 mapping_job->abort();
1071 }
1072 if (!osdmap.get_pools().empty()) {
1073 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1074 mapping_job = mapping.start_update(osdmap, mapper,
1075 g_conf()->mon_osd_mapping_pgs_per_chunk);
1076 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1077 << " at " << fin->start << dendl;
1078 mapping_job->set_finish_event(fin);
1079 } else {
1080 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1081 mapping_job = nullptr;
1082 }
1083 }
1084
1085 void OSDMonitor::update_msgr_features()
1086 {
1087 const int types[] = {
1088 entity_name_t::TYPE_OSD,
1089 entity_name_t::TYPE_CLIENT,
1090 entity_name_t::TYPE_MDS,
1091 entity_name_t::TYPE_MON
1092 };
1093 for (int type : types) {
1094 uint64_t mask;
1095 uint64_t features = osdmap.get_features(type, &mask);
1096 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1097 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1098 ceph::net::Policy p = mon.messenger->get_policy(type);
1099 p.features_required = (p.features_required & ~mask) | features;
1100 mon.messenger->set_policy(type, p);
1101 }
1102 }
1103 }
1104
1105 void OSDMonitor::on_active()
1106 {
1107 update_logger();
1108
1109 if (mon.is_leader()) {
1110 mon.clog->debug() << "osdmap " << osdmap;
1111 if (!priority_convert) {
1112 // Only do this once at start-up
1113 convert_pool_priorities();
1114 priority_convert = true;
1115 }
1116 } else {
1117 list<MonOpRequestRef> ls;
1118 take_all_failures(ls);
1119 while (!ls.empty()) {
1120 MonOpRequestRef op = ls.front();
1121 op->mark_osdmon_event(__func__);
1122 dispatch(op);
1123 ls.pop_front();
1124 }
1125 }
1126 start_mapping();
1127 }
1128
void OSDMonitor::on_restart()
{
  // Forget per-OSD report timestamps; they will be repopulated as fresh
  // reports arrive after the restart.
  last_osd_report.clear();
}
1133
1134 void OSDMonitor::on_shutdown()
1135 {
1136 dout(10) << __func__ << dendl;
1137 if (mapping_job) {
1138 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1139 << dendl;
1140 mapping_job->abort();
1141 }
1142
1143 // discard failure info, waiters
1144 list<MonOpRequestRef> ls;
1145 take_all_failures(ls);
1146 ls.clear();
1147 }
1148
1149 void OSDMonitor::update_logger()
1150 {
1151 dout(10) << "update_logger" << dendl;
1152
1153 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1154 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1155 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1156 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1157 }
1158
1159 void OSDMonitor::create_pending()
1160 {
1161 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1162 pending_inc.fsid = mon.monmap->fsid;
1163 pending_metadata.clear();
1164 pending_metadata_rm.clear();
1165 pending_pseudo_purged_snaps.clear();
1166
1167 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1168
1169 // safety checks (this shouldn't really happen)
1170 {
1171 if (osdmap.backfillfull_ratio <= 0) {
1172 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1173 if (pending_inc.new_backfillfull_ratio > 1.0)
1174 pending_inc.new_backfillfull_ratio /= 100;
1175 dout(1) << __func__ << " setting backfillfull_ratio = "
1176 << pending_inc.new_backfillfull_ratio << dendl;
1177 }
1178 if (osdmap.full_ratio <= 0) {
1179 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1180 if (pending_inc.new_full_ratio > 1.0)
1181 pending_inc.new_full_ratio /= 100;
1182 dout(1) << __func__ << " setting full_ratio = "
1183 << pending_inc.new_full_ratio << dendl;
1184 }
1185 if (osdmap.nearfull_ratio <= 0) {
1186 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1187 if (pending_inc.new_nearfull_ratio > 1.0)
1188 pending_inc.new_nearfull_ratio /= 100;
1189 dout(1) << __func__ << " setting nearfull_ratio = "
1190 << pending_inc.new_nearfull_ratio << dendl;
1191 }
1192 }
1193 }
1194
/**
 * Compute the updated set of creating pgs, given the pending incremental
 * @p inc and the map @p nextmap it will produce.
 *
 * Works on a snapshot of creating_pgs (taken under creating_pgs_lock):
 * scans for pools added/removed by @p inc, drops pgs whose pool was
 * deleted or that have since been reported created, filters out pgs that
 * do not exist under @p nextmap, and then admits queued pgs up to
 * mon_osd_max_creating_pgs.  For octopus+ clusters it also advances each
 * creating pg's history/past_intervals to the new interval.
 *
 * @return the updated creating_pgs_t snapshot (caller persists it).
 */
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // Work on a private copy; creating_pgs is shared with other paths.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // Scan both the committed pools and the pools this incremental adds.
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // Deleted pools take their creating pgs (and last-epoch-clean state)
    // with them.
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue
  // Admit queued pg ranges until we hit the cap on concurrently-creating
  // pgs (mon_osd_max_creating_pgs, clamped to at least 1).
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // Take as many pgs from this pool's [start,end) range as the cap allows.
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      // Predicate: a pg is recoverable if at least min_size shards exist.
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	// Existing entry: detect an interval change between osdmap and
	// nextmap and, if found, record the new interval in its history.
	std::stringstream debug;
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  // Note a split if the pool's pg_num grows across the two maps.
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1381
/**
 * Pre-populate pending_inc.new_pg_temp for pgs whose mapping will change
 * in the pending epoch.  Either primes every pg ("all") or only the pgs
 * mapped to a small set of "interesting" OSDs, whichever is estimated to
 * be cheaper; both paths are bounded by mon_osd_prime_pg_temp_max_time.
 */
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // A new crush map or newly-up osds can move anything: consider all pgs.
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // new_state carries xor'd state bits: the UP bit on a currently-up osd
  // means it is being marked down in this epoch (see encode_pending's
  // "DOWN" logging of the same condition).
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // Weight increases can pull pgs toward the osd from anywhere.
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // Estimate the per-osd cost from the first osd's acting-pg count; if
    // priming osd-by-osd would touch too large a fraction of all pgs,
    // fall back to priming everything.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // Materialize the would-be next map so mappings can be compared.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // Prime every pg in parallel, capped by the configured wall-clock
    // budget; abort the job if it overruns.
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // Prime only the pgs mapped to the interesting osds, checking the
    // clock every `chunk` pgs to honor the same time budget.
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;  // already primed via another interesting osd
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1484
/**
 * If @p pgid's acting set would change under the pending map @p next,
 * record its current acting set as a pg_temp entry in pending_inc (or an
 * empty entry to clear pg_temp when next_up already equals next_acting).
 * Bails out early whenever priming cannot improve on the status quo.
 */
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;  // still being created; nothing worth preserving
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // Mapping under the committed map...
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // ...and under the pending (next) map.
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // An empty pg_temp entry clears any existing pg_temp mapping.
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // prime_pg_temp_lock serializes concurrent callers (the parallel
    // PrimeTempJob path) mutating pending_inc.new_pg_temp.
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1532
1533 /**
1534 * @note receiving a transaction in this function gives a fair amount of
1535 * freedom to the service implementation if it does need it. It shouldn't.
1536 */
1537 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1538 {
1539 dout(10) << "encode_pending e " << pending_inc.epoch
1540 << dendl;
1541
1542 if (do_prune(t)) {
1543 dout(1) << __func__ << " osdmap full prune encoded e"
1544 << pending_inc.epoch << dendl;
1545 }
1546
1547 // finalize up pending_inc
1548 pending_inc.modified = ceph_clock_now();
1549
1550 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1551 ceph_assert(r == 0);
1552
1553 if (mapping_job) {
1554 if (!mapping_job->is_done()) {
1555 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1556 << mapping_job.get() << " did not complete, "
1557 << mapping_job->shards << " left" << dendl;
1558 mapping_job->abort();
1559 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1560 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1561 << mapping_job.get() << " is prior epoch "
1562 << mapping.get_epoch() << dendl;
1563 } else {
1564 if (g_conf()->mon_osd_prime_pg_temp) {
1565 maybe_prime_pg_temp();
1566 }
1567 }
1568 } else if (g_conf()->mon_osd_prime_pg_temp) {
1569 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1570 << dendl;
1571 }
1572 mapping_job.reset();
1573
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1576 auto p = pending_inc.new_state.begin();
1577 while (p != pending_inc.new_state.end()) {
1578 if (p->second == 0) {
1579 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1580 p = pending_inc.new_state.erase(p);
1581 } else {
1582 if (p->second & CEPH_OSD_UP) {
1583 pending_inc.new_last_up_change = pending_inc.modified;
1584 }
1585 ++p;
1586 }
1587 }
1588 if (!pending_inc.new_up_client.empty()) {
1589 pending_inc.new_last_up_change = pending_inc.modified;
1590 }
1591 for (auto& i : pending_inc.new_weight) {
1592 if (i.first >= osdmap.max_osd) {
1593 if (i.second) {
1594 // new osd is already marked in
1595 pending_inc.new_last_in_change = pending_inc.modified;
1596 break;
1597 }
1598 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1599 // existing osd marked in or out
1600 pending_inc.new_last_in_change = pending_inc.modified;
1601 break;
1602 }
1603 }
1604
1605 {
1606 OSDMap tmp;
1607 tmp.deepish_copy_from(osdmap);
1608 tmp.apply_incremental(pending_inc);
1609
1610 // clean pg_temp mappings
1611 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1612
1613 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1614 {
1615 // check every upmapped pg for now
1616 // until we could reliably identify certain cases to ignore,
1617 // which is obviously the hard part TBD..
1618 vector<pg_t> pgs_to_check;
1619 tmp.get_upmap_pgs(&pgs_to_check);
1620 if (pgs_to_check.size() <
1621 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1622 // not enough pgs, do it inline
1623 tmp.clean_pg_upmaps(cct, &pending_inc);
1624 } else {
1625 CleanUpmapJob job(cct, tmp, pending_inc);
1626 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1627 job.wait();
1628 }
1629 }
1630
1631 // update creating pgs first so that we can remove the created pgid and
1632 // process the pool flag removal below in the same osdmap epoch.
1633 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1634 bufferlist creatings_bl;
1635 uint64_t features = CEPH_FEATURES_ALL;
1636 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1637 dout(20) << __func__ << " encoding pending pgs without octopus features"
1638 << dendl;
1639 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1640 }
1641 encode(pending_creatings, creatings_bl, features);
1642 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1643
1644 // remove any old (or incompat) POOL_CREATING flags
1645 for (auto& i : tmp.get_pools()) {
1646 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1647 // pre-nautilus OSDMaps shouldn't get this flag.
1648 if (pending_inc.new_pools.count(i.first)) {
1649 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1650 }
1651 }
1652 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1653 !pending_creatings.still_creating_pool(i.first)) {
1654 dout(10) << __func__ << " done creating pool " << i.first
1655 << ", clearing CREATING flag" << dendl;
1656 if (pending_inc.new_pools.count(i.first) == 0) {
1657 pending_inc.new_pools[i.first] = i.second;
1658 }
1659 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1660 }
1661 }
1662
1663 // collect which pools are currently affected by
1664 // the near/backfill/full osd(s),
1665 // and set per-pool near/backfill/full flag instead
1666 set<int64_t> full_pool_ids;
1667 set<int64_t> backfillfull_pool_ids;
1668 set<int64_t> nearfull_pool_ids;
1669 tmp.get_full_pools(cct,
1670 &full_pool_ids,
1671 &backfillfull_pool_ids,
1672 &nearfull_pool_ids);
1673 if (full_pool_ids.empty() ||
1674 backfillfull_pool_ids.empty() ||
1675 nearfull_pool_ids.empty()) {
1676 // normal case - no nearfull, backfillfull or full osds
1677 // try cancel any improper nearfull/backfillfull/full pool
1678 // flags first
1679 for (auto &pool: tmp.get_pools()) {
1680 auto p = pool.first;
1681 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1682 nearfull_pool_ids.empty()) {
1683 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1684 << "'s nearfull flag" << dendl;
1685 if (pending_inc.new_pools.count(p) == 0) {
1686 // load original pool info first!
1687 pending_inc.new_pools[p] = pool.second;
1688 }
1689 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1690 }
1691 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1692 backfillfull_pool_ids.empty()) {
1693 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1694 << "'s backfillfull flag" << dendl;
1695 if (pending_inc.new_pools.count(p) == 0) {
1696 pending_inc.new_pools[p] = pool.second;
1697 }
1698 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1699 }
1700 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1701 full_pool_ids.empty()) {
1702 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1703 // set by EQUOTA, skipping
1704 continue;
1705 }
1706 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1707 << "'s full flag" << dendl;
1708 if (pending_inc.new_pools.count(p) == 0) {
1709 pending_inc.new_pools[p] = pool.second;
1710 }
1711 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1712 }
1713 }
1714 }
1715 if (!full_pool_ids.empty()) {
1716 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1717 << " as full" << dendl;
1718 for (auto &p: full_pool_ids) {
1719 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1720 continue;
1721 }
1722 if (pending_inc.new_pools.count(p) == 0) {
1723 pending_inc.new_pools[p] = tmp.pools[p];
1724 }
1725 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1726 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1727 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1728 }
1729 // cancel FLAG_FULL for pools which are no longer full too
1730 for (auto &pool: tmp.get_pools()) {
1731 auto p = pool.first;
1732 if (full_pool_ids.count(p)) {
1733 // skip pools we have just marked as full above
1734 continue;
1735 }
1736 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1737 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1738 // don't touch if currently is not full
1739 // or is running out of quota (and hence considered as full)
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s full flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1748 }
1749 }
1750 if (!backfillfull_pool_ids.empty()) {
1751 for (auto &p: backfillfull_pool_ids) {
1752 if (full_pool_ids.count(p)) {
1753 // skip pools we have already considered as full above
1754 continue;
1755 }
1756 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1757 // make sure FLAG_FULL is truly set, so we are safe not
1758 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1759 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1760 continue;
1761 }
1762 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1763 // don't bother if pool is already marked as backfillfull
1764 continue;
1765 }
1766 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1767 << "'s as backfillfull" << dendl;
1768 if (pending_inc.new_pools.count(p) == 0) {
1769 pending_inc.new_pools[p] = tmp.pools[p];
1770 }
1771 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1772 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1773 }
1774 // cancel FLAG_BACKFILLFULL for pools
1775 // which are no longer backfillfull too
1776 for (auto &pool: tmp.get_pools()) {
1777 auto p = pool.first;
1778 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as backfillfull/full above
1780 continue;
1781 }
1782 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1783 // and don't touch if currently is not backfillfull
1784 continue;
1785 }
1786 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1787 << "'s backfillfull flag" << dendl;
1788 if (pending_inc.new_pools.count(p) == 0) {
1789 pending_inc.new_pools[p] = pool.second;
1790 }
1791 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1792 }
1793 }
1794 if (!nearfull_pool_ids.empty()) {
1795 for (auto &p: nearfull_pool_ids) {
1796 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1797 continue;
1798 }
1799 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1800 // make sure FLAG_FULL is truly set, so we are safe not
1801 // to set a extra (redundant) FLAG_NEARFULL flag
1802 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1803 continue;
1804 }
1805 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1806 // don't bother if pool is already marked as nearfull
1807 continue;
1808 }
1809 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1810 << "'s as nearfull" << dendl;
1811 if (pending_inc.new_pools.count(p) == 0) {
1812 pending_inc.new_pools[p] = tmp.pools[p];
1813 }
1814 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1815 }
1816 // cancel FLAG_NEARFULL for pools
1817 // which are no longer nearfull too
1818 for (auto &pool: tmp.get_pools()) {
1819 auto p = pool.first;
1820 if (full_pool_ids.count(p) ||
1821 backfillfull_pool_ids.count(p) ||
1822 nearfull_pool_ids.count(p)) {
1823 // skip pools we have just marked as
1824 // nearfull/backfillfull/full above
1825 continue;
1826 }
1827 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1828 // and don't touch if currently is not nearfull
1829 continue;
1830 }
1831 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1832 << "'s nearfull flag" << dendl;
1833 if (pending_inc.new_pools.count(p) == 0) {
1834 pending_inc.new_pools[p] = pool.second;
1835 }
1836 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1837 }
1838 }
1839
1840 // min_compat_client?
1841 if (!tmp.require_min_compat_client) {
1842 auto mv = tmp.get_min_compat_client();
1843 dout(1) << __func__ << " setting require_min_compat_client to currently "
1844 << "required " << mv << dendl;
1845 mon.clog->info() << "setting require_min_compat_client to currently "
1846 << "required " << mv;
1847 pending_inc.new_require_min_compat_client = mv;
1848 }
1849
1850 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1851 tmp.require_osd_release >= ceph_release_t::nautilus) {
1852 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1853 // add creating flags?
1854 for (auto& i : tmp.get_pools()) {
1855 if (pending_creatings.still_creating_pool(i.first)) {
1856 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1857 << dendl;
1858 if (pending_inc.new_pools.count(i.first) == 0) {
1859 pending_inc.new_pools[i.first] = i.second;
1860 }
1861 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1862 }
1863 }
1864 // adjust blocklist items to all be TYPE_ANY
1865 for (auto& i : tmp.blocklist) {
1866 auto a = i.first;
1867 a.set_type(entity_addr_t::TYPE_ANY);
1868 pending_inc.new_blocklist[a] = i.second;
1869 pending_inc.old_blocklist.push_back(i.first);
1870 }
1871 }
1872
1873 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1874 tmp.require_osd_release >= ceph_release_t::octopus) {
1875 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1876
1877 // adjust obsoleted cache modes
1878 for (auto& [poolid, pi] : tmp.pools) {
1879 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1880 if (pending_inc.new_pools.count(poolid) == 0) {
1881 pending_inc.new_pools[poolid] = pi;
1882 }
1883 dout(10) << __func__ << " switching pool " << poolid
1884 << " cachemode from forward -> proxy" << dendl;
1885 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1886 }
1887 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1888 if (pending_inc.new_pools.count(poolid) == 0) {
1889 pending_inc.new_pools[poolid] = pi;
1890 }
1891 dout(10) << __func__ << " switching pool " << poolid
1892 << " cachemode from readforward -> readproxy" << dendl;
1893 pending_inc.new_pools[poolid].cache_mode =
1894 pg_pool_t::CACHEMODE_READPROXY;
1895 }
1896 }
1897
1898 // clear removed_snaps for every pool
1899 for (auto& [poolid, pi] : tmp.pools) {
1900 if (pi.removed_snaps.empty()) {
1901 continue;
1902 }
1903 if (pending_inc.new_pools.count(poolid) == 0) {
1904 pending_inc.new_pools[poolid] = pi;
1905 }
1906 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1907 << dendl;
1908 pending_inc.new_pools[poolid].removed_snaps.clear();
1909 }
1910
1911 // create a combined purged snap epoch key for all purged snaps
1912 // prior to this epoch, and store it in the current epoch (i.e.,
1913 // the last pre-octopus epoch, just prior to the one we're
1914 // encoding now).
1915 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1916 it->lower_bound("purged_snap_");
1917 map<int64_t,snap_interval_set_t> combined;
1918 while (it->valid()) {
1919 if (it->key().find("purged_snap_") != 0) {
1920 break;
1921 }
1922 string k = it->key();
1923 long long unsigned pool;
1924 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1925 if (n != 1) {
1926 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1927 } else {
1928 bufferlist v = it->value();
1929 auto p = v.cbegin();
1930 snapid_t begin, end;
1931 ceph::decode(begin, p);
1932 ceph::decode(end, p);
1933 combined[pool].insert(begin, end - begin);
1934 }
1935 it->next();
1936 }
1937 if (!combined.empty()) {
1938 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1939 bufferlist v;
1940 ceph::encode(combined, v);
1941 t->put(OSD_SNAP_PREFIX, k, v);
1942 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1943 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1944 << dendl;
1945 } else {
1946 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1947 << dendl;
1948 }
1949
1950 // clean out the old removed_snap_ and removed_epoch keys
1951 // ('`' is ASCII '_' + 1)
1952 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1953 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1954 }
1955 }
1956
1957 // tell me about it
1958 for (auto i = pending_inc.new_state.begin();
1959 i != pending_inc.new_state.end();
1960 ++i) {
1961 int s = i->second ? i->second : CEPH_OSD_UP;
1962 if (s & CEPH_OSD_UP) {
1963 dout(2) << " osd." << i->first << " DOWN" << dendl;
1964 // Reset laggy parameters if failure interval exceeds a threshold.
1965 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1966 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1967 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1968 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1969 set_default_laggy_params(i->first);
1970 }
1971 }
1972 }
1973 if (s & CEPH_OSD_EXISTS)
1974 dout(2) << " osd." << i->first << " DNE" << dendl;
1975 }
1976 for (auto i = pending_inc.new_up_client.begin();
1977 i != pending_inc.new_up_client.end();
1978 ++i) {
1979 //FIXME: insert cluster addresses too
1980 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1981 }
1982 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1983 i != pending_inc.new_weight.end();
1984 ++i) {
1985 if (i->second == CEPH_OSD_OUT) {
1986 dout(2) << " osd." << i->first << " OUT" << dendl;
1987 } else if (i->second == CEPH_OSD_IN) {
1988 dout(2) << " osd." << i->first << " IN" << dendl;
1989 } else {
1990 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1991 }
1992 }
1993
1994 // features for osdmap and its incremental
1995 uint64_t features;
1996
1997 // encode full map and determine its crc
1998 OSDMap tmp;
1999 {
2000 tmp.deepish_copy_from(osdmap);
2001 tmp.apply_incremental(pending_inc);
2002
2003 // determine appropriate features
2004 features = tmp.get_encoding_features();
2005 dout(10) << __func__ << " encoding full map with "
2006 << tmp.require_osd_release
2007 << " features " << features << dendl;
2008
2009 // the features should be a subset of the mon quorum's features!
2010 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2011
2012 bufferlist fullbl;
2013 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2014 pending_inc.full_crc = tmp.get_crc();
2015
2016 // include full map in the txn. note that old monitors will
2017 // overwrite this. new ones will now skip the local full map
2018 // encode and reload from this.
2019 put_version_full(t, pending_inc.epoch, fullbl);
2020 }
2021
2022 // encode
2023 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2024 bufferlist bl;
2025 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2026
2027 dout(20) << " full_crc " << tmp.get_crc()
2028 << " inc_crc " << pending_inc.inc_crc << dendl;
2029
2030 /* put everything in the transaction */
2031 put_version(t, pending_inc.epoch, bl);
2032 put_last_committed(t, pending_inc.epoch);
2033
2034 // metadata, too!
2035 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2036 p != pending_metadata.end();
2037 ++p) {
2038 Metadata m;
2039 auto mp = p->second.cbegin();
2040 decode(m, mp);
2041 auto it = m.find("osd_objectstore");
2042 if (it != m.end()) {
2043 if (it->second == "filestore") {
2044 filestore_osds.insert(p->first);
2045 } else {
2046 filestore_osds.erase(p->first);
2047 }
2048 }
2049 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2050 }
2051 for (set<int>::iterator p = pending_metadata_rm.begin();
2052 p != pending_metadata_rm.end();
2053 ++p) {
2054 filestore_osds.erase(*p);
2055 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2056 }
2057 pending_metadata.clear();
2058 pending_metadata_rm.clear();
2059
2060 // purged_snaps
2061 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2062 !pending_inc.new_purged_snaps.empty()) {
2063 // all snaps purged this epoch (across all pools)
2064 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2065 bufferlist v;
2066 encode(pending_inc.new_purged_snaps, v);
2067 t->put(OSD_SNAP_PREFIX, k, v);
2068 }
2069 for (auto& i : pending_inc.new_purged_snaps) {
2070 for (auto q = i.second.begin();
2071 q != i.second.end();
2072 ++q) {
2073 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2074 pending_inc.epoch,
2075 t);
2076 }
2077 }
2078 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2079 for (auto snap : snaps) {
2080 insert_purged_snap_update(pool, snap, snap + 1,
2081 pending_inc.epoch,
2082 t);
2083 }
2084 }
2085
2086 // health
2087 health_check_map_t next;
2088 tmp.check_health(cct, &next);
2089 // OSD_FILESTORE
2090 check_for_filestore_osds(&next);
2091 encode_health(next, t);
2092 }
2093
2094 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2095 {
2096 bufferlist bl;
2097 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2098 if (r < 0)
2099 return r;
2100 try {
2101 auto p = bl.cbegin();
2102 decode(m, p);
2103 }
2104 catch (ceph::buffer::error& e) {
2105 if (err)
2106 *err << "osd." << osd << " metadata is corrupt";
2107 return -EIO;
2108 }
2109 return 0;
2110 }
2111
2112 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2113 {
2114 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2115 if (osdmap.is_up(osd)) {
2116 map<string,string> meta;
2117 load_metadata(osd, meta, nullptr);
2118 auto p = meta.find(field);
2119 if (p == meta.end()) {
2120 (*out)["unknown"]++;
2121 } else {
2122 (*out)[p->second]++;
2123 }
2124 }
2125 }
2126 }
2127
2128 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2129 {
2130 map<string,int> by_val;
2131 count_metadata(field, &by_val);
2132 f->open_object_section(field.c_str());
2133 for (auto& p : by_val) {
2134 f->dump_int(p.first.c_str(), p.second);
2135 }
2136 f->close_section();
2137 }
2138
2139 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2140 {
2141 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2142 if (osdmap.is_up(osd)) {
2143 map<string,string> meta;
2144 load_metadata(osd, meta, nullptr);
2145 auto p = meta.find("ceph_version_short");
2146 if (p == meta.end()) continue;
2147 versions[p->second].push_back(string("osd.") + stringify(osd));
2148 }
2149 }
2150 }
2151
2152 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2153 {
2154 map<string, string> metadata;
2155 int r = load_metadata(osd, metadata, nullptr);
2156 if (r < 0)
2157 return r;
2158
2159 auto it = metadata.find("osd_objectstore");
2160 if (it == metadata.end())
2161 return -ENOENT;
2162 *type = it->second;
2163 return 0;
2164 }
2165
2166 void OSDMonitor::get_filestore_osd_list()
2167 {
2168 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2169 string objectstore_type;
2170 int r = get_osd_objectstore_type(osd, &objectstore_type);
2171 if (r == 0 && objectstore_type == "filestore") {
2172 filestore_osds.insert(osd);
2173 }
2174 }
2175 }
2176
2177 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2178 {
2179 if (g_conf()->mon_warn_on_filestore_osds &&
2180 filestore_osds.size() > 0) {
2181 ostringstream ss, deprecated_tip;
2182 list<string> detail;
2183 ss << filestore_osds.size()
2184 << " osd(s) "
2185 << (filestore_osds.size() == 1 ? "is" : "are")
2186 << " running Filestore";
2187 deprecated_tip << ss.str();
2188 ss << " [Deprecated]";
2189 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2190 filestore_osds.size());
2191 deprecated_tip << ", which has been deprecated and"
2192 << " not been optimized for QoS"
2193 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2194 detail.push_back(deprecated_tip.str());
2195 d.detail.swap(detail);
2196 }
2197 }
2198
2199 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2200 const pg_pool_t &pool,
2201 ostream *err)
2202 {
2203 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2204 // since filestore osds could always join the pool later
2205 set<int> checked_osds;
2206 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2207 vector<int> up, acting;
2208 pg_t pgid(ps, pool_id);
2209 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2210 for (int osd : up) {
2211 if (checked_osds.find(osd) != checked_osds.end())
2212 continue;
2213 string objectstore_type;
2214 int r = get_osd_objectstore_type(osd, &objectstore_type);
2215 // allow with missing metadata, e.g. due to an osd never booting yet
2216 if (r < 0 || objectstore_type == "bluestore") {
2217 checked_osds.insert(osd);
2218 continue;
2219 }
2220 *err << "osd." << osd << " uses " << objectstore_type;
2221 return false;
2222 }
2223 }
2224 return true;
2225 }
2226
2227 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2228 {
2229 map<string,string> m;
2230 if (int r = load_metadata(osd, m, err))
2231 return r;
2232 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2233 f->dump_string(p->first.c_str(), p->second);
2234 return 0;
2235 }
2236
2237 void OSDMonitor::print_nodes(Formatter *f)
2238 {
2239 // group OSDs by their hosts
2240 map<string, list<int> > osds; // hostname => osd
2241 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2242 map<string, string> m;
2243 if (load_metadata(osd, m, NULL)) {
2244 continue;
2245 }
2246 map<string, string>::iterator hostname = m.find("hostname");
2247 if (hostname == m.end()) {
2248 // not likely though
2249 continue;
2250 }
2251 osds[hostname->second].push_back(osd);
2252 }
2253
2254 dump_services(f, osds, "osd");
2255 }
2256
void OSDMonitor::share_map_with_random_osd()
{
  // Push the freshly committed map to one randomly chosen up osd; it will
  // gossip the update onward to its peers.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon.get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2283
version_t OSDMonitor::get_trim_to() const
{
  // Return the osdmap version we may safely trim up to, or 0 when
  // trimming is currently not allowed.
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    // pgs still being created may reference arbitrarily old maps; do not
    // trim anything while creations are pending.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
            << " ('mon_debug_block_osdmap_trim' set to 'true')"
            << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // floor: the oldest epoch any pg was last clean in...
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // ...unless an operator explicitly forces a trim point.
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
               << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs maps around.
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only report a trim point if it actually advances past what is
    // already trimmed.
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2331
2332 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2333 {
2334 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2335 // also scan osd epochs
2336 // don't trim past the oldest reported osd epoch
2337 for (auto [osd, epoch] : osd_epochs) {
2338 if (epoch < floor) {
2339 floor = epoch;
2340 }
2341 }
2342 return floor;
2343 }
2344
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
				   version_t first)
{
  // When trimming, store a full copy of the first surviving epoch so the
  // store always holds at least one full map to rebuild later epochs from.
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if this trim passed pinned maps, update the prune manifest to match.
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2358
2359
2360 /* full osdmap prune
2361 *
2362 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2363 */
2364
void OSDMonitor::load_osdmap_manifest()
{
  // Synchronize the in-memory prune manifest with the mon store: drop it
  // if the store no longer has one, otherwise (re)load it.
  bool store_has_manifest =
    mon.store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;
    }
    // the store's manifest went away (e.g., pruning completed and it was
    // erased); forget our stale in-memory copy too.
    dout(20) << __func__
	     << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
	   << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // existence was verified just above, so a read failure here is fatal.
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
	   << osdmap_manifest.get_first_pinned()
	   << " .. "
	   << osdmap_manifest.get_last_pinned()
	   << ")"
	   << dendl;
}
2401
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-map prune pass is warranted, based on the span
  // of committed versions, the prune tunables, and how far any previous
  // pass already pinned.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // never prune into the most recent min_osdmap_epochs maps.
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous pass already pinned up to (or past) the prune ceiling.
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one whole prune interval fits before the ceiling.
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2461
void OSDMonitor::_prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first)
{
  // A trim removed all maps below `first`; bring the manifest in line by
  // pinning `first` and dropping every pin below it, preserving the
  // invariant first_pinned == first_committed.
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase all pins strictly below `first`.
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2495
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed `manifest` with the first pin for a new prune pass: the first
  // committed version on a fresh prune, or the last pin where a previous
  // pass left off.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2530
2531 bool OSDMonitor::_prune_sanitize_options() const
2532 {
2533 uint64_t prune_interval =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2535 uint64_t prune_min =
2536 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2537 uint64_t txsize =
2538 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2539
2540 bool r = true;
2541
2542 if (prune_interval == 0) {
2543 derr << __func__
2544 << " prune is enabled BUT prune interval is zero; abort."
2545 << dendl;
2546 r = false;
2547 } else if (prune_interval == 1) {
2548 derr << __func__
2549 << " prune interval is equal to one, which essentially means"
2550 " no pruning; abort."
2551 << dendl;
2552 r = false;
2553 }
2554 if (prune_min == 0) {
2555 derr << __func__
2556 << " prune is enabled BUT prune min is zero; abort."
2557 << dendl;
2558 r = false;
2559 }
2560 if (prune_interval > prune_min) {
2561 derr << __func__
2562 << " impossible to ascertain proper prune interval because"
2563 << " it is greater than the minimum prune epochs"
2564 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2565 << dendl;
2566 r = false;
2567 }
2568
2569 if (txsize < prune_interval - 1) {
2570 derr << __func__
2571 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2572 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2573 << "); abort." << dendl;
2574 r = false;
2575 }
2576 return r;
2577 }
2578
// True when the operator has enabled full-osdmap pruning via
// 'mon_osdmap_full_prune_enabled'.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2582
// True when the monitor quorum requires the OSDMAP_PRUNE feature, i.e.
// all monitors understand the prune manifest.
bool OSDMonitor::is_prune_supported() const {
  return mon.get_required_mon_features().contains_any(
      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2587
2588 /** do_prune
2589 *
2590 * @returns true if has side-effects; false otherwise.
2591 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  // Prune full osdmaps from the store (see doc/dev/mon-osdmap-prune.rst),
  // pinning one map per prune_interval and erasing the full maps between
  // pins.  Returns true iff the transaction was modified.
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  // prune one whole interval per iteration until the txsize budget is
  // exhausted or we reach last_to_pin.
  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins.
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval fits.
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2716
2717
2718 // -------------
2719
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // Read-side dispatch.  Returns true when the message was fully handled
  // here without a map change; false to forward the op to prepare_update()
  // for a paxos round.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing.
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // message types are filtered before reaching this service; anything
    // else indicates a dispatch bug.
    ceph_abort();
    return true;
  }
}
2775
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Write-side dispatch for ops that preprocess_query() decided need a
  // map update.  Returns true if pending_inc was (possibly) modified and
  // should be proposed.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing.
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // only types routed here by preprocess_query() are expected.
    ceph_abort();
  }

  return false;
}
2827
bool OSDMonitor::should_propose(double& delay)
{
  // Decide whether pending changes warrant an immediate paxos proposal
  // (delay may be set to 0) or the default PaxosService pacing.
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately!  any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?
  // NOTE(review): this applies the accumulated reweights to pending_inc as
  // a side effect of a "should we propose" query; callers rely on it.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  return PaxosService::should_propose(delay);
}
2848
2849
2850
2851 // ---------------------------
2852 // READs
2853
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an explicit osdmap fetch with the requested ranges of full and
  // incremental maps, bounded by per-message count and byte budgets.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode for the peer's feature set; fall back to quorum features for
  // anonymous connections.
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first; both loops draw from the same shared budgets, so a
  // large full-map range can crowd out incrementals.
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->cluster_osdmap_trim_lower_bound = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2890
2891
2892 // ---------------------------
2893 // UPDATEs
2894
2895 // failure --
2896
// Vet the sender of an osd-originated message.  Returns true when the
// message should be DROPPED: the sender lacks MON_CAP_X on "osd", or the
// fsid does not match ours.  (No session means the connection is already
// gone; nothing useful to do either.)
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
	    << session->caps << dendl;
    return true;
  }
  if (fsid != mon.monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
	    << " != " << mon.monmap->fsid << dendl;
    return true;
  }
  return false;
}
2914
2915
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Vet a failure report.  Returns true (and acks nothing, possibly
  // sending map updates) when the report is stale, duplicate, or invalid;
  // returns false only for a new, actionable report so prepare_failure()
  // can record it.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, has stale addrs, or is itself marked down;
      // send it newer maps so it can catch up.
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to a previous instance of this osd id.
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down(badboy) half of this check is redundant —
  // the "weird?" branch above already handled it; kept as-is.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2987
// Completion context that acknowledges a MOSDMarkMeDown request once the
// mark-down (or its preprocessing) has finished.  On success it echoes a
// MOSDMarkMeDown back to the requester as the ack; on -EAGAIN it re-queues
// the op for another dispatch pass; any other result is a logic error.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      auto m = op->get_req<MOSDMarkMeDown>();
      // reply with a copy of the request; request_ack=false so the osd
      // does not try to ack our ack.
      osdmon->mon.send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
3016
// Read-only path for an osd asking to mark ITSELF down.  Returns true when
// the request is handled here (invalid sender, osd already down, nodown
// set); returns false to proceed to prepare_mark_me_down().  If the osd
// asked for an ack we still send one on the reject path so it can shut
// down without waiting forever.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must exist, still be up, and match the map's addrs
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // ack immediately (r=0) even though no map change happened
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
3055
// Write path for a self-requested mark-down: queue the state change in
// pending_inc and, if requested, ack the osd after the proposal commits.
// preprocess_mark_me_down() already validated the sender, hence the asserts.
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
  // new_state is an xor mask: CEPH_OSD_UP here flips the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->down_and_dead) {
    // record the dead epoch in the osd's extended info as well
    if (!pending_inc.new_xinfo.count(target_osd)) {
      pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
    }
    pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  }
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
3077
// Read-only path for an osd declaring ITSELF dead (beyond down).  Returns
// true when handled/dropped here; false to continue to
// prepare_mark_me_dead().  Note the osd must already be down in the map.
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon.no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon.no_reply(op);
    return true;
  }

  // only a known, already-down osd can be marked dead; otherwise update
  // the sender with newer maps and drop the request.
  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    dout(5) << __func__ << " from nonexistent or up osd." << from
            << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    mon.no_reply(op);
    return true;
  }

  return false;
}
3107
// Write path for a self-declared dead osd: record dead_epoch in the osd's
// xinfo via pending_inc and acknowledge (by dropping the reply) once the
// proposal commits.
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees the osd is down already
  ceph_assert(osdmap.is_down(target_osd));

  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
                   << m->get_epoch();
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  // no explicit reply is sent on success; the osd is typically gone
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
        if (r >= 0) {
          mon.no_reply(op); // ignore on success
        }
      }
      ));
  return true;
}
3133
3134 bool OSDMonitor::can_mark_down(int i)
3135 {
3136 if (osdmap.is_nodown(i)) {
3137 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3138 << "will not mark it down" << dendl;
3139 return false;
3140 }
3141
3142 int num_osds = osdmap.get_num_osds();
3143 if (num_osds == 0) {
3144 dout(5) << __func__ << " no osds" << dendl;
3145 return false;
3146 }
3147 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3148 float up_ratio = (float)up / (float)num_osds;
3149 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3150 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3151 << g_conf()->mon_osd_min_up_ratio
3152 << ", will not mark osd." << i << " down" << dendl;
3153 return false;
3154 }
3155 return true;
3156 }
3157
3158 bool OSDMonitor::can_mark_up(int i)
3159 {
3160 if (osdmap.is_noup(i)) {
3161 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3162 << "will not mark it up" << dendl;
3163 return false;
3164 }
3165
3166 return true;
3167 }
3168
3169 /**
3170 * @note the parameter @p i apparently only exists here so we can output the
3171 * osd's id on messages.
3172 */
3173 bool OSDMonitor::can_mark_out(int i)
3174 {
3175 if (osdmap.is_noout(i)) {
3176 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3177 << "will not mark it out" << dendl;
3178 return false;
3179 }
3180
3181 int num_osds = osdmap.get_num_osds();
3182 if (num_osds == 0) {
3183 dout(5) << __func__ << " no osds" << dendl;
3184 return false;
3185 }
3186 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3187 float in_ratio = (float)in / (float)num_osds;
3188 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3189 if (i >= 0)
3190 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osd." << i << " out" << dendl;
3193 else
3194 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3195 << g_conf()->mon_osd_min_in_ratio
3196 << ", will not mark osds out" << dendl;
3197 return false;
3198 }
3199
3200 return true;
3201 }
3202
3203 bool OSDMonitor::can_mark_in(int i)
3204 {
3205 if (osdmap.is_noin(i)) {
3206 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3207 << "will not mark it in" << dendl;
3208 return false;
3209 }
3210
3211 return true;
3212 }
3213
// Sweep all outstanding failure reports: queue a mark-down for any osd
// that now has enough reporters past its grace period, and drop records
// that have gone stale without resolution.  Returns true if at least one
// failure was queued in pending_inc (i.e. a proposal is warranted).
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
        check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // reporters never cancelled and the osd never got marked down;
      // forget the record rather than keep it forever
      dout(10) << " dropping stale failure_info for osd." << target_osd
               << " from " << fi.reporters.size() << " reporters"
               << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3235
3236 utime_t OSDMonitor::get_grace_time(utime_t now,
3237 int target_osd,
3238 failure_info_t& fi) const
3239 {
3240 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3241 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3242 return orig_grace;
3243 }
3244 utime_t grace = orig_grace;
3245 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3246 double decay_k = ::log(.5) / halflife;
3247
3248 // scale grace period based on historical probability of 'lagginess'
3249 // (false positive failures due to slowness).
3250 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3251 const utime_t failed_for = now - fi.get_failed_since();
3252 double decay = exp((double)failed_for * decay_k);
3253 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3254 << " failed_for " << failed_for << " decay " << decay << dendl;
3255 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3256 grace += my_grace;
3257
3258 // consider the peers reporting a failure a proxy for a potential
3259 // 'subcluster' over the overall cluster that is similarly
3260 // laggy. this is clearly not true in all cases, but will sometimes
3261 // help us localize the grace correction to a subset of the system
3262 // (say, a rack with a bad switch) that is unhappy.
3263 double peer_grace = 0;
3264 for (auto& [reporter, report] : fi.reporters) {
3265 if (osdmap.exists(reporter)) {
3266 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3267 utime_t elapsed = now - xi.down_stamp;
3268 double decay = exp((double)elapsed * decay_k);
3269 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3270 }
3271 }
3272 peer_grace /= (double)fi.reporters.size();
3273 grace += peer_grace;
3274 dout(10) << " osd." << target_osd << " has "
3275 << fi.reporters.size() << " reporters, "
3276 << grace << " grace (" << orig_grace << " + " << my_grace
3277 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3278 << dendl;
3279
3280 return grace;
3281 }
3282
3283 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3284 {
3285 // already pending failure?
3286 if (pending_inc.new_state.count(target_osd) &&
3287 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3288 dout(10) << " already pending failure" << dendl;
3289 return true;
3290 }
3291
3292 set<string> reporters_by_subtree;
3293 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3294 ceph_assert(fi.reporters.size());
3295 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3296 // get the parent bucket whose type matches with "reporter_subtree_level".
3297 // fall back to OSD if the level doesn't exist.
3298 if (osdmap.exists(p->first)) {
3299 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3300 if (auto iter = reporter_loc.find(reporter_subtree_level);
3301 iter == reporter_loc.end()) {
3302 reporters_by_subtree.insert("osd." + to_string(p->first));
3303 } else {
3304 reporters_by_subtree.insert(iter->second);
3305 }
3306 ++p;
3307 } else {
3308 fi.cancel_report(p->first);;
3309 p = fi.reporters.erase(p);
3310 }
3311 }
3312 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3313 return false;
3314 }
3315 const utime_t failed_for = now - fi.get_failed_since();
3316 const utime_t grace = get_grace_time(now, target_osd, fi);
3317 if (failed_for >= grace) {
3318 dout(1) << " we have enough reporters to mark osd." << target_osd
3319 << " down" << dendl;
3320 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3321
3322 mon.clog->info() << "osd." << target_osd << " failed ("
3323 << osdmap.crush->get_full_location_ordered_string(
3324 target_osd)
3325 << ") ("
3326 << (int)reporters_by_subtree.size()
3327 << " reporters from different "
3328 << reporter_subtree_level << " after "
3329 << failed_for << " >= grace " << grace << ")";
3330 return true;
3331 }
3332 return false;
3333 }
3334
3335 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3336 {
3337 // if it takes too long to either cancel the report to mark the osd down,
3338 // some reporters must have failed to cancel their reports. let's just
3339 // forget these reports.
3340 const utime_t failed_for = now - fi.get_failed_since();
3341 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3342 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3343 return failed_for >= (heartbeat_grace + heartbeat_stale);
3344 }
3345
// Immediately queue osd.target_osd to be marked down (and dead) in
// pending_inc, bypassing the reporter/grace machinery.  Used for
// "immediate" failure reports (e.g. connection refused observed by osd.by).
void OSDMonitor::force_failure(int target_osd, int by)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // new_state is an xor mask: CEPH_OSD_UP flips the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // also record it as dead as of the pending epoch
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon.clog->info() << "osd." << target_osd << " failed ("
                   << osdmap.crush->get_full_location_ordered_string(target_osd)
                   << ") (connection refused reported by osd." << by << ")";
  return;
}
3367
// Write path for a failure report that passed preprocess_failure().
// A positive report either forces the failure (immediate) or is added to
// failure_info and re-evaluated; a negative report (if_osd_failed()==false)
// retracts the sender's earlier report.  Returns true when pending_inc was
// changed and a proposal should follow.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already verified target is up with matching addrs
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    // back-date the failure start by how long the reporter says it lasted
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon.clog->debug() << "osd." << m->get_target_osd()
                        << " reported immediately failed by "
                        << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    // propose only if this report tipped the osd over the threshold
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
                      << " failure report canceled by "
                      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3427
3428 void OSDMonitor::process_failures()
3429 {
3430 map<int,failure_info_t>::iterator p = failure_info.begin();
3431 while (p != failure_info.end()) {
3432 if (osdmap.is_up(p->first)) {
3433 ++p;
3434 } else {
3435 dout(10) << "process_failures osd." << p->first << dendl;
3436 list<MonOpRequestRef> ls;
3437 p->second.take_report_messages(ls);
3438 failure_info.erase(p++);
3439
3440 while (!ls.empty()) {
3441 MonOpRequestRef o = ls.front();
3442 if (o) {
3443 o->mark_event(__func__);
3444 MOSDFailure *m = o->get_req<MOSDFailure>();
3445 send_latest(o, m->get_epoch());
3446 mon.no_reply(o);
3447 }
3448 ls.pop_front();
3449 }
3450 }
3451 }
3452 }
3453
3454 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3455 {
3456 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3457
3458 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3459 p != failure_info.end();
3460 ++p) {
3461 p->second.take_report_messages(ls);
3462 }
3463 failure_info.clear();
3464 }
3465
3466 int OSDMonitor::get_grace_interval_threshold()
3467 {
3468 int halflife = g_conf()->mon_osd_laggy_halflife;
3469 // Scale the halflife period (default: 1_hr) by
3470 // a factor (48) to calculate the threshold.
3471 int grace_threshold_factor = 48;
3472 return halflife * grace_threshold_factor;
3473 }
3474
3475 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3476 {
3477 int grace_interval_threshold_secs = get_grace_interval_threshold();
3478 if (last_failed_interval > grace_interval_threshold_secs) {
3479 dout(1) << " last_failed_interval " << last_failed_interval
3480 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3481 << dendl;
3482 return true;
3483 }
3484 return false;
3485 }
3486
3487 void OSDMonitor::set_default_laggy_params(int target_osd)
3488 {
3489 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3490 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3491 }
3492 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3493 xi.down_stamp = pending_inc.modified;
3494 xi.laggy_probability = 0.0;
3495 xi.laggy_interval = 0;
3496 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3497 }
3498
3499
3500 // boot --
3501
// Read-only admission checks for an osd boot request.  Returns true when
// the request is fully handled here (rejected, duplicate, or answered with
// maps); returns false to continue to prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // osd must belong to this cluster
  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_PACIFIC)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because the osd lacks CEPH_FEATURE_SERVER_PACIFIC";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < octopus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_REEF) &&
      osdmap.require_osd_release < ceph_release_t::pacific) {
    mon.clog->info() << "disallowing boot of reef+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < pacific";
    goto ignore;
  }

  // See crimson/osd/osd.cc: OSD::_send_boot
  if (auto type_iter = m->metadata.find("osd_type");
      type_iter != m->metadata.end()) {
    const auto &otype = type_iter->second;
    // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
    if (otype == "crimson") {
      // crimson osds require the explicit allow_crimson opt-in flag
      if (!osdmap.get_allow_crimson()) {
        mon.clog->info()
          << "Disallowing boot of crimson-osd without allow_crimson "
          << "OSDMap flag.  Run ceph osd set_allow_crimson to set "
          << "allow_crimson flag.  Note that crimson-osd is "
          << "considered unstable and may result in crashes or "
          << "data loss.  Its usage should be restricted to "
          << "testing and development.";
        goto ignore;
      }
    } else {
      derr << __func__ << ": osd " << m->get_orig_source_inst()
           << " sent non-crimson osd_type field in MOSDBoot: "
           << otype
           << " -- booting anyway"
           << dendl;
    }
  }

  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);   // duplicate boot: reply without logging
    return true;
  }

  // same id but a different uuid means a different physical osd is
  // squatting on this id; refuse it
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message from before the osd's last up_from, from the same
  // instance: stale retransmit, just send maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;   // proceed to prepare_boot()

 ignore:
  return true;
}
3632
// Write path for an osd boot: queue the up state, addrs, metadata, clean
// intervals, laggy statistics and (possibly) weight for the booting osd in
// pending_inc, then reply after the proposal commits.  If the osd is still
// marked up from a previous instance it is first marked down and the boot
// retried on the next map.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
          << " sb " << m->sb
          << " client_addrs" << m->get_connection()->get_peer_addrs()
          << " cluster_addrs " << m->cluster_addrs
          << " hb_back_addrs " << m->hb_back_addrs
          << " hb_front_addrs " << m->hb_front_addrs
          << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
            << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective current state = committed state xor any pending toggles
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
            << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
                  m->get_orig_source_addrs()) ||
                !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the down is committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
            << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
             << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        // an osd with no maps has no data; treat previous instance as lost
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end
               << ") -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] =
        pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: boot_epoch == 0 means a clean first boot
    // (decay lagginess), otherwise this is a re-boot after being marked
    // down (reinforce lagginess based on how long it was down).
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
          xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
            (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval =  g_conf()->mon_osd_laggy_max_interval;
        }
        // exponentially-weighted moving average of the down interval
        xi.laggy_interval =
          interval * g_conf()->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf()->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
         (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        if (xi.old_weight > 0) {
          // restore the weight it had before being auto-marked out
          pending_inc.new_weight[from] = xi.old_weight;
          xi.old_weight = 0;
        } else {
          pending_inc.new_weight[from] = CEPH_OSD_IN;
        }
      } else {
        dout(7) << __func__ << " NOIN set, will not mark in "
                << m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3787
// Finish a boot: optionally log it to the cluster log (logit=false for a
// duplicate boot) and send the osd all maps newer than what it has.
void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << "_booted " << m->get_orig_source_inst()
          << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit) {
    mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
                     << " boot";
  }

  send_latest(op, m->sb.current_epoch+1);
}
3802
3803
3804 // -------------
3805 // full
3806
// Read-only path for an osd reporting its fullness flags (nearfull /
// backfillfull / full).  Returns true when handled here (bad sender, or
// the map already reflects the requested state); false to proceed to
// prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these three bits may be toggled through MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // drop messages from a previous (down) instance, or whose addrs do not
  // match the current up instance
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
         m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // requested state already in the map? just answer with the map.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
            << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
           << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3857
// Write path for a fullness update: compute the effective current
// full-state bits (committed xor pending) and queue the xor needed to
// reach the requested state.  Replies with the map once committed.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective state = committed state xor pending toggles, full bits only
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any previously-pending full-bit toggles before re-adding
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // xor from the COMMITTED state to the wanted state
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3895
3896 // -------------
3897 // alive
3898
// Read-only path for an osd's up_thru request (MOSDAlive).  Returns true
// when handled here (bad sender, or up_thru already satisfied); false to
// proceed to prepare_alive().
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be the current up instance of that osd id
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // up_thru already at least what the osd wants? just reply with the map.
  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
           << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3937
3938 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3939 {
3940 op->mark_osdmon_event(__func__);
3941 auto m = op->get_req<MOSDAlive>();
3942 int from = m->get_orig_source().num();
3943
3944 if (0) { // we probably don't care much about these
3945 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3946 }
3947
3948 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3949 << " from " << m->get_orig_source_inst() << dendl;
3950
3951 update_up_thru(from, m->version); // set to the latest map the OSD has
3952 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3953 return true;
3954 }
3955
// Reply to `op` with every map from epoch `e` onward (or the latest full map
// when e == 0; see send_latest).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3964
3965 // pg_created
// Validate the session/caps of an MOSDPGCreated and forward it to the leader.
// No reply is ever sent to the osd for this message (mon.no_reply).
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  mon.no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3985
3986 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3987 {
3988 op->mark_osdmon_event(__func__);
3989 auto m = op->get_req<MOSDPGCreated>();
3990 dout(10) << __func__ << " " << *m << dendl;
3991 auto src = m->get_orig_source();
3992 auto from = src.num();
3993 if (!src.is_osd() ||
3994 !mon.osdmon()->osdmap.is_up(from) ||
3995 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3996 m->get_orig_source_addrs())) {
3997 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3998 return false;
3999 }
4000 pending_created_pgs.push_back(m->pgid);
4001 return true;
4002 }
4003
// Sanity-check an MOSDPGReadyToMerge against the committed map.  Returns true
// (no reply) when the message can be ignored; false to forward to
// prepare_pg_ready_to_merge.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // declared up here because the gotos below may not jump over an initializer
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge source must be exactly the last pg (ps == pg_num - 1)...
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // ...and a merge involving it must actually be pending
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon.no_reply(op);
  return true;
}
4043
// Commit a pg merge (m->ready) or cancel a pending merge (!m->ready) on the
// pool, working against the pending pool value if one is already queued.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    // pg_num[_pending] moved since preprocess validated this message; retry
    // once the in-flight proposal has committed.
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes from a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: randomly bounce pg_num back up to exercise merge undo
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon.monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
4101
4102
4103 // -------------
4104 // pg_temp changes
4105
// Filter an MOSDPGTemp: drop stale/unauthorized senders, skip entries for
// removed pools or pgs whose primary has changed, and only forward to
// prepare_pgtemp (return false) when at least one entry would actually
// change pg_temp/primary_temp state.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be the currently-up instance of this osd id
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced pg_temp bypasses all no-op filtering below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    // an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every surviving entry was a no-op; just reply with the current map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon.no_reply(op);
  return true;
}
4199
4200 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4201 {
4202 epoch_t old_up_thru = osdmap.get_up_thru(from);
4203 auto ut = pending_inc.new_up_thru.find(from);
4204 if (ut != pending_inc.new_up_thru.end()) {
4205 old_up_thru = ut->second;
4206 }
4207 if (up_thru > old_up_thru) {
4208 // set up_thru too, so the osd doesn't have to ask again
4209 pending_inc.new_up_thru[from] = up_thru;
4210 }
4211 }
4212
// Queue the osd's pg_temp mappings into the pending incremental, skipping
// entries for pools that are gone or pending removal, and clear any
// primary_temp for the affected pgs.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGTemp>();
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
4248
4249
4250 // ---
4251
// Fast path for MRemoveSnaps: if every listed snap is already removed (or its
// pool is gone), ack octopus+ senders directly and consume the message.
// Returns false to forward to prepare_remove_snaps when any snap still needs
// to be queued for removal.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon.no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
	       << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // any snap that is newer than snap_seq or not yet removed means we
      // actually have work to do -> go to prepare_remove_snaps
      if (*p > pi->get_snap_seq() ||
	  !_is_removed_snap(q->first, *p)) {
	return false;
      }
    }
  }

  // everything was already removed; octopus+ senders expect an explicit ack
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon.send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4301
// Queue the listed snaps for removal in the pending incremental: update the
// pool's removed_snaps (pre-octopus only), snap_seq/snap_epoch, and
// new_removed_snaps, then ack octopus+ senders after the proposal commits.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
	       << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map or already queued in
      // the pending pool / pending new_removed_snaps
      if (!_is_removed_snap(pool, s) &&
	  (!pending_inc.new_pools.count(pool) ||
	   !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
	  (!pending_inc.new_removed_snaps.count(pool) ||
	   !pending_inc.new_removed_snaps[pool].contains(s))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
	if (osdmap.require_osd_release < ceph_release_t::octopus) {
	  // pre-octopus keeps the full removed_snaps interval set in the pool
	  newpi->removed_snaps.insert(s);
	  dout(10) << " pool " << pool << " removed_snaps added " << s
		   << " (now " << newpi->removed_snaps << ")" << dendl;
	}
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	if (s > newpi->get_snap_seq()) {
	  dout(10) << " pool " << pool << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << s << dendl;
	  newpi->set_snap_seq(s);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	dout(10) << " added pool " << pool << " snap " << s
		 << " to removed_snaps queue" << dendl;
	pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ senders expect an explicit ack once the removal has committed
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4350
// Answer an MMonGetPurgedSnaps by scanning the purged_epoch_* keys in the
// OSD_SNAP_PREFIX kv range for epochs in (m->start, m->last] and replying
// with the decoded per-pool purged snap intervals.
bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetPurgedSnaps>();
  dout(7) << __func__ << " " << *m << dendl;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;

  string k = make_purged_snap_epoch_key(m->start);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->upper_bound(k);
  // `epoch` must be unsigned long to match the %lx conversion below
  unsigned long epoch = m->last;
  while (it->valid()) {
    if (it->key().find("purged_epoch_") != 0) {
      break;
    }
    string k = it->key();  // shadows the outer key on purpose
    int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
    if (n != 1) {
      derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
    } else if (epoch > m->last) {
      break;
    } else {
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      auto &v = r[epoch];
      try {
	ceph::decode(v, p);
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " unable to parse value for key '" << it->key()
	     << "': \n";
	bl.hexdump(*_dout);
	*_dout << dendl;
      }
      // reuse n (the sscanf result, 1 here) as a rough size estimate
      n += 4 + v.size() * 16;
    }
    // NOTE(review): n is recomputed every iteration, so this caps the size of
    // a single entry rather than the cumulative reply size -- confirm whether
    // the limit was meant to accumulate across entries.
    if (n > 1048576) {
      // impose a semi-arbitrary limit to message size
      break;
    }
    it->next();
  }

  // reply covers [m->start, last epoch actually scanned]
  auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
  reply->purged_snaps.swap(r);
  mon.send_reply(op, reply.detach());

  return true;
}
4400
4401 // osd beacon
// osd beacon: validate session/caps only; the beacon itself is always
// forwarded to the leader (return false), which tracks last_osd_report.
bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  // check caps
  auto session = op->get_session();
  mon.no_reply(op);
  if (!session) {
    dout(10) << __func__ << " no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // Always forward the beacon to the leader, even if they are the same as
  // the old one. The leader will mark as down osds that haven't sent
  // beacon for a few minutes.
  return false;
}
4422
// Leader-side beacon handling: refresh last_osd_report / osd_epochs /
// last_epoch_clean for the sender.  Returns true (needs a proposal) only when
// the beacon advances last_purged_snaps_scrub in the osd's xinfo.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // record when (and how often) this osd promises to report in
  last_osd_report[from].first = ceph_clock_now();
  last_osd_report[from].second = beacon->osd_beacon_report_interval;
  osd_epochs[from] = beacon->version;

  // fold the osd's per-pg min_last_epoch_clean into our tracker, skipping
  // pgs whose pool no longer exists
  for (const auto& pg : beacon->pgs) {
    if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
      unsigned pg_num = pool->get_pg_num();
      last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
    }
  }

  // only a newer last_purged_snaps_scrub requires a map change (new_xinfo)
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4467
4468 // ---------------
4469 // map helpers
4470
4471 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4472 {
4473 op->mark_osdmon_event(__func__);
4474 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4475 << " start " << start << dendl;
4476 if (start == 0)
4477 send_full(op);
4478 else
4479 send_incremental(op, start);
4480 }
4481
4482
4483 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4484 {
4485 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4486 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4487 r->cluster_osdmap_trim_lower_bound = get_first_committed();
4488 r->newest_map = osdmap.get_epoch();
4489 return r;
4490 }
4491
// Build an MOSDMap carrying incrementals for epochs [from, to] (encoded for
// `features`); if an incremental is missing (trimmed), fall back to the full
// map for that epoch.  Aborts if neither exists.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
  m->cluster_osdmap_trim_lower_bound = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // iterate newest-first; `e > 0` guards against epoch_t underflow
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4525
// Reply to `op` with a single message carrying the latest full osdmap,
// encoded for the requester's connection features.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon.send_reply(op, build_latest_full(op->get_session()->con_features));
}
4532
// Send incrementals starting at `first` in reply to `op`.  If the op was
// proxied through another monitor, route the request back so the proxy does
// the sending; otherwise send directly on this session.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  ceph_assert(s);

  if (s->proxy_con) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
4553
// Send maps [first .. current] to `session`, batching up to
// osd_map_message_max epochs per message.  If `first` predates our oldest
// committed map, start with a full map at the trim bound.  When `req` is set
// only one reply message is sent (the peer will re-request the rest); when
// `onetime` is set only one batch is pushed.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range has been trimmed; start from a full map at the bound
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->cluster_osdmap_trim_lower_bound = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // request/reply path: one message only, peer re-requests the rest
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // req/onetime callers get exactly one batch per call
    if (onetime || req)
      break;
  }
}
4616
// Convenience overload: fetch incremental `ver` encoded with the current
// quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon.get_quorum_con_features(), bl);
}
4621
// Re-encode an incremental map blob in place for a peer with `features`,
// also re-encoding any embedded full map or crush map it carries.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4649
4650 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4651 {
4652 OSDMap m;
4653 auto q = bl.cbegin();
4654 m.decode(q);
4655 // always encode with subset of osdmap's canonical features
4656 uint64_t f = features & m.get_encoding_features();
4657 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4658 << dendl;
4659 bl.clear();
4660 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4661 }
4662
// Fetch incremental `ver` encoded for `features`, with a feature-keyed
// cache; re-encodes from the stored blob only when the significant feature
// bits differ from the quorum's.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4684
4685 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4686 {
4687 bufferlist inc_bl;
4688 int err = get_version(ver, inc_bl);
4689 ceph_assert(err == 0);
4690 ceph_assert(inc_bl.length());
4691
4692 auto p = inc_bl.cbegin();
4693 inc.decode(p);
4694 dout(10) << __func__ << " "
4695 << " epoch " << inc.epoch
4696 << " inc_crc " << inc.inc_crc
4697 << " full_crc " << inc.full_crc
4698 << " encode_features " << inc.encode_features << dendl;
4699 return 0;
4700 }
4701
// Reconstruct the full osdmap for epoch `ver` from the osdmap manifest:
// start at the closest pinned (or cached) full map at or below `ver` and
// replay incrementals up to `ver`, encoding the result into `bl`.
// Returns -ENOENT when no pinned map at or below `ver` exists.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // prefer a cached full map newer than the pinned one to shorten the replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  // replay incrementals; remember the last incremental's encode features so
  // the final encode matches what the cluster produced
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4801
// Convenience overload: fetch the full map for `ver` encoded with the
// current quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon.get_quorum_con_features(), bl);
}
4806
// Fetch the full map for `ver` encoded for `features`, with a feature-keyed
// cache; falls back to rebuilding a trimmed map from the pinned-map manifest,
// and re-encodes only when the significant feature bits differ.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4833
4834 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4835 {
4836 dout(10) << "blocklist " << av << " until " << until << dendl;
4837 for (auto a : av.v) {
4838 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4839 a.set_type(entity_addr_t::TYPE_ANY);
4840 } else {
4841 a.set_type(entity_addr_t::TYPE_LEGACY);
4842 }
4843 pending_inc.new_blocklist[a] = until;
4844 }
4845 return pending_inc.epoch;
4846 }
4847
4848 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4849 {
4850 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4851 a.set_type(entity_addr_t::TYPE_ANY);
4852 } else {
4853 a.set_type(entity_addr_t::TYPE_LEGACY);
4854 }
4855 dout(10) << "blocklist " << a << " until " << until << dendl;
4856 pending_inc.new_blocklist[a] = until;
4857 return pending_inc.epoch;
4858 }
4859
4860
4861 void OSDMonitor::check_osdmap_subs()
4862 {
4863 dout(10) << __func__ << dendl;
4864 if (!osdmap.get_epoch()) {
4865 return;
4866 }
4867 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4868 if (osdmap_subs == mon.session_map.subs.end()) {
4869 return;
4870 }
4871 auto p = osdmap_subs->second->begin();
4872 while (!p.end()) {
4873 auto sub = *p;
4874 ++p;
4875 check_osdmap_sub(sub);
4876 }
4877 }
4878
// Satisfy one "osdmap" subscription: send incrementals from sub->next (or
// the latest full map when next == 0), then either retire a onetime sub or
// advance it past the current epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon.session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4894
4895 void OSDMonitor::check_pg_creates_subs()
4896 {
4897 if (!osdmap.get_num_up_osds()) {
4898 return;
4899 }
4900 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4901 mon.with_session_map([this](const MonSessionMap& session_map) {
4902 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4903 if (pg_creates_subs == session_map.subs.end()) {
4904 return;
4905 }
4906 for (auto sub : *pg_creates_subs->second) {
4907 check_pg_creates_sub(sub);
4908 }
4909 });
4910 }
4911
4912 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4913 {
4914 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4915 ceph_assert(sub->type == "osd_pg_creates");
4916 // only send these if the OSD is up. we will check_subs() when they do
4917 // come up so they will get the creates then.
4918 if (sub->session->name.is_osd() &&
4919 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4920 sub->next = send_pg_creates(sub->session->name.num(),
4921 sub->session->con.get(),
4922 sub->next);
4923 }
4924 }
4925
// Stage an "application enable" (and optional key/value metadata) change
// for a pool into the pending incremental.  The change takes effect when
// the pending map is proposed and committed.
void OSDMonitor::do_application_enable(int64_t pool_id,
				       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value,
				       bool force)
{
  // Caller must hold the paxos transaction plugged and we must be writeable.
  ceph_assert(paxos.is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
	   << dendl;

  // Pool application metadata only exists from luminous onward.
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // Start from the pending copy of the pool if one is already queued, so we
  // don't clobber other changes staged in this same proposal.
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // Enable the application with no metadata; no-op if already enabled.
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // Overwrite any existing value for this key.
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() will not overwrite an existing application entry, so a
      // previously-set key/value is left untouched unless 'force' is given.
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4959
4960 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4961 pool_opts_t::key_t opt,
4962 pool_opts_t::value_t val)
4963 {
4964 dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
4965 << " val: " << val << dendl;
4966 auto p = pending_inc.new_pools.try_emplace(
4967 pool_id, *osdmap.get_pg_pool(pool_id));
4968 p.first->second.opts.set(opt, val);
4969 }
4970
// Scan 'pools' for pools whose PGs still need to be created and queue them
// on 'creating_pgs'.  Pools already fully created, pools being removed,
// pools with a nonexistent crush rule, and pools unchanged since the last
// scan are skipped.  Returns the number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    int ruleno = pool.get_crush_rule();
    // a pool without a valid crush rule cannot be mapped, so don't queue it
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // skip pools that have not changed since the previous scan
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
			      created, modified);
    queued++;
  }
  return queued;
}
5009
// Rebuild creating_pgs_by_osd_epoch: for every PG still being created,
// decide which OSD (its current acting primary) should receive the create
// message and as of which epoch.  If the target OSD is unchanged from the
// previous pass, the previously noted epoch is carried over so the OSD is
// not re-sent creates it already has.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default to the epoch the pg was queued for creation
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch we noted last time
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5057
// Send pending pg-create messages for PGs whose acting primary is 'osd',
// covering epochs >= 'next'.  Returns the epoch the subscriber is now
// current through (last sent epoch + 1), or 'next' unchanged when there is
// nothing (or nothing fresh enough) to send.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  auto m = make_message<MOSDPGCreate2>(creating_pgs_epoch);

  epoch_t last = 0;
  // walk all epochs >= next and collect the pgs to create at each
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				   create->second.create_stamp));
      if (create->second.history.epoch_created) {
	// carry history/past_intervals when they were preserved (e.g. pg merge)
	dout(20) << __func__ << " " << pg << " " << create->second.history
		 << " " << create->second.past_intervals << dendl;
	m->pg_extra.emplace(pg, make_pair(create->second.history,
					  create->second.past_intervals));
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  if (!m->pgs.empty()) {
    con->send_message2(std::move(m));
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5112
5113 // TICK
5114
5115
// Periodic maintenance.  Every monitor refreshes cached state (manifest,
// cache autotuning); the leader additionally evaluates beacon timeouts and
// failure reports, auto-marks long-down OSDs out, expires blocklist
// entries, and proposes a new map epoch when anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which only the leader may do
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if their beacons have timed out
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // elapsed time since the osd went down
      ++i;  // advance before we possibly erase 'o' below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  continue;  // still within grace; keep it in down_pending_out
      }

      // osd no longer needs tracking (marked out, back up, or already out)
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }
  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring range_blocklist item " << p->first
	       << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_range_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5290
5291 void OSDMonitor::_set_new_cache_sizes()
5292 {
5293 uint64_t cache_size = 0;
5294 int64_t inc_alloc = 0;
5295 int64_t full_alloc = 0;
5296 int64_t kv_alloc = 0;
5297
5298 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5299 cache_size = pcm->get_tuned_mem();
5300 inc_alloc = inc_cache->get_committed_size();
5301 full_alloc = full_cache->get_committed_size();
5302 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5303 }
5304
5305 inc_osd_cache.set_bytes(inc_alloc);
5306 full_osd_cache.set_bytes(full_alloc);
5307
5308 dout(1) << __func__ << " cache_size:" << cache_size
5309 << " inc_alloc: " << inc_alloc
5310 << " full_alloc: " << full_alloc
5311 << " kv_alloc: " << kv_alloc
5312 << dendl;
5313 }
5314
// Check every up OSD for a missed beacon and stage a down-mark in the
// pending incremental for any that have been silent too long.
// 'last_osd_report' maps osd id -> (time of last beacon, beacon interval).
// Returns true if any OSD was newly marked down.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
	mon.clog->info() << "osd." << i << " marked down after no beacon for "
			 << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second.first
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): new_state appears to carry state bits to toggle;
	// setting CEPH_OSD_UP on an up osd marks it down when the
	// incremental is applied -- confirm against OSDMap::Incremental.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5358
5359 static void dump_cpu_list(Formatter *f, const char *name,
5360 const string& strlist)
5361 {
5362 cpu_set_t cpu_set;
5363 size_t cpu_set_size;
5364 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5365 return;
5366 }
5367 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5368 f->open_array_section(name);
5369 for (auto cpu : cpus) {
5370 f->dump_int("cpu", cpu);
5371 }
5372 f->close_section();
5373 }
5374
// Dump monitor-side osdmap state into 'f': the map itself, per-osd
// metadata, clean-epoch bookkeeping, the committed epoch range, the crush
// map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f, cct);
  f->close_section();

  // per-osd metadata for every existing osd id
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5423
namespace {
  // Keys accepted by "osd pool get"; the enum order is part of the
  // file-local protocol between the parser and the dump code below.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // Return the members of 'first' that do not appear in 'second'.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> remainder;
      for (const auto& choice : first) {
	if (!second.count(choice)) {
	  remainder.insert(choice);
	}
      }
      return remainder;
    }
}
5458
5459
5460 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5461 {
5462 op->mark_osdmon_event(__func__);
5463 auto m = op->get_req<MMonCommand>();
5464 int r = 0;
5465 bufferlist rdata;
5466 stringstream ss, ds;
5467
5468 cmdmap_t cmdmap;
5469 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5470 string rs = ss.str();
5471 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5472 return true;
5473 }
5474
5475 MonSession *session = op->get_session();
5476 if (!session) {
5477 derr << __func__ << " no session" << dendl;
5478 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5479 return true;
5480 }
5481
5482 string prefix;
5483 cmd_getval(cmdmap, "prefix", prefix);
5484
5485 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5486 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5487
5488 if (prefix == "osd stat") {
5489 if (f) {
5490 f->open_object_section("osdmap");
5491 osdmap.print_summary(f.get(), ds, "", true);
5492 f->close_section();
5493 f->flush(rdata);
5494 } else {
5495 osdmap.print_summary(nullptr, ds, "", true);
5496 rdata.append(ds);
5497 }
5498 }
5499 else if (prefix == "osd dump" ||
5500 prefix == "osd tree" ||
5501 prefix == "osd tree-from" ||
5502 prefix == "osd ls" ||
5503 prefix == "osd getmap" ||
5504 prefix == "osd getcrushmap" ||
5505 prefix == "osd ls-tree" ||
5506 prefix == "osd info") {
5507
5508 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5509 bufferlist osdmap_bl;
5510 int err = get_version_full(epoch, osdmap_bl);
5511 if (err == -ENOENT) {
5512 r = -ENOENT;
5513 ss << "there is no map for epoch " << epoch;
5514 goto reply;
5515 }
5516 ceph_assert(err == 0);
5517 ceph_assert(osdmap_bl.length());
5518
5519 OSDMap *p;
5520 if (epoch == osdmap.get_epoch()) {
5521 p = &osdmap;
5522 } else {
5523 p = new OSDMap;
5524 p->decode(osdmap_bl);
5525 }
5526
5527 auto sg = make_scope_guard([&] {
5528 if (p != &osdmap) {
5529 delete p;
5530 }
5531 });
5532
5533 if (prefix == "osd dump") {
5534 stringstream ds;
5535 if (f) {
5536 f->open_object_section("osdmap");
5537 p->dump(f.get(), cct);
5538 f->close_section();
5539 f->flush(ds);
5540 } else {
5541 p->print(cct, ds);
5542 }
5543 rdata.append(ds);
5544 if (!f)
5545 ds << " ";
5546 } else if (prefix == "osd ls") {
5547 if (f) {
5548 f->open_array_section("osds");
5549 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5550 if (osdmap.exists(i)) {
5551 f->dump_int("osd", i);
5552 }
5553 }
5554 f->close_section();
5555 f->flush(ds);
5556 } else {
5557 bool first = true;
5558 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5559 if (osdmap.exists(i)) {
5560 if (!first)
5561 ds << "\n";
5562 first = false;
5563 ds << i;
5564 }
5565 }
5566 }
5567 rdata.append(ds);
5568 } else if (prefix == "osd info") {
5569 int64_t osd_id;
5570 bool do_single_osd = true;
5571 if (!cmd_getval(cmdmap, "id", osd_id)) {
5572 do_single_osd = false;
5573 }
5574
5575 if (do_single_osd && !osdmap.exists(osd_id)) {
5576 ss << "osd." << osd_id << " does not exist";
5577 r = -EINVAL;
5578 goto reply;
5579 }
5580
5581 if (f) {
5582 if (do_single_osd) {
5583 osdmap.dump_osd(osd_id, f.get());
5584 } else {
5585 osdmap.dump_osds(f.get());
5586 }
5587 f->flush(ds);
5588 } else {
5589 if (do_single_osd) {
5590 osdmap.print_osd(osd_id, ds);
5591 } else {
5592 osdmap.print_osds(ds);
5593 }
5594 }
5595 rdata.append(ds);
5596 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5597 string bucket;
5598 if (prefix == "osd tree-from") {
5599 cmd_getval(cmdmap, "bucket", bucket);
5600 if (!osdmap.crush->name_exists(bucket)) {
5601 ss << "bucket '" << bucket << "' does not exist";
5602 r = -ENOENT;
5603 goto reply;
5604 }
5605 int id = osdmap.crush->get_item_id(bucket);
5606 if (id >= 0) {
5607 ss << "\"" << bucket << "\" is not a bucket";
5608 r = -EINVAL;
5609 goto reply;
5610 }
5611 }
5612
5613 vector<string> states;
5614 cmd_getval(cmdmap, "states", states);
5615 unsigned filter = 0;
5616 for (auto& s : states) {
5617 if (s == "up") {
5618 filter |= OSDMap::DUMP_UP;
5619 } else if (s == "down") {
5620 filter |= OSDMap::DUMP_DOWN;
5621 } else if (s == "in") {
5622 filter |= OSDMap::DUMP_IN;
5623 } else if (s == "out") {
5624 filter |= OSDMap::DUMP_OUT;
5625 } else if (s == "destroyed") {
5626 filter |= OSDMap::DUMP_DESTROYED;
5627 } else {
5628 ss << "unrecognized state '" << s << "'";
5629 r = -EINVAL;
5630 goto reply;
5631 }
5632 }
5633 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5634 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5635 ss << "cannot specify both 'in' and 'out'";
5636 r = -EINVAL;
5637 goto reply;
5638 }
5639 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5640 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5641 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5642 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5643 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5644 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5645 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5646 r = -EINVAL;
5647 goto reply;
5648 }
5649 if (f) {
5650 f->open_object_section("tree");
5651 p->print_tree(f.get(), NULL, filter, bucket);
5652 f->close_section();
5653 f->flush(ds);
5654 } else {
5655 p->print_tree(NULL, &ds, filter, bucket);
5656 }
5657 rdata.append(ds);
5658 } else if (prefix == "osd getmap") {
5659 rdata.append(osdmap_bl);
5660 ss << "got osdmap epoch " << p->get_epoch();
5661 } else if (prefix == "osd getcrushmap") {
5662 p->crush->encode(rdata, mon.get_quorum_con_features());
5663 ss << p->get_crush_version();
5664 } else if (prefix == "osd ls-tree") {
5665 string bucket_name;
5666 cmd_getval(cmdmap, "name", bucket_name);
5667 set<int> osds;
5668 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5669 if (r == -ENOENT) {
5670 ss << "\"" << bucket_name << "\" does not exist";
5671 goto reply;
5672 } else if (r < 0) {
5673 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5674 goto reply;
5675 }
5676
5677 if (f) {
5678 f->open_array_section("osds");
5679 for (auto &i : osds) {
5680 if (osdmap.exists(i)) {
5681 f->dump_int("osd", i);
5682 }
5683 }
5684 f->close_section();
5685 f->flush(ds);
5686 } else {
5687 bool first = true;
5688 for (auto &i : osds) {
5689 if (osdmap.exists(i)) {
5690 if (!first)
5691 ds << "\n";
5692 first = false;
5693 ds << i;
5694 }
5695 }
5696 }
5697
5698 rdata.append(ds);
5699 }
5700 } else if (prefix == "osd getmaxosd") {
5701 if (f) {
5702 f->open_object_section("getmaxosd");
5703 f->dump_unsigned("epoch", osdmap.get_epoch());
5704 f->dump_int("max_osd", osdmap.get_max_osd());
5705 f->close_section();
5706 f->flush(rdata);
5707 } else {
5708 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5709 rdata.append(ds);
5710 }
5711 } else if (prefix == "osd utilization") {
5712 string out;
5713 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5714 if (f)
5715 f->flush(rdata);
5716 else
5717 rdata.append(out);
5718 r = 0;
5719 goto reply;
5720 } else if (prefix == "osd find") {
5721 int64_t osd;
5722 if (!cmd_getval(cmdmap, "id", osd)) {
5723 ss << "unable to parse osd id value '"
5724 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5725 r = -EINVAL;
5726 goto reply;
5727 }
5728 if (!osdmap.exists(osd)) {
5729 ss << "osd." << osd << " does not exist";
5730 r = -ENOENT;
5731 goto reply;
5732 }
5733 string format;
5734 cmd_getval(cmdmap, "format", format);
5735 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5736 f->open_object_section("osd_location");
5737 f->dump_int("osd", osd);
5738 f->dump_object("addrs", osdmap.get_addrs(osd));
5739 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5740
5741 // try to identify host, pod/container name, etc.
5742 map<string,string> m;
5743 load_metadata(osd, m, nullptr);
5744 if (auto p = m.find("hostname"); p != m.end()) {
5745 f->dump_string("host", p->second);
5746 }
5747 for (auto& k : {
5748 "pod_name", "pod_namespace", // set by rook
5749 "container_name" // set by cephadm, ceph-ansible
5750 }) {
5751 if (auto p = m.find(k); p != m.end()) {
5752 f->dump_string(k, p->second);
5753 }
5754 }
5755
5756 // crush is helpful too
5757 f->open_object_section("crush_location");
5758 map<string,string> loc = osdmap.crush->get_full_location(osd);
5759 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5760 f->dump_string(p->first.c_str(), p->second);
5761 f->close_section();
5762 f->close_section();
5763 f->flush(rdata);
5764 } else if (prefix == "osd metadata") {
5765 int64_t osd = -1;
5766 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5767 !cmd_getval(cmdmap, "id", osd)) {
5768 ss << "unable to parse osd id value '"
5769 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5770 r = -EINVAL;
5771 goto reply;
5772 }
5773 if (osd >= 0 && !osdmap.exists(osd)) {
5774 ss << "osd." << osd << " does not exist";
5775 r = -ENOENT;
5776 goto reply;
5777 }
5778 string format;
5779 cmd_getval(cmdmap, "format", format);
5780 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5781 if (osd >= 0) {
5782 f->open_object_section("osd_metadata");
5783 f->dump_unsigned("id", osd);
5784 r = dump_osd_metadata(osd, f.get(), &ss);
5785 if (r < 0)
5786 goto reply;
5787 f->close_section();
5788 } else {
5789 r = 0;
5790 f->open_array_section("osd_metadata");
5791 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5792 if (osdmap.exists(i)) {
5793 f->open_object_section("osd");
5794 f->dump_unsigned("id", i);
5795 r = dump_osd_metadata(i, f.get(), NULL);
5796 if (r == -EINVAL || r == -ENOENT) {
5797 // Drop error, continue to get other daemons' metadata
5798 dout(4) << "No metadata for osd." << i << dendl;
5799 r = 0;
5800 } else if (r < 0) {
5801 // Unexpected error
5802 goto reply;
5803 }
5804 f->close_section();
5805 }
5806 }
5807 f->close_section();
5808 }
5809 f->flush(rdata);
5810 } else if (prefix == "osd versions") {
5811 if (!f)
5812 f.reset(Formatter::create("json-pretty"));
5813 count_metadata("ceph_version", f.get());
5814 f->flush(rdata);
5815 r = 0;
5816 } else if (prefix == "osd count-metadata") {
5817 if (!f)
5818 f.reset(Formatter::create("json-pretty"));
5819 string field;
5820 cmd_getval(cmdmap, "property", field);
5821 count_metadata(field, f.get());
5822 f->flush(rdata);
5823 r = 0;
// "osd numa-status": report per-OSD NUMA placement (network NIC node,
// objectstore node, combined affinity node, CPU list) gathered from each
// OSD's metadata.  Formatter output is an array of per-osd objects;
// plain-text output is a TextTable.
5824 } else if (prefix == "osd numa-status") {
5825 TextTable tbl;
5826 if (f) {
5827 f->open_array_section("osds");
5828 } else {
5829 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5830 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5831 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5832 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5833 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5834 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5835 }
5836 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5837 if (osdmap.exists(i)) {
5838 map<string,string> m;
5839 ostringstream err;
// OSDs whose metadata cannot be loaded are silently omitted from the
// report (the error text in 'err' is discarded).
5840 if (load_metadata(i, m, &err) < 0) {
5841 continue;
5842 }
5843 string host;
5844 auto p = m.find("hostname");
5845 if (p != m.end()) {
5846 host = p->second;
5847 }
5848 if (f) {
5849 f->open_object_section("osd");
5850 f->dump_int("osd", i);
5851 f->dump_string("host", host);
// Single-node keys are dumped as integers.  atoi() yields 0 on
// malformed metadata values; no parse error is surfaced.
5852 for (auto n : { "network_numa_node", "objectstore_numa_node",
5853 "numa_node" }) {
5854 p = m.find(n);
5855 if (p != m.end()) {
5856 f->dump_int(n, atoi(p->second.c_str()));
5857 }
5858 }
// Multi-node keys are comma-separated lists; dump each as an array.
5859 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5860 p = m.find(n);
5861 if (p != m.end()) {
5862 list<string> ls = get_str_list(p->second, ",");
5863 f->open_array_section(n);
5864 for (auto node : ls) {
5865 f->dump_int("node", atoi(node.c_str()));
5866 }
5867 f->close_section();
5868 }
5869 }
5870 for (auto n : { "numa_node_cpus" }) {
5871 p = m.find(n);
5872 if (p != m.end()) {
5873 dump_cpu_list(f.get(), n, p->second);
5874 }
5875 }
5876 f->close_section();
5877 } else {
// Plain-text row: missing metadata keys render as "-".
5878 tbl << i;
5879 tbl << host;
5880 p = m.find("network_numa_nodes");
5881 if (p != m.end()) {
5882 tbl << p->second;
5883 } else {
5884 tbl << "-";
5885 }
5886 p = m.find("objectstore_numa_nodes");
5887 if (p != m.end()) {
5888 tbl << p->second;
5889 } else {
5890 tbl << "-";
5891 }
// AFFINITY and CPUS are shown only when both keys are present.
5892 p = m.find("numa_node");
5893 auto q = m.find("numa_node_cpus");
5894 if (p != m.end() && q != m.end()) {
5895 tbl << p->second;
5896 tbl << q->second;
5897 } else {
5898 tbl << "-";
5899 tbl << "-";
5900 }
5901 tbl << TextTable::endrow;
5902 }
5903 }
5904 }
5905 if (f) {
5906 f->close_section();
5907 f->flush(rdata);
5908 } else {
5909 rdata.append(stringify(tbl));
5910 }
// "osd map <pool> <object> [nspace]": compute where an object maps.
// The object+locator is hashed to a raw pg (object_locator_to_pg), then
// folded to the actual pg (raw_pg_to_pg), and the up/acting sets plus
// primaries are looked up for that pg.
5911 } else if (prefix == "osd map") {
5912 string poolstr, objstr, namespacestr;
5913 cmd_getval(cmdmap, "pool", poolstr);
5914 cmd_getval(cmdmap, "object", objstr);
5915 cmd_getval(cmdmap, "nspace", namespacestr);
5916
5917 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5918 if (pool < 0) {
5919 ss << "pool " << poolstr << " does not exist";
5920 r = -ENOENT;
5921 goto reply;
5922 }
5923 object_locator_t oloc(pool, namespacestr);
5924 object_t oid(objstr);
5925 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5926 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5927 vector<int> up, acting;
5928 int up_p, acting_p;
5929 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5930
// Display name: prefix the namespace (if any) as "<ns>/<name>".
5931 string fullobjname;
5932 if (!namespacestr.empty())
5933 fullobjname = namespacestr + string("/") + oid.name;
5934 else
5935 fullobjname = oid.name;
5936 if (f) {
5937 f->open_object_section("osd_map");
5938 f->dump_unsigned("epoch", osdmap.get_epoch());
5939 f->dump_string("pool", poolstr);
5940 f->dump_int("pool_id", pool);
5941 f->dump_stream("objname") << fullobjname;
5942 f->dump_stream("raw_pgid") << pgid;
5943 f->dump_stream("pgid") << mpgid;
5944 f->open_array_section("up");
5945 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5946 f->dump_int("osd", *p);
5947 f->close_section();
5948 f->dump_int("up_primary", up_p);
5949 f->open_array_section("acting");
5950 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5951 f->dump_int("osd", *p);
5952 f->close_section();
5953 f->dump_int("acting_primary", acting_p);
5954 f->close_section(); // osd_map
5955 f->flush(rdata);
5956 } else {
// One-line human-readable summary of the same mapping.
5957 ds << "osdmap e" << osdmap.get_epoch()
5958 << " pool '" << poolstr << "' (" << pool << ")"
5959 << " object '" << fullobjname << "' ->"
5960 << " pg " << pgid << " (" << mpgid << ")"
5961 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5962 << pg_vector_string(acting) << ", p" << acting_p << ")";
5963 rdata.append(ds);
5964 }
5965
// "pg map <pgid>": report up/acting sets for an explicitly-named pg.
5966 } else if (prefix == "pg map") {
5967 pg_t pgid;
5968 vector<int> up, acting;
5969 r = parse_pgid(cmdmap, ss, pgid);
5970 if (r < 0)
5971 goto reply;
5972 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
// NOTE(review): the raw pgid (not mpgid) is passed here, whereas the
// "osd map" handler above passes the folded mpgid — confirm
// pg_to_up_acting_osds handles a raw pgid equivalently.
5973 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5974 if (f) {
5975 f->open_object_section("pg_map");
5976 f->dump_unsigned("epoch", osdmap.get_epoch());
5977 f->dump_stream("raw_pgid") << pgid;
5978 f->dump_stream("pgid") << mpgid;
5979 f->open_array_section("up");
5980 for (auto osd : up) {
5981 f->dump_int("up_osd", osd);
5982 }
5983 f->close_section();
5984 f->open_array_section("acting");
5985 for (auto osd : acting) {
5986 f->dump_int("acting_osd", osd);
5987 }
5988 f->close_section();
5989 f->close_section();
5990 f->flush(rdata);
5991 } else {
5992 ds << "osdmap e" << osdmap.get_epoch()
5993 << " pg " << pgid << " (" << mpgid << ")"
5994 << " -> up " << up << " acting " << acting;
5995 rdata.append(ds);
5996 }
5997 goto reply;
5998
// "osd lspools": list pool id + name for every pool in the osdmap.
// Both paths accumulate into 'ds'; the formatter path flushes into it.
5999 } else if (prefix == "osd lspools") {
6000 if (f)
6001 f->open_array_section("pools");
6002 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
6003 p != osdmap.pools.end();
6004 ++p) {
6005 if (f) {
6006 f->open_object_section("pool");
6007 f->dump_int("poolnum", p->first);
6008 f->dump_string("poolname", osdmap.pool_name[p->first]);
6009 f->close_section();
6010 } else {
6011 ds << p->first << ' ' << osdmap.pool_name[p->first];
// Suppress the trailing newline after the final entry.
6012 if (next(p) != osdmap.pools.end()) {
6013 ds << '\n';
6014 }
6015 }
6016 }
6017 if (f) {
6018 f->close_section();
6019 f->flush(ds);
6020 }
6021 rdata.append(ds);
// "osd blocklist ls" (legacy alias "osd blacklist ls"): dump both the
// per-address blocklist and the range blocklist, each with its expiry
// time.  The formatter path emits two arrays ("blocklist" and
// "range_blocklist"); the plain path emits "addr until" lines.
6022 } else if (prefix == "osd blocklist ls" ||
6023 prefix == "osd blacklist ls") {
6024 if (f)
6025 f->open_array_section("blocklist");
6026
6027 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6028 p != osdmap.blocklist.end();
6029 ++p) {
6030 if (f) {
6031 f->open_object_section("entry");
6032 f->dump_string("addr", p->first.get_legacy_str());
6033 f->dump_stream("until") << p->second;
6034 f->close_section();
6035 } else {
// NOTE: this local 'ss' deliberately shadows the outer command status
// stream; it only formats a single entry before being discarded.
6036 stringstream ss;
6037 string s;
6038 ss << p->first << " " << p->second;
6039 getline(ss, s);
6040 s += "\n";
6041 rdata.append(s);
6042 }
6043 }
6044 if (f) {
6045 f->close_section();
6046 f->flush(rdata);
6047 }
6048 if (f)
6049 f->open_array_section("range_blocklist");
6050
6051 for (auto p = osdmap.range_blocklist.begin();
6052 p != osdmap.range_blocklist.end();
6053 ++p) {
6054 if (f) {
6055 f->open_object_section("entry");
6056 f->dump_string("range", p->first.get_legacy_str());
6057 f->dump_stream("until") << p->second;
6058 f->close_section();
6059 } else {
6060 stringstream ss;
6061 string s;
6062 ss << p->first << " " << p->second;
6063 getline(ss, s);
6064 s += "\n";
6065 rdata.append(s);
6066 }
6067 }
6068 if (f) {
6069 f->close_section();
6070 f->flush(rdata);
6071 }
// Summary line (outer 'ss') counts entries from both lists combined.
6072 ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
6073
// "osd pool ls [detail]": list pool names, optionally with full pool
// details.  Plain-text "detail" delegates to OSDMap::print_pools();
// the formatter "detail" path additionally dumps the read-balance score
// per pool.
6074 } else if (prefix == "osd pool ls") {
6075 string detail;
6076 cmd_getval(cmdmap, "detail", detail);
6077 if (!f && detail == "detail") {
6078 ostringstream ss;
6079 osdmap.print_pools(cct, ss);
6080 rdata.append(ss.str());
6081 } else {
6082 if (f)
6083 f->open_array_section("pools");
6084 for (auto &[pid, pdata] : osdmap.get_pools()) {
6085 if (f) {
6086 if (detail == "detail") {
6087 f->open_object_section("pool");
6088 f->dump_int("pool_id", pid);
6089 f->dump_string("pool_name", osdmap.get_pool_name(pid));
6090 pdata.dump(f.get());
6091 osdmap.dump_read_balance_score(cct, pid, pdata, f.get());
6092 f->close_section();
6093 } else {
6094 f->dump_string("pool_name", osdmap.get_pool_name(pid));
6095 }
6096 } else {
// No formatter, no detail: one pool name per line.
6097 rdata.append(osdmap.get_pool_name(pid) + "\n");
6098 }
6099 }
6100 if (f) {
6101 f->close_section();
6102 f->flush(rdata);
6103 }
6104 }
6105
// "osd crush get-tunable <tunable>": read a crush tunable.  Only
// "straw_calc_version" is currently supported; anything else returns
// -EINVAL.
6106 } else if (prefix == "osd crush get-tunable") {
6107 string tunable;
6108 cmd_getval(cmdmap, "tunable", tunable);
6109 ostringstream rss;
6110 if (f)
6111 f->open_object_section("tunable");
6112 if (tunable == "straw_calc_version") {
6113 if (f)
6114 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6115 else
6116 rss << osdmap.crush->get_straw_calc_version() << "\n";
6117 } else {
// NOTE: the "tunable" section opened above is abandoned here, but the
// formatter is never flushed on this path, so no partial output leaks.
6118 r = -EINVAL;
6119 goto reply;
6120 }
6121 if (f) {
6122 f->close_section();
6123 f->flush(rdata);
6124 } else {
6125 rdata.append(rss.str());
6126 }
6127 r = 0;
6128
6129 } else if (prefix == "osd pool get") {
6130 string poolstr;
6131 cmd_getval(cmdmap, "pool", poolstr);
6132 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6133 if (pool < 0) {
6134 ss << "unrecognized pool '" << poolstr << "'";
6135 r = -ENOENT;
6136 goto reply;
6137 }
6138
6139 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6140 string var;
6141 cmd_getval(cmdmap, "var", var);
6142
6143 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6144 const choices_map_t ALL_CHOICES = {
6145 {"size", SIZE},
6146 {"min_size", MIN_SIZE},
6147 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6148 {"crush_rule", CRUSH_RULE},
6149 {"hashpspool", HASHPSPOOL},
6150 {"eio", POOL_EIO},
6151 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6152 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6153 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6154 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6155 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6156 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6157 {"use_gmt_hitset", USE_GMT_HITSET},
6158 {"target_max_objects", TARGET_MAX_OBJECTS},
6159 {"target_max_bytes", TARGET_MAX_BYTES},
6160 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6161 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6162 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6163 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6164 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6165 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6166 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6167 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6168 {"fast_read", FAST_READ},
6169 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6170 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6171 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6172 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6173 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6174 {"recovery_priority", RECOVERY_PRIORITY},
6175 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6176 {"scrub_priority", SCRUB_PRIORITY},
6177 {"compression_mode", COMPRESSION_MODE},
6178 {"compression_algorithm", COMPRESSION_ALGORITHM},
6179 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6180 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6181 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6182 {"csum_type", CSUM_TYPE},
6183 {"csum_max_block", CSUM_MAX_BLOCK},
6184 {"csum_min_block", CSUM_MIN_BLOCK},
6185 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6186 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6187 {"pg_num_min", PG_NUM_MIN},
6188 {"pg_num_max", PG_NUM_MAX},
6189 {"target_size_bytes", TARGET_SIZE_BYTES},
6190 {"target_size_ratio", TARGET_SIZE_RATIO},
6191 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6192 {"dedup_tier", DEDUP_TIER},
6193 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6194 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
6195 {"bulk", BULK}
6196 };
6197
6198 typedef std::set<osd_pool_get_choices> choices_set_t;
6199
6200 const choices_set_t ONLY_TIER_CHOICES = {
6201 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6202 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6203 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6204 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6205 MIN_READ_RECENCY_FOR_PROMOTE,
6206 MIN_WRITE_RECENCY_FOR_PROMOTE,
6207 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6208 };
6209 const choices_set_t ONLY_ERASURE_CHOICES = {
6210 EC_OVERWRITES, ERASURE_CODE_PROFILE
6211 };
6212
6213 choices_set_t selected_choices;
6214 if (var == "all") {
6215 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6216 it != ALL_CHOICES.end(); ++it) {
6217 selected_choices.insert(it->second);
6218 }
6219
6220 if(!p->is_tier()) {
6221 selected_choices = subtract_second_from_first(selected_choices,
6222 ONLY_TIER_CHOICES);
6223 }
6224
6225 if(!p->is_erasure()) {
6226 selected_choices = subtract_second_from_first(selected_choices,
6227 ONLY_ERASURE_CHOICES);
6228 }
6229 } else /* var != "all" */ {
6230 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6231 if (found == ALL_CHOICES.end()) {
6232 ss << "pool '" << poolstr
6233 << "': invalid variable: '" << var << "'";
6234 r = -EINVAL;
6235 goto reply;
6236 }
6237
6238 osd_pool_get_choices selected = found->second;
6239
6240 if (!p->is_tier() &&
6241 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6242 ss << "pool '" << poolstr
6243 << "' is not a tier pool: variable not applicable";
6244 r = -EACCES;
6245 goto reply;
6246 }
6247
6248 if (!p->is_erasure() &&
6249 ONLY_ERASURE_CHOICES.find(selected)
6250 != ONLY_ERASURE_CHOICES.end()) {
6251 ss << "pool '" << poolstr
6252 << "' is not a erasure pool: variable not applicable";
6253 r = -EACCES;
6254 goto reply;
6255 }
6256
6257 if (pool_opts_t::is_opt_name(var) &&
6258 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6259 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6260 r = -ENOENT;
6261 goto reply;
6262 }
6263
6264 selected_choices.insert(selected);
6265 }
6266
6267 if (f) {
6268 f->open_object_section("pool");
6269 f->dump_string("pool", poolstr);
6270 f->dump_int("pool_id", pool);
6271 for(choices_set_t::const_iterator it = selected_choices.begin();
6272 it != selected_choices.end(); ++it) {
6273 choices_map_t::const_iterator i;
6274 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6275 if (i->second == *it) {
6276 break;
6277 }
6278 }
6279 ceph_assert(i != ALL_CHOICES.end());
6280 switch(*it) {
6281 case PG_NUM:
6282 f->dump_int("pg_num", p->get_pg_num());
6283 break;
6284 case PGP_NUM:
6285 f->dump_int("pgp_num", p->get_pgp_num());
6286 break;
6287 case SIZE:
6288 f->dump_int("size", p->get_size());
6289 break;
6290 case MIN_SIZE:
6291 f->dump_int("min_size", p->get_min_size());
6292 break;
6293 case CRUSH_RULE:
6294 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6295 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6296 p->get_crush_rule()));
6297 } else {
6298 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6299 }
6300 break;
6301 case EC_OVERWRITES:
6302 f->dump_bool("allow_ec_overwrites",
6303 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6304 break;
6305 case PG_AUTOSCALE_MODE:
6306 f->dump_string("pg_autoscale_mode",
6307 pg_pool_t::get_pg_autoscale_mode_name(
6308 p->pg_autoscale_mode));
6309 break;
6310 case HASHPSPOOL:
6311 case POOL_EIO:
6312 case NODELETE:
6313 case BULK:
6314 case NOPGCHANGE:
6315 case NOSIZECHANGE:
6316 case WRITE_FADVISE_DONTNEED:
6317 case NOSCRUB:
6318 case NODEEP_SCRUB:
6319 f->dump_bool(i->first.c_str(),
6320 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6321 break;
6322 case HIT_SET_PERIOD:
6323 f->dump_int("hit_set_period", p->hit_set_period);
6324 break;
6325 case HIT_SET_COUNT:
6326 f->dump_int("hit_set_count", p->hit_set_count);
6327 break;
6328 case HIT_SET_TYPE:
6329 f->dump_string("hit_set_type",
6330 HitSet::get_type_name(p->hit_set_params.get_type()));
6331 break;
6332 case HIT_SET_FPP:
6333 {
6334 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6335 BloomHitSet::Params *bloomp =
6336 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6337 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6338 } else if(var != "all") {
6339 f->close_section();
6340 ss << "hit set is not of type Bloom; " <<
6341 "invalid to get a false positive rate!";
6342 r = -EINVAL;
6343 goto reply;
6344 }
6345 }
6346 break;
6347 case USE_GMT_HITSET:
6348 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6349 break;
6350 case TARGET_MAX_OBJECTS:
6351 f->dump_unsigned("target_max_objects", p->target_max_objects);
6352 break;
6353 case TARGET_MAX_BYTES:
6354 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6355 break;
6356 case CACHE_TARGET_DIRTY_RATIO:
6357 f->dump_unsigned("cache_target_dirty_ratio_micro",
6358 p->cache_target_dirty_ratio_micro);
6359 f->dump_float("cache_target_dirty_ratio",
6360 ((float)p->cache_target_dirty_ratio_micro/1000000));
6361 break;
6362 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6363 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6364 p->cache_target_dirty_high_ratio_micro);
6365 f->dump_float("cache_target_dirty_high_ratio",
6366 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6367 break;
6368 case CACHE_TARGET_FULL_RATIO:
6369 f->dump_unsigned("cache_target_full_ratio_micro",
6370 p->cache_target_full_ratio_micro);
6371 f->dump_float("cache_target_full_ratio",
6372 ((float)p->cache_target_full_ratio_micro/1000000));
6373 break;
6374 case CACHE_MIN_FLUSH_AGE:
6375 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6376 break;
6377 case CACHE_MIN_EVICT_AGE:
6378 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6379 break;
6380 case ERASURE_CODE_PROFILE:
6381 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6382 break;
6383 case MIN_READ_RECENCY_FOR_PROMOTE:
6384 f->dump_int("min_read_recency_for_promote",
6385 p->min_read_recency_for_promote);
6386 break;
6387 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6388 f->dump_int("min_write_recency_for_promote",
6389 p->min_write_recency_for_promote);
6390 break;
6391 case FAST_READ:
6392 f->dump_int("fast_read", p->fast_read);
6393 break;
6394 case HIT_SET_GRADE_DECAY_RATE:
6395 f->dump_int("hit_set_grade_decay_rate",
6396 p->hit_set_grade_decay_rate);
6397 break;
6398 case HIT_SET_SEARCH_LAST_N:
6399 f->dump_int("hit_set_search_last_n",
6400 p->hit_set_search_last_n);
6401 break;
6402 case SCRUB_MIN_INTERVAL:
6403 case SCRUB_MAX_INTERVAL:
6404 case DEEP_SCRUB_INTERVAL:
6405 case RECOVERY_PRIORITY:
6406 case RECOVERY_OP_PRIORITY:
6407 case SCRUB_PRIORITY:
6408 case COMPRESSION_MODE:
6409 case COMPRESSION_ALGORITHM:
6410 case COMPRESSION_REQUIRED_RATIO:
6411 case COMPRESSION_MAX_BLOB_SIZE:
6412 case COMPRESSION_MIN_BLOB_SIZE:
6413 case CSUM_TYPE:
6414 case CSUM_MAX_BLOCK:
6415 case CSUM_MIN_BLOCK:
6416 case FINGERPRINT_ALGORITHM:
6417 case PG_NUM_MIN:
6418 case PG_NUM_MAX:
6419 case TARGET_SIZE_BYTES:
6420 case TARGET_SIZE_RATIO:
6421 case PG_AUTOSCALE_BIAS:
6422 case DEDUP_TIER:
6423 case DEDUP_CHUNK_ALGORITHM:
6424 case DEDUP_CDC_CHUNK_SIZE:
6425 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6426 if (p->opts.is_set(key)) {
6427 if(*it == CSUM_TYPE) {
6428 int64_t val;
6429 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6430 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6431 } else {
6432 p->opts.dump(i->first, f.get());
6433 }
6434 }
6435 break;
6436 }
6437 }
6438 f->close_section();
6439 f->flush(rdata);
6440 } else /* !f */ {
6441 for(choices_set_t::const_iterator it = selected_choices.begin();
6442 it != selected_choices.end(); ++it) {
6443 choices_map_t::const_iterator i;
6444 switch(*it) {
6445 case PG_NUM:
6446 ss << "pg_num: " << p->get_pg_num() << "\n";
6447 break;
6448 case PGP_NUM:
6449 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6450 break;
6451 case SIZE:
6452 ss << "size: " << p->get_size() << "\n";
6453 break;
6454 case MIN_SIZE:
6455 ss << "min_size: " << p->get_min_size() << "\n";
6456 break;
6457 case CRUSH_RULE:
6458 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6459 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6460 p->get_crush_rule()) << "\n";
6461 } else {
6462 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6463 }
6464 break;
6465 case PG_AUTOSCALE_MODE:
6466 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6467 p->pg_autoscale_mode) <<"\n";
6468 break;
6469 case HIT_SET_PERIOD:
6470 ss << "hit_set_period: " << p->hit_set_period << "\n";
6471 break;
6472 case HIT_SET_COUNT:
6473 ss << "hit_set_count: " << p->hit_set_count << "\n";
6474 break;
6475 case HIT_SET_TYPE:
6476 ss << "hit_set_type: " <<
6477 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6478 break;
6479 case HIT_SET_FPP:
6480 {
6481 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6482 BloomHitSet::Params *bloomp =
6483 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6484 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6485 } else if(var != "all") {
6486 ss << "hit set is not of type Bloom; " <<
6487 "invalid to get a false positive rate!";
6488 r = -EINVAL;
6489 goto reply;
6490 }
6491 }
6492 break;
6493 case USE_GMT_HITSET:
6494 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6495 break;
6496 case TARGET_MAX_OBJECTS:
6497 ss << "target_max_objects: " << p->target_max_objects << "\n";
6498 break;
6499 case TARGET_MAX_BYTES:
6500 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6501 break;
6502 case CACHE_TARGET_DIRTY_RATIO:
6503 ss << "cache_target_dirty_ratio: "
6504 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6505 break;
6506 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6507 ss << "cache_target_dirty_high_ratio: "
6508 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6509 break;
6510 case CACHE_TARGET_FULL_RATIO:
6511 ss << "cache_target_full_ratio: "
6512 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6513 break;
6514 case CACHE_MIN_FLUSH_AGE:
6515 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6516 break;
6517 case CACHE_MIN_EVICT_AGE:
6518 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6519 break;
6520 case ERASURE_CODE_PROFILE:
6521 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6522 break;
6523 case MIN_READ_RECENCY_FOR_PROMOTE:
6524 ss << "min_read_recency_for_promote: " <<
6525 p->min_read_recency_for_promote << "\n";
6526 break;
6527 case HIT_SET_GRADE_DECAY_RATE:
6528 ss << "hit_set_grade_decay_rate: " <<
6529 p->hit_set_grade_decay_rate << "\n";
6530 break;
6531 case HIT_SET_SEARCH_LAST_N:
6532 ss << "hit_set_search_last_n: " <<
6533 p->hit_set_search_last_n << "\n";
6534 break;
6535 case EC_OVERWRITES:
6536 ss << "allow_ec_overwrites: " <<
6537 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6538 "\n";
6539 break;
6540 case HASHPSPOOL:
6541 case POOL_EIO:
6542 case NODELETE:
6543 case BULK:
6544 case NOPGCHANGE:
6545 case NOSIZECHANGE:
6546 case WRITE_FADVISE_DONTNEED:
6547 case NOSCRUB:
6548 case NODEEP_SCRUB:
6549 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6550 if (i->second == *it)
6551 break;
6552 }
6553 ceph_assert(i != ALL_CHOICES.end());
6554 ss << i->first << ": " <<
6555 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6556 "true" : "false") << "\n";
6557 break;
6558 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6559 ss << "min_write_recency_for_promote: " <<
6560 p->min_write_recency_for_promote << "\n";
6561 break;
6562 case FAST_READ:
6563 ss << "fast_read: " << p->fast_read << "\n";
6564 break;
6565 case SCRUB_MIN_INTERVAL:
6566 case SCRUB_MAX_INTERVAL:
6567 case DEEP_SCRUB_INTERVAL:
6568 case RECOVERY_PRIORITY:
6569 case RECOVERY_OP_PRIORITY:
6570 case SCRUB_PRIORITY:
6571 case COMPRESSION_MODE:
6572 case COMPRESSION_ALGORITHM:
6573 case COMPRESSION_REQUIRED_RATIO:
6574 case COMPRESSION_MAX_BLOB_SIZE:
6575 case COMPRESSION_MIN_BLOB_SIZE:
6576 case CSUM_TYPE:
6577 case CSUM_MAX_BLOCK:
6578 case CSUM_MIN_BLOCK:
6579 case FINGERPRINT_ALGORITHM:
6580 case PG_NUM_MIN:
6581 case PG_NUM_MAX:
6582 case TARGET_SIZE_BYTES:
6583 case TARGET_SIZE_RATIO:
6584 case PG_AUTOSCALE_BIAS:
6585 case DEDUP_TIER:
6586 case DEDUP_CHUNK_ALGORITHM:
6587 case DEDUP_CDC_CHUNK_SIZE:
6588 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6589 if (i->second == *it)
6590 break;
6591 }
6592 ceph_assert(i != ALL_CHOICES.end());
6593 {
6594 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6595 if (p->opts.is_set(key)) {
6596 if(key == pool_opts_t::CSUM_TYPE) {
6597 int64_t val;
6598 p->opts.get(key, &val);
6599 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6600 } else {
6601 ss << i->first << ": " << p->opts.get(key) << "\n";
6602 }
6603 }
6604 }
6605 break;
6606 }
6607 rdata.append(ss.str());
6608 ss.str("");
6609 }
6610 }
6611 r = 0;
// "osd pool get-quota <pool>": report the pool's object/byte quotas
// together with current usage pulled from the mgr's pool stats.  A quota
// value of 0 means "no quota" and is rendered as N/A.
6612 } else if (prefix == "osd pool get-quota") {
6613 string pool_name;
6614 cmd_getval(cmdmap, "pool", pool_name);
6615
6616 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6617 if (poolid < 0) {
6618 ceph_assert(poolid == -ENOENT);
6619 ss << "unrecognized pool '" << pool_name << "'";
6620 r = -ENOENT;
6621 goto reply;
6622 }
6623 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
// Usage comes from the mgr stat monitor; without stats we cannot report.
6624 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
6625 if (!pstat) {
6626 ss << "no stats for pool '" << pool_name << "'";
6627 r = -ENOENT;
6628 goto reply;
6629 }
6630 const object_stat_sum_t& sum = pstat->stats.sum;
6631 if (f) {
6632 f->open_object_section("pool_quotas");
6633 f->dump_string("pool_name", pool_name);
6634 f->dump_unsigned("pool_id", poolid);
6635 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6636 f->dump_int("current_num_objects", sum.num_objects);
6637 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6638 f->dump_int("current_num_bytes", sum.num_bytes);
6639 f->close_section();
6640 f->flush(rdata);
6641 } else {
6642 stringstream rs;
6643 rs << "quotas for pool '" << pool_name << "':\n"
6644 << "  max objects: ";
6645 if (p->quota_max_objects == 0)
6646 rs << "N/A";
6647 else {
6648 rs << si_u_t(p->quota_max_objects) << " objects";
6649 rs << " (current num objects: " << sum.num_objects << " objects)";
6650 }
6651 rs << "\n"
6652 << "  max bytes  : ";
6653 if (p->quota_max_bytes == 0)
6654 rs << "N/A";
6655 else {
6656 rs << byte_u_t(p->quota_max_bytes);
6657 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6658 }
6659 rdata.append(rs.str());
6660 }
6661 rdata.append("\n");
6662 r = 0;
6663 } else if (prefix == "osd crush rule list" ||
6664 prefix == "osd crush rule ls") {
6665 if (f) {
6666 f->open_array_section("rules");
6667 osdmap.crush->list_rules(f.get());
6668 f->close_section();
6669 f->flush(rdata);
6670 } else {
6671 ostringstream ss;
6672 osdmap.crush->list_rules(&ss);
6673 rdata.append(ss.str());
6674 }
6675 } else if (prefix == "osd crush rule ls-by-class") {
6676 string class_name;
6677 cmd_getval(cmdmap, "class", class_name);
6678 if (class_name.empty()) {
6679 ss << "no class specified";
6680 r = -EINVAL;
6681 goto reply;
6682 }
6683 set<int> rules;
6684 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6685 if (r < 0) {
6686 ss << "failed to get rules by class '" << class_name << "'";
6687 goto reply;
6688 }
6689 if (f) {
6690 f->open_array_section("rules");
6691 for (auto &rule: rules) {
6692 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6693 }
6694 f->close_section();
6695 f->flush(rdata);
6696 } else {
6697 ostringstream rs;
6698 for (auto &rule: rules) {
6699 rs << osdmap.crush->get_rule_name(rule) << "\n";
6700 }
6701 rdata.append(rs.str());
6702 }
6703 } else if (prefix == "osd crush rule dump") {
6704 string name;
6705 cmd_getval(cmdmap, "name", name);
6706 string format;
6707 cmd_getval(cmdmap, "format", format);
6708 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6709 if (name == "") {
6710 f->open_array_section("rules");
6711 osdmap.crush->dump_rules(f.get());
6712 f->close_section();
6713 } else {
6714 int ruleno = osdmap.crush->get_rule_id(name);
6715 if (ruleno < 0) {
6716 ss << "unknown crush rule '" << name << "'";
6717 r = ruleno;
6718 goto reply;
6719 }
6720 osdmap.crush->dump_rule(ruleno, f.get());
6721 }
6722 ostringstream rs;
6723 f->flush(rs);
6724 rs << "\n";
6725 rdata.append(rs.str());
6726 } else if (prefix == "osd crush dump") {
6727 string format;
6728 cmd_getval(cmdmap, "format", format);
6729 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6730 f->open_object_section("crush_map");
6731 osdmap.crush->dump(f.get());
6732 f->close_section();
6733 ostringstream rs;
6734 f->flush(rs);
6735 rs << "\n";
6736 rdata.append(rs.str());
6737 } else if (prefix == "osd crush show-tunables") {
6738 string format;
6739 cmd_getval(cmdmap, "format", format);
6740 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6741 f->open_object_section("crush_map_tunables");
6742 osdmap.crush->dump_tunables(f.get());
6743 f->close_section();
6744 ostringstream rs;
6745 f->flush(rs);
6746 rs << "\n";
6747 rdata.append(rs.str());
6748 } else if (prefix == "osd crush tree") {
6749 bool show_shadow = false;
6750 if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
6751 std::string shadow;
6752 if (cmd_getval(cmdmap, "shadow", shadow) &&
6753 shadow == "--show-shadow") {
6754 show_shadow = true;
6755 }
6756 }
6757 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6758 if (f) {
6759 f->open_object_section("crush_tree");
6760 osdmap.crush->dump_tree(nullptr,
6761 f.get(),
6762 osdmap.get_pool_names(),
6763 show_shadow);
6764 f->close_section();
6765 f->flush(rdata);
6766 } else {
6767 ostringstream ss;
6768 osdmap.crush->dump_tree(&ss,
6769 nullptr,
6770 osdmap.get_pool_names(),
6771 show_shadow);
6772 rdata.append(ss.str());
6773 }
6774 } else if (prefix == "osd crush ls") {
6775 string name;
6776 if (!cmd_getval(cmdmap, "node", name)) {
6777 ss << "no node specified";
6778 r = -EINVAL;
6779 goto reply;
6780 }
6781 if (!osdmap.crush->name_exists(name)) {
6782 ss << "node '" << name << "' does not exist";
6783 r = -ENOENT;
6784 goto reply;
6785 }
6786 int id = osdmap.crush->get_item_id(name);
6787 list<int> result;
6788 if (id >= 0) {
6789 result.push_back(id);
6790 } else {
6791 int num = osdmap.crush->get_bucket_size(id);
6792 for (int i = 0; i < num; ++i) {
6793 result.push_back(osdmap.crush->get_bucket_item(id, i));
6794 }
6795 }
6796 if (f) {
6797 f->open_array_section("items");
6798 for (auto i : result) {
6799 f->dump_string("item", osdmap.crush->get_item_name(i));
6800 }
6801 f->close_section();
6802 f->flush(rdata);
6803 } else {
6804 ostringstream ss;
6805 for (auto i : result) {
6806 ss << osdmap.crush->get_item_name(i) << "\n";
6807 }
6808 rdata.append(ss.str());
6809 }
6810 r = 0;
6811 } else if (prefix == "osd crush class ls") {
6812 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6813 f->open_array_section("crush_classes");
6814 for (auto i : osdmap.crush->class_name)
6815 f->dump_string("class", i.second);
6816 f->close_section();
6817 f->flush(rdata);
6818 } else if (prefix == "osd crush class ls-osd") {
6819 string name;
6820 cmd_getval(cmdmap, "class", name);
6821 set<int> osds;
6822 osdmap.crush->get_devices_by_class(name, &osds);
6823 if (f) {
6824 f->open_array_section("osds");
6825 for (auto &osd: osds)
6826 f->dump_int("osd", osd);
6827 f->close_section();
6828 f->flush(rdata);
6829 } else {
6830 bool first = true;
6831 for (auto &osd : osds) {
6832 if (!first)
6833 ds << "\n";
6834 first = false;
6835 ds << osd;
6836 }
6837 rdata.append(ds);
6838 }
6839 } else if (prefix == "osd crush get-device-class") {
6840 vector<string> idvec;
6841 cmd_getval(cmdmap, "ids", idvec);
6842 map<int, string> class_by_osd;
6843 for (auto& id : idvec) {
6844 ostringstream ts;
6845 long osd = parse_osd_id(id.c_str(), &ts);
6846 if (osd < 0) {
6847 ss << "unable to parse osd id:'" << id << "'";
6848 r = -EINVAL;
6849 goto reply;
6850 }
6851 auto device_class = osdmap.crush->get_item_class(osd);
6852 if (device_class)
6853 class_by_osd[osd] = device_class;
6854 else
6855 class_by_osd[osd] = ""; // no class
6856 }
6857 if (f) {
6858 f->open_array_section("osd_device_classes");
6859 for (auto& i : class_by_osd) {
6860 f->open_object_section("osd_device_class");
6861 f->dump_int("osd", i.first);
6862 f->dump_string("device_class", i.second);
6863 f->close_section();
6864 }
6865 f->close_section();
6866 f->flush(rdata);
6867 } else {
6868 if (class_by_osd.size() == 1) {
6869 // for single input, make a clean output
6870 ds << class_by_osd.begin()->second;
6871 } else {
6872 // note that we do not group osds by class here
6873 for (auto it = class_by_osd.begin();
6874 it != class_by_osd.end();
6875 it++) {
6876 ds << "osd." << it->first << ' ' << it->second;
6877 if (next(it) != class_by_osd.end())
6878 ds << '\n';
6879 }
6880 }
6881 rdata.append(ds);
6882 }
6883 } else if (prefix == "osd erasure-code-profile ls") {
6884 const auto &profiles = osdmap.get_erasure_code_profiles();
6885 if (f)
6886 f->open_array_section("erasure-code-profiles");
6887 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6888 if (f)
6889 f->dump_string("profile", i->first.c_str());
6890 else
6891 rdata.append(i->first + "\n");
6892 }
6893 if (f) {
6894 f->close_section();
6895 ostringstream rs;
6896 f->flush(rs);
6897 rs << "\n";
6898 rdata.append(rs.str());
6899 }
6900 } else if (prefix == "osd crush weight-set ls") {
6901 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6902 if (f) {
6903 f->open_array_section("weight_sets");
6904 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6905 f->dump_string("pool", "(compat)");
6906 }
6907 for (auto& i : osdmap.crush->choose_args) {
6908 if (i.first >= 0) {
6909 f->dump_string("pool", osdmap.get_pool_name(i.first));
6910 }
6911 }
6912 f->close_section();
6913 f->flush(rdata);
6914 } else {
6915 ostringstream rs;
6916 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6917 rs << "(compat)\n";
6918 }
6919 for (auto& i : osdmap.crush->choose_args) {
6920 if (i.first >= 0) {
6921 rs << osdmap.get_pool_name(i.first) << "\n";
6922 }
6923 }
6924 rdata.append(rs.str());
6925 }
6926 } else if (prefix == "osd crush weight-set dump") {
6927 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6928 "json-pretty"));
6929 osdmap.crush->dump_choose_args(f.get());
6930 f->flush(rdata);
6931 } else if (prefix == "osd erasure-code-profile get") {
6932 string name;
6933 cmd_getval(cmdmap, "name", name);
6934 if (!osdmap.has_erasure_code_profile(name)) {
6935 ss << "unknown erasure code profile '" << name << "'";
6936 r = -ENOENT;
6937 goto reply;
6938 }
6939 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6940 if (f)
6941 f->open_object_section("profile");
6942 for (map<string,string>::const_iterator i = profile.begin();
6943 i != profile.end();
6944 ++i) {
6945 if (f)
6946 f->dump_string(i->first.c_str(), i->second.c_str());
6947 else
6948 rdata.append(i->first + "=" + i->second + "\n");
6949 }
6950 if (f) {
6951 f->close_section();
6952 ostringstream rs;
6953 f->flush(rs);
6954 rs << "\n";
6955 rdata.append(rs.str());
6956 }
6957 } else if (prefix == "osd pool application get") {
6958 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6959 "json-pretty"));
6960 string pool_name;
6961 cmd_getval(cmdmap, "pool", pool_name);
6962 string app;
6963 cmd_getval(cmdmap, "app", app);
6964 string key;
6965 cmd_getval(cmdmap, "key", key);
6966
6967 if (pool_name.empty()) {
6968 // all
6969 f->open_object_section("pools");
6970 for (const auto &pool : osdmap.pools) {
6971 std::string name("<unknown>");
6972 const auto &pni = osdmap.pool_name.find(pool.first);
6973 if (pni != osdmap.pool_name.end())
6974 name = pni->second;
6975 f->open_object_section(name.c_str());
6976 for (auto &app_pair : pool.second.application_metadata) {
6977 f->open_object_section(app_pair.first.c_str());
6978 for (auto &kv_pair : app_pair.second) {
6979 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6980 }
6981 f->close_section();
6982 }
6983 f->close_section(); // name
6984 }
6985 f->close_section(); // pools
6986 f->flush(rdata);
6987 } else {
6988 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6989 if (pool < 0) {
6990 ss << "unrecognized pool '" << pool_name << "'";
6991 r = -ENOENT;
6992 goto reply;
6993 }
6994 auto p = osdmap.get_pg_pool(pool);
6995 // filter by pool
6996 if (app.empty()) {
6997 f->open_object_section(pool_name.c_str());
6998 for (auto &app_pair : p->application_metadata) {
6999 f->open_object_section(app_pair.first.c_str());
7000 for (auto &kv_pair : app_pair.second) {
7001 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7002 }
7003 f->close_section(); // application
7004 }
7005 f->close_section(); // pool_name
7006 f->flush(rdata);
7007 goto reply;
7008 }
7009
7010 auto app_it = p->application_metadata.find(app);
7011 if (app_it == p->application_metadata.end()) {
7012 ss << "pool '" << pool_name << "' has no application '" << app << "'";
7013 r = -ENOENT;
7014 goto reply;
7015 }
7016 // filter by pool + app
7017 if (key.empty()) {
7018 f->open_object_section(app_it->first.c_str());
7019 for (auto &kv_pair : app_it->second) {
7020 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7021 }
7022 f->close_section(); // application
7023 f->flush(rdata);
7024 goto reply;
7025 }
7026 // filter by pool + app + key
7027 auto key_it = app_it->second.find(key);
7028 if (key_it == app_it->second.end()) {
7029 ss << "application '" << app << "' on pool '" << pool_name
7030 << "' does not have key '" << key << "'";
7031 r = -ENOENT;
7032 goto reply;
7033 }
7034 ss << key_it->second << "\n";
7035 rdata.append(ss.str());
7036 ss.str("");
7037 }
7038 } else if (prefix == "osd get-require-min-compat-client") {
7039 ss << osdmap.require_min_compat_client << std::endl;
7040 rdata.append(ss.str());
7041 ss.str("");
7042 goto reply;
7043 } else if (prefix == "osd pool application enable" ||
7044 prefix == "osd pool application disable" ||
7045 prefix == "osd pool application set" ||
7046 prefix == "osd pool application rm") {
7047 bool changed = false;
7048 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7049 if (r != 0) {
7050 // Error, reply.
7051 goto reply;
7052 } else if (changed) {
7053 // Valid mutation, proceed to prepare phase
7054 return false;
7055 } else {
7056 // Idempotent case, reply
7057 goto reply;
7058 }
7059 } else {
7060 // try prepare update
7061 return false;
7062 }
7063
7064 reply:
7065 string rs;
7066 getline(ss, rs);
7067 mon.reply_command(op, r, rs, rdata, get_last_committed());
7068 return true;
7069 }
7070
7071 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7072 {
7073 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7074 osdmap.get_pg_pool(pool_id));
7075 ceph_assert(pool);
7076 pool->set_flag(flags);
7077 }
7078
7079 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7080 {
7081 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7082 osdmap.get_pg_pool(pool_id));
7083 ceph_assert(pool);
7084 pool->unset_flag(flags);
7085 }
7086
7087 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7088 {
7089 char k[80];
7090 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7091 return k;
7092 }
7093
7094 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7095 {
7096 char k[80];
7097 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7098 (unsigned long long)pool, (unsigned long long)snap);
7099 return k;
7100 }
7101
// Build the key and value for a purged-snap record covering the
// half-open interval [snap, snap+num) of 'pool', purged as of 'epoch'.
// The value encodes (begin, end, epoch); the returned key embeds the
// *last* snap of the interval so that a forward lower_bound() scan from
// any snap inside the interval lands on this record -- see
// lookup_purged_snap().
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
7113
7114
// Look up the purged-snap record whose interval contains (pool, snap).
//
// Records are keyed by the *last* snap of each interval (see
// make_purged_snap_key_value), so a lower_bound seek on the key for
// 'snap' lands on the only record that could contain it.
//
// Returns 0 and fills *begin/*end with the half-open interval
// [begin, end) containing 'snap'; returns -ENOENT if no record matches.
// NOTE: on the final (no-overlap) failure path *begin/*end have already
// been filled with the decoded neighboring interval -- callers such as
// try_prune_purged_snaps() rely on inspecting those values even when
// -ENOENT is returned.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // ran off the end of the prefix: nothing at or after our key
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on a different record type within OSD_SNAP_PREFIX
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // crossed into the next pool's records
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value encodes (begin, end, epoch); we only need the interval here
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // record exists but its interval does not cover 'snap'
    // (*begin/*end are left populated for the caller -- see note above)
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7164
// Record that snaps [start, end) of 'pool' have been purged, coalescing
// the new interval with any adjacent/overlapping records already in the
// store.  We probe the neighbors via lookup_purged_snap(): 'b' tests
// whether start-1 is covered (an earlier record we should extend), 'a'
// tests whether 'end' is covered (a later record we should absorb).
// Because records are keyed by their *last* snap, extending an interval
// to the right changes its key, so the old record must be erased first.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends an earlier interval to the right; its key changes, so
    // erase the old record and write the widened one
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends a later interval to the left; the key (last snap) is
    // unchanged, so simply overwrite it in place
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no neighbors: write a brand new record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7220
// Move snaps that the OSDs (via the mgr digest) report as purged into
// pending_inc.new_purged_snaps, up to mon_max_snap_prune_per_epoch per
// epoch.  Returns true if anything was queued for pruning (i.e. the
// pending incremental was mutated), false otherwise.
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon.mgrstatmon()->is_readable()) {
    // mgr stat digest not available yet; try again later
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // 0 means "no limit"; substitute a large cap
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      // NOTE: lookup_purged_snap fills pbegin/pend even on -ENOENT
      // (the nearest record's interval), which we use to clip our range
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7299
// Re-evaluate per-pool quota state against current mgr pool statistics,
// setting or clearing FLAG_FULL_QUOTA (and related full flags) in the
// pending incremental.  Returns true if any pool flags were changed.
bool OSDMonitor::update_pools_status()
{
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is over quota when either its byte or object quota is
    // configured (>0) and met or exceeded
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked over-quota; clear only once it drops below quota
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently marked; set flags if quota newly exceeded
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7360
7361 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7362 {
7363 op->mark_osdmon_event(__func__);
7364 auto m = op->get_req<MPoolOp>();
7365 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7366 MonSession *session = op->get_session();
7367 if (!session)
7368 return -EPERM;
7369 string erasure_code_profile;
7370 stringstream ss;
7371 string rule_name;
7372 bool bulk = false;
7373 int ret = 0;
7374 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7375 0, 0, 0, 0, 0, 0, 0.0,
7376 erasure_code_profile,
7377 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7378 cct->_conf.get_val<bool>("osd_pool_default_crimson"),
7379 &ss);
7380
7381 if (ret < 0) {
7382 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7383 }
7384 return ret;
7385 }
7386
7387 int OSDMonitor::crush_rename_bucket(const string& srcname,
7388 const string& dstname,
7389 ostream *ss)
7390 {
7391 int ret;
7392 //
7393 // Avoid creating a pending crush if it does not already exists and
7394 // the rename would fail.
7395 //
7396 if (!_have_pending_crush()) {
7397 ret = _get_stable_crush().can_rename_bucket(srcname,
7398 dstname,
7399 ss);
7400 if (ret)
7401 return ret;
7402 }
7403
7404 CrushWrapper newcrush = _get_pending_crush();
7405
7406 ret = newcrush.rename_bucket(srcname,
7407 dstname,
7408 ss);
7409 if (ret)
7410 return ret;
7411
7412 pending_inc.crush.clear();
7413 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7414 *ss << "renamed bucket " << srcname << " into " << dstname;
7415 return 0;
7416 }
7417
7418 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7419 {
7420 string replacement = "";
7421
7422 if (plugin == "jerasure_generic" ||
7423 plugin == "jerasure_sse3" ||
7424 plugin == "jerasure_sse4" ||
7425 plugin == "jerasure_neon") {
7426 replacement = "jerasure";
7427 } else if (plugin == "shec_generic" ||
7428 plugin == "shec_sse3" ||
7429 plugin == "shec_sse4" ||
7430 plugin == "shec_neon") {
7431 replacement = "shec";
7432 }
7433
7434 if (replacement != "") {
7435 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7436 << plugin << " that has been deprecated. Please use "
7437 << replacement << " instead." << dendl;
7438 }
7439 }
7440
7441 int OSDMonitor::normalize_profile(const string& profilename,
7442 ErasureCodeProfile &profile,
7443 bool force,
7444 ostream *ss)
7445 {
7446 ErasureCodeInterfaceRef erasure_code;
7447 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7448 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7449 check_legacy_ec_plugin(plugin->second, profilename);
7450 int err = instance.factory(plugin->second,
7451 g_conf().get_val<std::string>("erasure_code_dir"),
7452 profile, &erasure_code, ss);
7453 if (err) {
7454 return err;
7455 }
7456
7457 err = erasure_code->init(profile, ss);
7458 if (err) {
7459 return err;
7460 }
7461
7462 auto it = profile.find("stripe_unit");
7463 if (it != profile.end()) {
7464 string err_str;
7465 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7466 if (!err_str.empty()) {
7467 *ss << "could not parse stripe_unit '" << it->second
7468 << "': " << err_str << std::endl;
7469 return -EINVAL;
7470 }
7471 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7472 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7473 if (chunk_size != stripe_unit) {
7474 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7475 << "alignment. Would be padded to " << chunk_size
7476 << std::endl;
7477 return -EINVAL;
7478 }
7479 if ((stripe_unit % 4096) != 0 && !force) {
7480 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7481 << "use --force to override this check" << std::endl;
7482 return -EINVAL;
7483 }
7484 }
7485 return 0;
7486 }
7487
7488 int OSDMonitor::crush_rule_create_erasure(const string &name,
7489 const string &profile,
7490 int *rule,
7491 ostream *ss)
7492 {
7493 int ruleid = osdmap.crush->get_rule_id(name);
7494 if (ruleid != -ENOENT) {
7495 *rule = ruleid;
7496 return -EEXIST;
7497 }
7498
7499 CrushWrapper newcrush = _get_pending_crush();
7500
7501 ruleid = newcrush.get_rule_id(name);
7502 if (ruleid != -ENOENT) {
7503 *rule = ruleid;
7504 return -EALREADY;
7505 } else {
7506 ErasureCodeInterfaceRef erasure_code;
7507 int err = get_erasure_code(profile, &erasure_code, ss);
7508 if (err) {
7509 *ss << "failed to load plugin using profile " << profile << std::endl;
7510 return err;
7511 }
7512
7513 err = erasure_code->create_rule(name, newcrush, ss);
7514 erasure_code.reset();
7515 if (err < 0)
7516 return err;
7517 *rule = err;
7518 pending_inc.crush.clear();
7519 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7520 return 0;
7521 }
7522 }
7523
7524 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7525 ErasureCodeInterfaceRef *erasure_code,
7526 ostream *ss) const
7527 {
7528 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7529 return -EAGAIN;
7530 ErasureCodeProfile profile =
7531 osdmap.get_erasure_code_profile(erasure_code_profile);
7532 ErasureCodeProfile::const_iterator plugin =
7533 profile.find("plugin");
7534 if (plugin == profile.end()) {
7535 *ss << "cannot determine the erasure code plugin"
7536 << " because there is no 'plugin' entry in the erasure_code_profile "
7537 << profile << std::endl;
7538 return -EINVAL;
7539 }
7540 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7541 auto& instance = ErasureCodePluginRegistry::instance();
7542 return instance.factory(plugin->second,
7543 g_conf().get_val<std::string>("erasure_code_dir"),
7544 profile, erasure_code, ss);
7545 }
7546
7547 int OSDMonitor::check_cluster_features(uint64_t features,
7548 stringstream &ss)
7549 {
7550 stringstream unsupported_ss;
7551 int unsupported_count = 0;
7552 if ((mon.get_quorum_con_features() & features) != features) {
7553 unsupported_ss << "the monitor cluster";
7554 ++unsupported_count;
7555 }
7556
7557 set<int32_t> up_osds;
7558 osdmap.get_up_osds(up_osds);
7559 for (set<int32_t>::iterator it = up_osds.begin();
7560 it != up_osds.end(); ++it) {
7561 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7562 if ((xi.features & features) != features) {
7563 if (unsupported_count > 0)
7564 unsupported_ss << ", ";
7565 unsupported_ss << "osd." << *it;
7566 unsupported_count ++;
7567 }
7568 }
7569
7570 if (unsupported_count > 0) {
7571 ss << "features " << features << " unsupported by: "
7572 << unsupported_ss.str();
7573 return -ENOTSUP;
7574 }
7575
7576 // check pending osd state, too!
7577 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7578 pending_inc.new_xinfo.begin();
7579 p != pending_inc.new_xinfo.end(); ++p) {
7580 const osd_xinfo_t &xi = p->second;
7581 if ((xi.features & features) != features) {
7582 dout(10) << __func__ << " pending osd." << p->first
7583 << " features are insufficient; retry" << dendl;
7584 return -EAGAIN;
7585 }
7586 }
7587
7588 return 0;
7589 }
7590
7591 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7592 stringstream& ss)
7593 {
7594 OSDMap::Incremental new_pending = pending_inc;
7595 encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
7596 OSDMap newmap;
7597 newmap.deepish_copy_from(osdmap);
7598 newmap.apply_incremental(new_pending);
7599
7600 // client compat
7601 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7602 auto mv = newmap.get_min_compat_client();
7603 if (mv > newmap.require_min_compat_client) {
7604 ss << "new crush map requires client version " << mv
7605 << " but require_min_compat_client is "
7606 << newmap.require_min_compat_client;
7607 return false;
7608 }
7609 }
7610
7611 // osd compat
7612 uint64_t features =
7613 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7614 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7615 stringstream features_ss;
7616 int r = check_cluster_features(features, features_ss);
7617 if (r) {
7618 ss << "Could not change CRUSH: " << features_ss.str();
7619 return false;
7620 }
7621
7622 return true;
7623 }
7624
7625 bool OSDMonitor::erasure_code_profile_in_use(
7626 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7627 const string &profile,
7628 ostream *ss)
7629 {
7630 bool found = false;
7631 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7632 p != pools.end();
7633 ++p) {
7634 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7635 *ss << osdmap.pool_name[p->first] << " ";
7636 found = true;
7637 }
7638 }
7639 if (found) {
7640 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7641 }
7642 return found;
7643 }
7644
7645 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7646 map<string,string> *erasure_code_profile_map,
7647 ostream *ss)
7648 {
7649 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7650 get_json_str_map,
7651 *ss,
7652 erasure_code_profile_map,
7653 true);
7654 if (r)
7655 return r;
7656 ceph_assert((*erasure_code_profile_map).count("plugin"));
7657 string default_plugin = (*erasure_code_profile_map)["plugin"];
7658 map<string,string> user_map;
7659 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7660 i != erasure_code_profile.end();
7661 ++i) {
7662 size_t equal = i->find('=');
7663 if (equal == string::npos) {
7664 user_map[*i] = string();
7665 (*erasure_code_profile_map)[*i] = string();
7666 } else {
7667 const string key = i->substr(0, equal);
7668 equal++;
7669 const string value = i->substr(equal);
7670 if (key.find("ruleset-") == 0) {
7671 *ss << "property '" << key << "' is no longer supported; try "
7672 << "'crush-" << key.substr(8) << "' instead";
7673 return -EINVAL;
7674 }
7675 user_map[key] = value;
7676 (*erasure_code_profile_map)[key] = value;
7677 }
7678 }
7679
7680 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7681 (*erasure_code_profile_map) = user_map;
7682
7683 return 0;
7684 }
7685
7686 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7687 const string &erasure_code_profile,
7688 uint8_t repl_size,
7689 unsigned *size, unsigned *min_size,
7690 ostream *ss)
7691 {
7692 int err = 0;
7693 bool set_min_size = false;
7694 switch (pool_type) {
7695 case pg_pool_t::TYPE_REPLICATED:
7696 if (osdmap.stretch_mode_enabled) {
7697 if (repl_size == 0)
7698 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7699 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7700 *ss << "prepare_pool_size: we are in stretch mode but size "
7701 << repl_size << " does not match!";
7702 return -EINVAL;
7703 }
7704 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7705 set_min_size = true;
7706 }
7707 if (repl_size == 0) {
7708 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7709 }
7710 *size = repl_size;
7711 if (!set_min_size)
7712 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7713 break;
7714 case pg_pool_t::TYPE_ERASURE:
7715 {
7716 if (osdmap.stretch_mode_enabled) {
7717 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7718 return -EINVAL;
7719 }
7720 ErasureCodeInterfaceRef erasure_code;
7721 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7722 if (err == 0) {
7723 *size = erasure_code->get_chunk_count();
7724 *min_size =
7725 erasure_code->get_data_chunk_count() +
7726 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7727 assert(*min_size <= *size);
7728 assert(*min_size >= erasure_code->get_data_chunk_count());
7729 }
7730 }
7731 break;
7732 default:
7733 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7734 err = -EINVAL;
7735 break;
7736 }
7737 return err;
7738 }
7739
// Compute the stripe width for a new pool.  Replicated pools have no
// stripe width (left untouched).  Erasure pools: width = data_chunks *
// chunk_size, where the stripe unit comes from the profile (if set) or
// the osd_pool_erasure_code_stripe_unit config option.  Returns 0 on
// success or a negative errno (details in *ss).
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      // config default; overridden by the profile's own stripe_unit if set
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	// profile values were validated in normalize_profile()
	ceph_assert(err_str.empty());
      }
      // get_chunk_size() applies the plugin's alignment/padding
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7778
7779 int OSDMonitor::get_replicated_stretch_crush_rule()
7780 {
7781 /* we don't write down the stretch rule anywhere, so
7782 * we have to guess it. How? Look at all the pools
7783 * and count up how many times a given rule is used
7784 * on stretch pools and then return the one with
7785 * the most users!
7786 */
7787 map<int,int> rule_counts;
7788 for (const auto& pooli : osdmap.pools) {
7789 const pg_pool_t& p = pooli.second;
7790 if (p.is_replicated() && p.is_stretch_pool()) {
7791 if (!rule_counts.count(p.crush_rule)) {
7792 rule_counts[p.crush_rule] = 1;
7793 } else {
7794 ++rule_counts[p.crush_rule];
7795 }
7796 }
7797 }
7798
7799 if (rule_counts.empty()) {
7800 return -ENOENT;
7801 }
7802
7803 int most_used_count = 0;
7804 int most_used_rule = -1;
7805 for (auto i : rule_counts) {
7806 if (i.second > most_used_count) {
7807 most_used_rule = i.first;
7808 most_used_count = i.second;
7809 }
7810 }
7811 ceph_assert(most_used_count > 0);
7812 ceph_assert(most_used_rule >= 0);
7813 return most_used_rule;
7814 }
7815
// Resolve (or create) the crush rule for a new pool.
//
// If *crush_rule >= 0 on entry it is taken as an explicit rule id and
// only validated for existence.  Otherwise:
//  - replicated pools: use rule_name if given, else the stretch-mode
//    guess or the configured default replicated rule;
//  - erasure pools: create/find a rule for the EC profile; a freshly
//    created or still-pending rule returns -EAGAIN so the caller retries
//    after the rule commits, while an already-committed rule (-EEXIST)
//    is mapped to success.
// Returns 0 on success or a negative errno (details in *ss).
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule already committed: usable right now
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // explicit rule id supplied by the caller: just validate it
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7880
7881 int OSDMonitor::get_crush_rule(const string &rule_name,
7882 int *crush_rule,
7883 ostream *ss)
7884 {
7885 int ret;
7886 ret = osdmap.crush->get_rule_id(rule_name);
7887 if (ret != -ENOENT) {
7888 // found it, use it
7889 *crush_rule = ret;
7890 } else {
7891 CrushWrapper newcrush = _get_pending_crush();
7892
7893 ret = newcrush.get_rule_id(rule_name);
7894 if (ret != -ENOENT) {
7895 // found it, wait for it to be proposed
7896 dout(20) << __func__ << ": rule " << rule_name
7897 << " try again" << dendl;
7898 return -EAGAIN;
7899 } else {
7900 // Cannot find it , return error
7901 *ss << "specified rule " << rule_name << " doesn't exist";
7902 return ret;
7903 }
7904 }
7905 return 0;
7906 }
7907
7908 /*
7909 * Get the number of 'in' osds according to the crush_rule,
7910 */
7911 uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule)
7912 {
7913 set<int> out_osds;
7914 set<int> crush_in_osds;
7915 set<int> roots;
7916 CrushWrapper newcrush = _get_pending_crush();
7917 newcrush.find_takes_by_rule(crush_rule, &roots);
7918 for (auto root : roots) {
7919 const char *rootname = newcrush.get_item_name(root);
7920 set<int> crush_all_osds;
7921 newcrush.get_leaves(rootname, &crush_all_osds);
7922 std::set_difference(crush_all_osds.begin(), crush_all_osds.end(),
7923 out_osds.begin(), out_osds.end(),
7924 std::inserter(crush_in_osds, crush_in_osds.end()));
7925 }
7926 return crush_in_osds.size();
7927 }
7928
7929 int OSDMonitor::check_pg_num(int64_t pool,
7930 int pg_num,
7931 int size,
7932 int crush_rule,
7933 ostream *ss)
7934 {
7935 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7936 uint64_t projected = 0;
7937 uint32_t osd_num_by_crush = 0;
7938 set<int64_t> crush_pool_ids;
7939 if (pool < 0) {
7940 // a new pool
7941 projected += pg_num * size;
7942 }
7943
7944 osd_num_by_crush = get_osd_num_by_crush(crush_rule);
7945 osdmap.get_pool_ids_by_rule(crush_rule, &crush_pool_ids);
7946
7947 for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
7948 // Check only for pools affected by crush rule
7949 if (crush_pool_ids.contains(pool_id)) {
7950 if (pool_id == pool) {
7951 // Specified pool, use given pg_num and size values.
7952 projected += pg_num * size;
7953 } else {
7954 // Use pg_num_target for evaluating the projected pg num
7955 projected += pool_info.get_pg_num_target() * pool_info.get_size();
7956 }
7957 }
7958 }
7959 // assume min cluster size 3
7960 osd_num_by_crush = std::max(osd_num_by_crush, 3u);
7961 auto projected_pgs_per_osd = projected / osd_num_by_crush;
7962
7963 if (projected_pgs_per_osd > max_pgs_per_osd) {
7964 if (pool >= 0) {
7965 *ss << "pool id " << pool;
7966 }
7967 *ss << " pg_num " << pg_num
7968 << " size " << size
7969 << " for this pool would result in "
7970 << projected_pgs_per_osd
7971 << " cumulative PGs per OSD (" << projected
7972 << " total PG replicas on " << osd_num_by_crush
7973 << " 'in' root OSDs by crush rule) "
7974 << "which exceeds the mon_max_pg_per_osd "
7975 << "value of " << max_pgs_per_osd;
7976 return -ERANGE;
7977 }
7978 return 0;
7979 }
7980
7981 /**
7982 * @param name The name of the new pool
7983 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
7985 * @param pg_num The pg_num to use. If set to 0, will use the system default
7986 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7987 * @param pg_num_min min pg_num
7988 * @param pg_num_max max pg_num
7989 * @param repl_size Replication factor, or 0 for default
7990 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7991 * @param pool_type TYPE_ERASURE, or TYPE_REP
7992 * @param expected_num_objects expected number of objects on the pool
7993 * @param fast_read fast read type.
7994 * @param pg_autoscale_mode autoscale mode, one of on, off, warn
7995 * @param bool bulk indicates whether pool should be a bulk pool
7996 * @param bool crimson indicates whether pool is a crimson pool
7997 * @param ss human readable error message, if any.
7998 *
7999 * @return 0 on success, negative errno on failure.
8000 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 string pg_autoscale_mode,
				 bool bulk,
				 bool crimson,
				 ostream *ss)
{
  if (crimson && pg_autoscale_mode.empty()) {
    // default pg_autoscale_mode to off for crimson, we'll error out below if
    // the user tried to actually set pg_autoscale_mode to something other than
    // "off"
    pg_autoscale_mode = "off";
  }

  // A pool must have a non-empty name.
  if (name.length() == 0)
    return -EINVAL;

  // Fill in pg_num from config when not given: when autoscaling is "on" the
  // initial pg_num is 1 (the autoscaler grows it later), otherwise use the
  // configured default.
  if (pg_num == 0) {
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
        return mode == "on" ? 1 : pg_num;
      };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  // pgp_num defaults to the configured value, falling back to pg_num.
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  // Sanity-check pg_num/pgp_num against configured bounds.
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }

  if (crimson) {
    /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
     * be static. User must also have specified set-allow-crimson */
    const auto *suffix = " (--crimson specified or osd_pool_default_crimson set)";
    if (pool_type != pg_pool_t::TYPE_REPLICATED) {
      *ss << "crimson-osd only supports replicated pools" << suffix;
      return -EINVAL;
    } else if (pg_autoscale_mode != "off") {
      *ss << "crimson-osd does not support changing pg_num or pgp_num, "
	  << "pg_autoscale_mode must be set to 'off'" << suffix;
      return -EINVAL;
    } else if (!osdmap.get_allow_crimson()) {
      *ss << "set-allow-crimson must be set to create a pool with the "
	  << "crimson flag" << suffix;
      return -EINVAL;
    }
  }

  // fast_read only makes sense for erasure-coded pools.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // Resolve/create the crush rule for the new pool (may return -EAGAIN
  // while a newly created rule waits to be proposed).
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Derive size/min_size from repl_size or the erasure code profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Optional smoke test: run a small forked crush mapping test against the
  // pending crush map to catch rules that cannot place data.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(cct, g_conf()->mon_lease);
    dout(10) << __func__ << " crush test_with_fork tester created " << dendl;
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  // Reject if the new pool would exceed mon_max_pg_per_osd (pool id -1
  // marks "new pool" so its PGs are counted on top of existing pools).
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  // The rule's type (replicated vs erasure) must match the pool type.
  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  // Stripe width comes from the erasure code profile (0 for replicated).
  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the effective fast_read flag for erasure-coded pools.
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // Idempotency: if a pool with this name is already pending creation in
  // this proposal, report success without creating a duplicate.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate the next pool id and a fresh pg_pool_t in the pending map.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  // Seed flags from config defaults plus the caller's choices.
  pi->flags = g_conf()->osd_pool_default_flags;
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // CREATING is cleared once the initial PGs have been instantiated.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;
  if (crimson) {
    pi->set_flag(pg_pool_t::FLAG_CRIMSON);
    // crimson does not support pg_num changes (see checks above).
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  }

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // Inherit stretch-mode peering constraints from the osdmap, halving
  // size/min_size when the cluster is degraded to a single zone.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Autoscale mode: start from the configured default, OFF if unknown;
  // an explicit per-pool mode (if valid) overrides this further below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Cap the initial actual pg_num; the full requested pg_num is recorded
  // as the target and the mgr grows the pool toward it.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  // PG_NUM_MIN/MAX options require sufficiently new OSDs to interpret them.
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier tunables, seeded from config defaults (ratios are stored in
  // micro units, hence the * 1000000).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  // Finally register the pool name in the pending incremental.
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8271
8272 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8273 {
8274 op->mark_osdmon_event(__func__);
8275 ostringstream ss;
8276 if (pending_inc.new_flags < 0)
8277 pending_inc.new_flags = osdmap.get_flags();
8278 pending_inc.new_flags |= flag;
8279 ss << OSDMap::get_flag_string(flag) << " is set";
8280 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8281 get_last_committed() + 1));
8282 return true;
8283 }
8284
8285 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8286 {
8287 op->mark_osdmon_event(__func__);
8288 ostringstream ss;
8289 if (pending_inc.new_flags < 0)
8290 pending_inc.new_flags = osdmap.get_flags();
8291 pending_inc.new_flags &= ~flag;
8292 ss << OSDMap::get_flag_string(flag) << " is unset";
8293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8294 get_last_committed() + 1));
8295 return true;
8296 }
8297
8298 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8299 stringstream& ss)
8300 {
8301 string poolstr;
8302 cmd_getval(cmdmap, "pool", poolstr);
8303 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8304 if (pool < 0) {
8305 ss << "unrecognized pool '" << poolstr << "'";
8306 return -ENOENT;
8307 }
8308 string var;
8309 cmd_getval(cmdmap, "var", var);
8310
8311 pg_pool_t p = *osdmap.get_pg_pool(pool);
8312 if (pending_inc.new_pools.count(pool))
8313 p = pending_inc.new_pools[pool];
8314
8315 // accept val as a json string in the normal case (current
8316 // generation monitor). parse out int or float values from the
8317 // string as needed. however, if it is not a string, try to pull
8318 // out an int, in case an older monitor with an older json schema is
8319 // forwarding a request.
8320 string val;
8321 string interr, floaterr;
8322 int64_t n = 0;
8323 double f = 0;
8324 int64_t uf = 0; // micro-f
8325 cmd_getval(cmdmap, "val", val);
8326
8327 auto si_options = {
8328 "target_max_objects"
8329 };
8330 auto iec_options = {
8331 "target_max_bytes",
8332 "target_size_bytes",
8333 "compression_max_blob_size",
8334 "compression_min_blob_size",
8335 "csum_max_block",
8336 "csum_min_block",
8337 };
8338 if (count(begin(si_options), end(si_options), var)) {
8339 n = strict_si_cast<int64_t>(val, &interr);
8340 } else if (count(begin(iec_options), end(iec_options), var)) {
8341 n = strict_iec_cast<int64_t>(val, &interr);
8342 } else {
8343 // parse string as both int and float; different fields use different types.
8344 n = strict_strtoll(val.c_str(), 10, &interr);
8345 f = strict_strtod(val.c_str(), &floaterr);
8346 uf = llrintl(f * (double)1000000.0);
8347 }
8348
8349 if (!p.is_tier() &&
8350 (var == "hit_set_type" || var == "hit_set_period" ||
8351 var == "hit_set_count" || var == "hit_set_fpp" ||
8352 var == "target_max_objects" || var == "target_max_bytes" ||
8353 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8354 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8355 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8356 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8357 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8358 return -EACCES;
8359 }
8360
8361 if (var == "size") {
8362 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8363 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8364 return -EPERM;
8365 }
8366 if (p.type == pg_pool_t::TYPE_ERASURE) {
8367 ss << "can not change the size of an erasure-coded pool";
8368 return -ENOTSUP;
8369 }
8370 if (interr.length()) {
8371 ss << "error parsing integer value '" << val << "': " << interr;
8372 return -EINVAL;
8373 }
8374 if (n <= 0 || n > 10) {
8375 ss << "pool size must be between 1 and 10";
8376 return -EINVAL;
8377 }
8378 if (n == 1) {
8379 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8380 ss << "configuring pool size as 1 is disabled by default.";
8381 return -EPERM;
8382 }
8383 bool sure = false;
8384 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8385 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8386 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8387 "pass the flag --yes-i-really-mean-it.";
8388 return -EPERM;
8389 }
8390 }
8391 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8392 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8393 return -EINVAL;
8394 }
8395 if (n > p.size) {
8396 // only when increasing pool size
8397 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8398 if (r < 0) {
8399 return r;
8400 }
8401 }
8402 p.size = n;
8403 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8404 } else if (var == "min_size") {
8405 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8406 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8407 return -EPERM;
8408 }
8409 if (interr.length()) {
8410 ss << "error parsing integer value '" << val << "': " << interr;
8411 return -EINVAL;
8412 }
8413
8414 if (p.type != pg_pool_t::TYPE_ERASURE) {
8415 if (n < 1 || n > p.size) {
8416 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8417 return -EINVAL;
8418 }
8419 } else {
8420 ErasureCodeInterfaceRef erasure_code;
8421 int k;
8422 stringstream tmp;
8423 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8424 if (err == 0) {
8425 k = erasure_code->get_data_chunk_count();
8426 } else {
8427 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8428 return err;
8429 }
8430
8431 if (n < k || n > p.size) {
8432 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8433 return -EINVAL;
8434 }
8435 }
8436 p.min_size = n;
8437 } else if (var == "pg_num_actual") {
8438 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8439 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8440 return -EPERM;
8441 }
8442 if (interr.length()) {
8443 ss << "error parsing integer value '" << val << "': " << interr;
8444 return -EINVAL;
8445 }
8446 if (n == (int)p.get_pg_num()) {
8447 return 0;
8448 }
8449 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8450 ss << "'pg_num' must be greater than 0 and less than or equal to "
8451 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8452 << " (you may adjust 'mon max pool pg num' for higher values)";
8453 return -ERANGE;
8454 }
8455 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8456 ss << "cannot adjust pg_num while initial PGs are being created";
8457 return -EBUSY;
8458 }
8459 if (n > (int)p.get_pg_num()) {
8460 if (p.get_pg_num() != p.get_pg_num_pending()) {
8461 // force pre-nautilus clients to resend their ops, since they
8462 // don't understand pg_num_pending changes form a new interval
8463 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8464 }
8465 p.set_pg_num(n);
8466 } else {
8467 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8468 ss << "nautilus OSDs are required to adjust pg_num_pending";
8469 return -EPERM;
8470 }
8471 if (n < (int)p.get_pgp_num()) {
8472 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8473 return -EINVAL;
8474 }
8475 if (n < (int)p.get_pg_num() - 1) {
8476 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8477 << ") - 1; only single pg decrease is currently supported";
8478 return -EINVAL;
8479 }
8480 p.set_pg_num_pending(n);
8481 // force pre-nautilus clients to resend their ops, since they
8482 // don't understand pg_num_pending changes form a new interval
8483 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8484 }
8485 // force pre-luminous clients to resend their ops, since they
8486 // don't understand that split PGs now form a new interval.
8487 p.last_force_op_resend_preluminous = pending_inc.epoch;
8488 } else if (var == "pg_num") {
8489 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8490 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8491 return -EPERM;
8492 }
8493 if (interr.length()) {
8494 ss << "error parsing integer value '" << val << "': " << interr;
8495 return -EINVAL;
8496 }
8497 if (n == (int)p.get_pg_num_target()) {
8498 return 0;
8499 }
8500 if (n <= 0 || static_cast<uint64_t>(n) >
8501 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8502 ss << "'pg_num' must be greater than 0 and less than or equal to "
8503 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8504 << " (you may adjust 'mon max pool pg num' for higher values)";
8505 return -ERANGE;
8506 }
8507 if (n > (int)p.get_pg_num_target()) {
8508 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8509 if (r) {
8510 return r;
8511 }
8512 bool force = false;
8513 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8514 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8515 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8516 return -EPERM;
8517 }
8518 } else {
8519 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8520 ss << "nautilus OSDs are required to decrease pg_num";
8521 return -EPERM;
8522 }
8523 }
8524 int64_t pg_min = 0, pg_max = 0;
8525 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8526 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8527 if (pg_min && n < pg_min) {
8528 ss << "specified pg_num " << n
8529 << " < pg_num_min " << pg_min;
8530 return -EINVAL;
8531 }
8532 if (pg_max && n > pg_max) {
8533 ss << "specified pg_num " << n
8534 << " < pg_num_max " << pg_max;
8535 return -EINVAL;
8536 }
8537 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8538 // pre-nautilus osdmap format; increase pg_num directly
8539 assert(n > (int)p.get_pg_num());
8540 // force pre-nautilus clients to resend their ops, since they
8541 // don't understand pg_num_target changes form a new interval
8542 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8543 // force pre-luminous clients to resend their ops, since they
8544 // don't understand that split PGs now form a new interval.
8545 p.last_force_op_resend_preluminous = pending_inc.epoch;
8546 p.set_pg_num(n);
8547 } else {
8548 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8549 // make pgp_num track pg_num if it already matches. if it is set
8550 // differently, leave it different and let the user control it
8551 // manually.
8552 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8553 p.set_pgp_num_target(n);
8554 }
8555 p.set_pg_num_target(n);
8556 }
8557 } else if (var == "pgp_num_actual") {
8558 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8559 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8560 return -EPERM;
8561 }
8562 if (interr.length()) {
8563 ss << "error parsing integer value '" << val << "': " << interr;
8564 return -EINVAL;
8565 }
8566 if (n <= 0) {
8567 ss << "specified pgp_num must > 0, but you set to " << n;
8568 return -EINVAL;
8569 }
8570 if (n > (int)p.get_pg_num()) {
8571 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8572 return -EINVAL;
8573 }
8574 if (n > (int)p.get_pg_num_pending()) {
8575 ss << "specified pgp_num " << n
8576 << " > pg_num_pending " << p.get_pg_num_pending();
8577 return -EINVAL;
8578 }
8579 p.set_pgp_num(n);
8580 } else if (var == "pgp_num") {
8581 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8582 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8583 return -EPERM;
8584 }
8585 if (interr.length()) {
8586 ss << "error parsing integer value '" << val << "': " << interr;
8587 return -EINVAL;
8588 }
8589 if (n <= 0) {
8590 ss << "specified pgp_num must > 0, but you set to " << n;
8591 return -EINVAL;
8592 }
8593 if (n > (int)p.get_pg_num_target()) {
8594 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8595 return -EINVAL;
8596 }
8597 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8598 // pre-nautilus osdmap format; increase pgp_num directly
8599 p.set_pgp_num(n);
8600 } else {
8601 p.set_pgp_num_target(n);
8602 }
8603 } else if (var == "pg_autoscale_mode") {
8604 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8605 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8606 ss << "specified invalid mode " << val;
8607 return -EINVAL;
8608 }
8609 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8610 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8611 return -EINVAL;
8612 }
8613 p.pg_autoscale_mode = m;
8614 } else if (var == "crush_rule") {
8615 int id = osdmap.crush->get_rule_id(val);
8616 if (id == -ENOENT) {
8617 ss << "crush rule " << val << " does not exist";
8618 return -ENOENT;
8619 }
8620 if (id < 0) {
8621 ss << cpp_strerror(id);
8622 return -ENOENT;
8623 }
8624 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8625 ss << "crush rule " << id << " type does not match pool";
8626 return -EINVAL;
8627 }
8628 p.crush_rule = id;
8629 } else if (var == "nodelete" || var == "nopgchange" ||
8630 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8631 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8632 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8633 // make sure we only compare against 'n' if we didn't receive a string
8634 if (val == "true" || (interr.empty() && n == 1)) {
8635 p.set_flag(flag);
8636 } else if (val == "false" || (interr.empty() && n == 0)) {
8637 if (flag == pg_pool_t::FLAG_NOPGCHANGE && p.is_crimson()) {
8638 ss << "cannot clear FLAG_NOPGCHANGE on a crimson pool";
8639 return -EINVAL;
8640 }
8641 p.unset_flag(flag);
8642 } else {
8643 ss << "expecting value 'true', 'false', '0', or '1'";
8644 return -EINVAL;
8645 }
8646 } else if (var == "eio") {
8647 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8648
8649 // make sure we only compare against 'n' if we didn't receive a string
8650 if (val == "true" || (interr.empty() && n == 1)) {
8651 p.set_flag(flag);
8652 } else if (val == "false" || (interr.empty() && n == 0)) {
8653 p.unset_flag(flag);
8654 } else {
8655 ss << "expecting value 'true', 'false', '0', or '1'";
8656 return -EINVAL;
8657 }
8658 } else if (var == "hashpspool") {
8659 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8660 bool force = false;
8661 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8662
8663 if (!force) {
8664 ss << "are you SURE? this will remap all placement groups in this pool,"
8665 " this triggers large data movement,"
8666 " pass --yes-i-really-mean-it if you really do.";
8667 return -EPERM;
8668 }
8669 // make sure we only compare against 'n' if we didn't receive a string
8670 if (val == "true" || (interr.empty() && n == 1)) {
8671 p.set_flag(flag);
8672 } else if (val == "false" || (interr.empty() && n == 0)) {
8673 p.unset_flag(flag);
8674 } else {
8675 ss << "expecting value 'true', 'false', '0', or '1'";
8676 return -EINVAL;
8677 }
8678 } else if (var == "hit_set_type") {
8679 if (val == "none")
8680 p.hit_set_params = HitSet::Params();
8681 else {
8682 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8683 if (err)
8684 return err;
8685 if (val == "bloom") {
8686 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8687 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8688 p.hit_set_params = HitSet::Params(bsp);
8689 } else if (val == "explicit_hash")
8690 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8691 else if (val == "explicit_object")
8692 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8693 else {
8694 ss << "unrecognized hit_set type '" << val << "'";
8695 return -EINVAL;
8696 }
8697 }
8698 } else if (var == "hit_set_period") {
8699 if (interr.length()) {
8700 ss << "error parsing integer value '" << val << "': " << interr;
8701 return -EINVAL;
8702 } else if (n < 0) {
8703 ss << "hit_set_period should be non-negative";
8704 return -EINVAL;
8705 }
8706 p.hit_set_period = n;
8707 } else if (var == "hit_set_count") {
8708 if (interr.length()) {
8709 ss << "error parsing integer value '" << val << "': " << interr;
8710 return -EINVAL;
8711 } else if (n < 0) {
8712 ss << "hit_set_count should be non-negative";
8713 return -EINVAL;
8714 }
8715 p.hit_set_count = n;
8716 } else if (var == "hit_set_fpp") {
8717 if (floaterr.length()) {
8718 ss << "error parsing floating point value '" << val << "': " << floaterr;
8719 return -EINVAL;
8720 } else if (f < 0 || f > 1.0) {
8721 ss << "hit_set_fpp should be in the range 0..1";
8722 return -EINVAL;
8723 }
8724 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8725 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8726 return -EINVAL;
8727 }
8728 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8729 bloomp->set_fpp(f);
8730 } else if (var == "use_gmt_hitset") {
8731 if (val == "true" || (interr.empty() && n == 1)) {
8732 p.use_gmt_hitset = true;
8733 } else {
8734 ss << "expecting value 'true' or '1'";
8735 return -EINVAL;
8736 }
8737 } else if (var == "allow_ec_overwrites") {
8738 if (!p.is_erasure()) {
8739 ss << "ec overwrites can only be enabled for an erasure coded pool";
8740 return -EINVAL;
8741 }
8742 stringstream err;
8743 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8744 !is_pool_currently_all_bluestore(pool, p, &err)) {
8745 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8746 return -EINVAL;
8747 }
8748 if (val == "true" || (interr.empty() && n == 1)) {
8749 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8750 } else if (val == "false" || (interr.empty() && n == 0)) {
8751 ss << "ec overwrites cannot be disabled once enabled";
8752 return -EINVAL;
8753 } else {
8754 ss << "expecting value 'true', 'false', '0', or '1'";
8755 return -EINVAL;
8756 }
8757 } else if (var == "target_max_objects") {
8758 if (interr.length()) {
8759 ss << "error parsing int '" << val << "': " << interr;
8760 return -EINVAL;
8761 }
8762 p.target_max_objects = n;
8763 } else if (var == "target_max_bytes") {
8764 if (interr.length()) {
8765 ss << "error parsing int '" << val << "': " << interr;
8766 return -EINVAL;
8767 }
8768 p.target_max_bytes = n;
8769 } else if (var == "cache_target_dirty_ratio") {
8770 if (floaterr.length()) {
8771 ss << "error parsing float '" << val << "': " << floaterr;
8772 return -EINVAL;
8773 }
8774 if (f < 0 || f > 1.0) {
8775 ss << "value must be in the range 0..1";
8776 return -ERANGE;
8777 }
8778 p.cache_target_dirty_ratio_micro = uf;
8779 } else if (var == "cache_target_dirty_high_ratio") {
8780 if (floaterr.length()) {
8781 ss << "error parsing float '" << val << "': " << floaterr;
8782 return -EINVAL;
8783 }
8784 if (f < 0 || f > 1.0) {
8785 ss << "value must be in the range 0..1";
8786 return -ERANGE;
8787 }
8788 p.cache_target_dirty_high_ratio_micro = uf;
8789 } else if (var == "cache_target_full_ratio") {
8790 if (floaterr.length()) {
8791 ss << "error parsing float '" << val << "': " << floaterr;
8792 return -EINVAL;
8793 }
8794 if (f < 0 || f > 1.0) {
8795 ss << "value must be in the range 0..1";
8796 return -ERANGE;
8797 }
8798 p.cache_target_full_ratio_micro = uf;
8799 } else if (var == "cache_min_flush_age") {
8800 if (interr.length()) {
8801 ss << "error parsing int '" << val << "': " << interr;
8802 return -EINVAL;
8803 }
8804 p.cache_min_flush_age = n;
8805 } else if (var == "cache_min_evict_age") {
8806 if (interr.length()) {
8807 ss << "error parsing int '" << val << "': " << interr;
8808 return -EINVAL;
8809 }
8810 p.cache_min_evict_age = n;
8811 } else if (var == "min_read_recency_for_promote") {
8812 if (interr.length()) {
8813 ss << "error parsing integer value '" << val << "': " << interr;
8814 return -EINVAL;
8815 }
8816 p.min_read_recency_for_promote = n;
8817 } else if (var == "hit_set_grade_decay_rate") {
8818 if (interr.length()) {
8819 ss << "error parsing integer value '" << val << "': " << interr;
8820 return -EINVAL;
8821 }
8822 if (n > 100 || n < 0) {
8823 ss << "value out of range,valid range is 0 - 100";
8824 return -EINVAL;
8825 }
8826 p.hit_set_grade_decay_rate = n;
8827 } else if (var == "hit_set_search_last_n") {
8828 if (interr.length()) {
8829 ss << "error parsing integer value '" << val << "': " << interr;
8830 return -EINVAL;
8831 }
8832 if (n > p.hit_set_count || n < 0) {
8833 ss << "value out of range,valid range is 0 - hit_set_count";
8834 return -EINVAL;
8835 }
8836 p.hit_set_search_last_n = n;
8837 } else if (var == "min_write_recency_for_promote") {
8838 if (interr.length()) {
8839 ss << "error parsing integer value '" << val << "': " << interr;
8840 return -EINVAL;
8841 }
8842 p.min_write_recency_for_promote = n;
8843 } else if (var == "fast_read") {
8844 if (p.is_replicated()) {
8845 ss << "fast read is not supported in replication pool";
8846 return -EINVAL;
8847 }
8848 if (val == "true" || (interr.empty() && n == 1)) {
8849 p.fast_read = true;
8850 } else if (val == "false" || (interr.empty() && n == 0)) {
8851 p.fast_read = false;
8852 } else {
8853 ss << "expecting value 'true', 'false', '0', or '1'";
8854 return -EINVAL;
8855 }
8856 } else if (pool_opts_t::is_opt_name(var)) {
8857 bool unset = val == "unset";
8858 if (var == "compression_mode") {
8859 if (!unset) {
8860 auto cmode = Compressor::get_comp_mode_type(val);
8861 if (!cmode) {
8862 ss << "unrecognized compression mode '" << val << "'";
8863 return -EINVAL;
8864 }
8865 }
8866 } else if (var == "compression_algorithm") {
8867 if (!unset) {
8868 auto alg = Compressor::get_comp_alg_type(val);
8869 if (!alg) {
8870 ss << "unrecognized compression_algorithm '" << val << "'";
8871 return -EINVAL;
8872 }
8873 }
8874 } else if (var == "compression_required_ratio") {
8875 if (floaterr.length()) {
8876 ss << "error parsing float value '" << val << "': " << floaterr;
8877 return -EINVAL;
8878 }
8879 if (f < 0 || f > 1) {
8880 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8881 return -EINVAL;
8882 }
8883 } else if (var == "csum_type") {
8884 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8885 if (t < 0 ) {
8886 ss << "unrecognized csum_type '" << val << "'";
8887 return -EINVAL;
8888 }
8889 //preserve csum_type numeric value
8890 n = t;
8891 interr.clear();
8892 } else if (var == "compression_max_blob_size" ||
8893 var == "compression_min_blob_size" ||
8894 var == "csum_max_block" ||
8895 var == "csum_min_block") {
8896 if (interr.length()) {
8897 ss << "error parsing int value '" << val << "': " << interr;
8898 return -EINVAL;
8899 }
8900 } else if (var == "fingerprint_algorithm") {
8901 if (!unset) {
8902 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8903 if (!alg) {
8904 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8905 return -EINVAL;
8906 }
8907 }
8908 } else if (var == "target_size_bytes") {
8909 if (interr.length()) {
8910 ss << "error parsing unit value '" << val << "': " << interr;
8911 return -EINVAL;
8912 }
8913 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8914 ss << "must set require_osd_release to nautilus or "
8915 << "later before setting target_size_bytes";
8916 return -EINVAL;
8917 }
8918 } else if (var == "target_size_ratio") {
8919 if (f < 0.0) {
8920 ss << "target_size_ratio cannot be negative";
8921 return -EINVAL;
8922 }
8923 } else if (var == "pg_num_min") {
8924 if (interr.length()) {
8925 ss << "error parsing int value '" << val << "': " << interr;
8926 return -EINVAL;
8927 }
8928 if (n > (int)p.get_pg_num_target()) {
8929 ss << "specified pg_num_min " << n
8930 << " > pg_num " << p.get_pg_num_target();
8931 return -EINVAL;
8932 }
8933 } else if (var == "pg_num_max") {
8934 if (interr.length()) {
8935 ss << "error parsing int value '" << val << "': " << interr;
8936 return -EINVAL;
8937 }
8938 if (n && n < (int)p.get_pg_num_target()) {
8939 ss << "specified pg_num_max " << n
8940 << " < pg_num " << p.get_pg_num_target();
8941 return -EINVAL;
8942 }
8943 } else if (var == "recovery_priority") {
8944 if (interr.length()) {
8945 ss << "error parsing int value '" << val << "': " << interr;
8946 return -EINVAL;
8947 }
8948 if (!g_conf()->debug_allow_any_pool_priority) {
8949 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8950 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8951 << " and " << OSD_POOL_PRIORITY_MAX;
8952 return -EINVAL;
8953 }
8954 }
8955 } else if (var == "pg_autoscale_bias") {
8956 if (f < 0.0 || f > 1000.0) {
8957 ss << "pg_autoscale_bias must be between 0 and 1000";
8958 return -EINVAL;
8959 }
8960 } else if (var == "dedup_tier") {
8961 if (interr.empty()) {
8962 ss << "expecting value 'pool name'";
8963 return -EINVAL;
8964 }
8965 // Current base tier in dedup does not support ec pool
8966 if (p.is_erasure()) {
8967 ss << "pool '" << poolstr
8968 << "' is an ec pool, which cannot be a base tier";
8969 return -ENOTSUP;
8970 }
8971 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8972 if (lowtierpool_id < 0) {
8973 ss << "unrecognized pool '" << val << "'";
8974 return -ENOENT;
8975 }
8976 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8977 ceph_assert(tp);
8978 n = lowtierpool_id;
8979 // The original input is string (pool name), but we convert it to int64_t.
8980 // So, clear interr
8981 interr.clear();
8982 } else if (var == "dedup_chunk_algorithm") {
8983 if (!unset) {
8984 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8985 if (!alg) {
8986 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8987 return -EINVAL;
8988 }
8989 }
8990 } else if (var == "dedup_cdc_chunk_size") {
8991 if (interr.length()) {
8992 ss << "error parsing int value '" << val << "': " << interr;
8993 return -EINVAL;
8994 }
8995 }
8996
8997 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8998 switch (desc.type) {
8999 case pool_opts_t::STR:
9000 if (unset) {
9001 p.opts.unset(desc.key);
9002 } else {
9003 p.opts.set(desc.key, static_cast<std::string>(val));
9004 }
9005 break;
9006 case pool_opts_t::INT:
9007 if (interr.length()) {
9008 ss << "error parsing integer value '" << val << "': " << interr;
9009 return -EINVAL;
9010 }
9011 if (n == 0) {
9012 p.opts.unset(desc.key);
9013 } else {
9014 p.opts.set(desc.key, static_cast<int64_t>(n));
9015 }
9016 break;
9017 case pool_opts_t::DOUBLE:
9018 if (floaterr.length()) {
9019 ss << "error parsing floating point value '" << val << "': " << floaterr;
9020 return -EINVAL;
9021 }
9022 if (f == 0) {
9023 p.opts.unset(desc.key);
9024 } else {
9025 p.opts.set(desc.key, static_cast<double>(f));
9026 }
9027 break;
9028 default:
9029 ceph_assert(!"unknown type");
9030 }
9031 } else {
9032 ss << "unrecognized variable '" << var << "'";
9033 return -EINVAL;
9034 }
9035 if (val != "unset") {
9036 ss << "set pool " << pool << " " << var << " to " << val;
9037 } else {
9038 ss << "unset pool " << pool << " " << var;
9039 }
9040 p.last_change = pending_inc.epoch;
9041 pending_inc.new_pools[pool] = p;
9042 return 0;
9043 }
9044
9045 int OSDMonitor::prepare_command_pool_application(const string &prefix,
9046 const cmdmap_t& cmdmap,
9047 stringstream& ss)
9048 {
9049 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
9050 }
9051
9052 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
9053 const cmdmap_t& cmdmap,
9054 stringstream& ss,
9055 bool *modified)
9056 {
9057 return _command_pool_application(prefix, cmdmap, ss, modified, false);
9058 }
9059
9060
9061 /**
9062 * Common logic for preprocess and prepare phases of pool application
9063 * tag commands. In preprocess mode we're only detecting invalid
9064 * commands, and determining whether it was a modification or a no-op.
9065 * In prepare mode we're actually updating the pending state.
9066 */
9067 int OSDMonitor::_command_pool_application(const string &prefix,
9068 const cmdmap_t& cmdmap,
9069 stringstream& ss,
9070 bool *modified,
9071 bool preparing)
9072 {
9073 string pool_name;
9074 cmd_getval(cmdmap, "pool", pool_name);
9075 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
9076 if (pool < 0) {
9077 ss << "unrecognized pool '" << pool_name << "'";
9078 return -ENOENT;
9079 }
9080
9081 pg_pool_t p = *osdmap.get_pg_pool(pool);
9082 if (preparing) {
9083 if (pending_inc.new_pools.count(pool)) {
9084 p = pending_inc.new_pools[pool];
9085 }
9086 }
9087
9088 string app;
9089 cmd_getval(cmdmap, "app", app);
9090 bool app_exists = (p.application_metadata.count(app) > 0);
9091
9092 string key;
9093 cmd_getval(cmdmap, "key", key);
9094 if (key == "all") {
9095 ss << "key cannot be 'all'";
9096 return -EINVAL;
9097 }
9098
9099 string value;
9100 cmd_getval(cmdmap, "value", value);
9101 if (value == "all") {
9102 ss << "value cannot be 'all'";
9103 return -EINVAL;
9104 }
9105
9106 if (boost::algorithm::ends_with(prefix, "enable")) {
9107 if (app.empty()) {
9108 ss << "application name must be provided";
9109 return -EINVAL;
9110 }
9111
9112 if (p.is_tier()) {
9113 ss << "application must be enabled on base tier";
9114 return -EINVAL;
9115 }
9116
9117 bool force = false;
9118 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9119
9120 if (!app_exists && !p.application_metadata.empty() && !force) {
9121 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9122 << "application; pass --yes-i-really-mean-it to proceed anyway";
9123 return -EPERM;
9124 }
9125
9126 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9127 ss << "too many enabled applications on pool '" << pool_name << "'; "
9128 << "max " << MAX_POOL_APPLICATIONS;
9129 return -EINVAL;
9130 }
9131
9132 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9133 ss << "application name '" << app << "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH;
9135 return -EINVAL;
9136 }
9137
9138 if (!app_exists) {
9139 p.application_metadata[app] = {};
9140 }
9141 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9142
9143 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9144 bool force = false;
9145 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9146
9147 if (!force) {
9148 ss << "Are you SURE? Disabling an application within a pool might result "
9149 << "in loss of application functionality; pass "
9150 << "--yes-i-really-mean-it to proceed anyway";
9151 return -EPERM;
9152 }
9153
9154 if (!app_exists) {
9155 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9156 << "'";
9157 return 0; // idempotent
9158 }
9159
9160 p.application_metadata.erase(app);
9161 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9162
9163 } else if (boost::algorithm::ends_with(prefix, "set")) {
9164 if (p.is_tier()) {
9165 ss << "application metadata must be set on base tier";
9166 return -EINVAL;
9167 }
9168
9169 if (!app_exists) {
9170 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9171 << "'";
9172 return -ENOENT;
9173 }
9174
9175 string key;
9176 cmd_getval(cmdmap, "key", key);
9177
9178 if (key.empty()) {
9179 ss << "key must be provided";
9180 return -EINVAL;
9181 }
9182
9183 auto &app_keys = p.application_metadata[app];
9184 if (app_keys.count(key) == 0 &&
9185 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9186 ss << "too many keys set for application '" << app << "' on pool '"
9187 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9188 return -EINVAL;
9189 }
9190
9191 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9192 ss << "key '" << app << "' too long; max length "
9193 << MAX_POOL_APPLICATION_LENGTH;
9194 return -EINVAL;
9195 }
9196
9197 string value;
9198 cmd_getval(cmdmap, "value", value);
9199 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9200 ss << "value '" << value << "' too long; max length "
9201 << MAX_POOL_APPLICATION_LENGTH;
9202 return -EINVAL;
9203 }
9204
9205 p.application_metadata[app][key] = value;
9206 ss << "set application '" << app << "' key '" << key << "' to '"
9207 << value << "' on pool '" << pool_name << "'";
9208 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9209 if (!app_exists) {
9210 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9211 << "'";
9212 return -ENOENT;
9213 }
9214
9215 string key;
9216 cmd_getval(cmdmap, "key", key);
9217 auto it = p.application_metadata[app].find(key);
9218 if (it == p.application_metadata[app].end()) {
9219 ss << "application '" << app << "' on pool '" << pool_name
9220 << "' does not have key '" << key << "'";
9221 return 0; // idempotent
9222 }
9223
9224 p.application_metadata[app].erase(it);
9225 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9226 << pool_name << "'";
9227 } else {
9228 ceph_abort();
9229 }
9230
9231 if (preparing) {
9232 p.last_change = pending_inc.epoch;
9233 pending_inc.new_pools[pool] = p;
9234 }
9235
9236 // Because we fell through this far, we didn't hit no-op cases,
9237 // so pool was definitely modified
9238 if (modified != nullptr) {
9239 *modified = true;
9240 }
9241
9242 return 0;
9243 }
9244
9245 int OSDMonitor::_prepare_command_osd_crush_remove(
9246 CrushWrapper &newcrush,
9247 int32_t id,
9248 int32_t ancestor,
9249 bool has_ancestor,
9250 bool unlink_only)
9251 {
9252 int err = 0;
9253
9254 if (has_ancestor) {
9255 err = newcrush.remove_item_under(cct, id, ancestor,
9256 unlink_only);
9257 } else {
9258 err = newcrush.remove_item(cct, id, unlink_only);
9259 }
9260 return err;
9261 }
9262
// Stage the (already modified) CRUSH map into the pending incremental,
// replacing any CRUSH change previously staged in pending_inc.crush.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
}
9268
9269 int OSDMonitor::prepare_command_osd_crush_remove(
9270 CrushWrapper &newcrush,
9271 int32_t id,
9272 int32_t ancestor,
9273 bool has_ancestor,
9274 bool unlink_only)
9275 {
9276 int err = _prepare_command_osd_crush_remove(
9277 newcrush, id, ancestor,
9278 has_ancestor, unlink_only);
9279
9280 if (err < 0)
9281 return err;
9282
9283 ceph_assert(err == 0);
9284 do_osd_crush_remove(newcrush);
9285
9286 return 0;
9287 }
9288
9289 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9290 {
9291 if (osdmap.is_up(id)) {
9292 return -EBUSY;
9293 }
9294
9295 pending_inc.new_state[id] = osdmap.get_state(id);
9296 pending_inc.new_uuid[id] = uuid_d();
9297 pending_metadata_rm.insert(id);
9298 pending_metadata.erase(id);
9299
9300 return 0;
9301 }
9302
9303 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9304 {
9305 ceph_assert(existing_id);
9306 *existing_id = -1;
9307
9308 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9309 if (!osdmap.exists(i) &&
9310 pending_inc.new_up_client.count(i) == 0 &&
9311 (pending_inc.new_state.count(i) == 0 ||
9312 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9313 *existing_id = i;
9314 return -1;
9315 }
9316 }
9317
9318 if (pending_inc.new_max_osd < 0) {
9319 return osdmap.get_max_osd();
9320 }
9321 return pending_inc.new_max_osd;
9322 }
9323
// Create (or re-identify) an OSD in the pending incremental.
//
// If `uuid` matches an existing OSD, that id is reused; if `id` is given it
// is honored; otherwise an id is allocated (possibly raising max_osd). The
// chosen id is returned through *new_id. If `device_class` is non-empty the
// CRUSH device class is staged as well (failure there is non-fatal).
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already known: validation guarantees id agrees (or was unset)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a hole in the id space; no max_osd change needed
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the device class for the new id in a copy of the pending CRUSH map
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id (covers the goto shortcut paths
  // above, which skip the allocation logic)
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9413
// Validate an (id, uuid) pair for `osd create` / `osd new`.
//
// Return value convention (note the sign!):
//   0        - ok to create
//   EEXIST   - POSITIVE: osd already exists with this uuid (idempotent case);
//              its id is returned via *existing_id
//   -EEXIST  - uuid already bound to a DIFFERENT id
//   -EAGAIN  - osd is about to exist (pending incremental)
//   -EINVAL  - id already in use with a different uuid
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9484
9485 int OSDMonitor::prepare_command_osd_create(
9486 const int32_t id,
9487 const uuid_d& uuid,
9488 int32_t* existing_id,
9489 stringstream& ss)
9490 {
9491 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9492 ceph_assert(existing_id);
9493 if (osdmap.is_destroyed(id)) {
9494 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9495 "instead.";
9496 return -EINVAL;
9497 }
9498
9499 if (uuid.is_zero()) {
9500 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9501 }
9502
9503 return validate_osd_create(id, uuid, true, existing_id, ss);
9504 }
9505
// Handle `osd new`: create a brand-new OSD or recreate a previously
// destroyed one, optionally registering cephx/lockbox secrets and a
// dm-crypt key.
//
// Returns 0 on success, POSITIVE EEXIST for a fully idempotent replay
// (nothing to change), or a negative errno on error. On success/idempotent
// return, the chosen osd id is written to `f` (json) or `ss` (plain).
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // skip the "id already exists" check when recreating a destroyed osd
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // _allocate_osd_id found a reusable hole instead of a fresh id
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the pair was supplied -- they must come together
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    // NOTE(review): new_state bits appear to act as a toggle mask here --
    // setting DESTROYED/UP below clears those flags (see the UP comment);
    // verify against OSDMap::Incremental semantics before relying on this.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9769
9770 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9771 {
9772 op->mark_osdmon_event(__func__);
9773 auto m = op->get_req<MMonCommand>();
9774 stringstream ss;
9775 cmdmap_t cmdmap;
9776 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9777 string rs = ss.str();
9778 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9779 return true;
9780 }
9781
9782 MonSession *session = op->get_session();
9783 if (!session) {
9784 derr << __func__ << " no session" << dendl;
9785 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9786 return true;
9787 }
9788
9789 return prepare_command_impl(op, cmdmap);
9790 }
9791
9792 static int parse_reweights(CephContext *cct,
9793 const cmdmap_t& cmdmap,
9794 const OSDMap& osdmap,
9795 map<int32_t, uint32_t>* weights)
9796 {
9797 string weights_str;
9798 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9799 return -EINVAL;
9800 }
9801 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9802 json_spirit::mValue json_value;
9803 if (!json_spirit::read(weights_str, json_value)) {
9804 return -EINVAL;
9805 }
9806 if (json_value.type() != json_spirit::obj_type) {
9807 return -EINVAL;
9808 }
9809 const auto obj = json_value.get_obj();
9810 try {
9811 for (auto& osd_weight : obj) {
9812 auto osd_id = std::stoi(osd_weight.first);
9813 if (!osdmap.exists(osd_id)) {
9814 return -ENOENT;
9815 }
9816 if (osd_weight.second.type() != json_spirit::str_type) {
9817 return -EINVAL;
9818 }
9819 auto weight = std::stoul(osd_weight.second.get_str());
9820 weights->insert({osd_id, weight});
9821 }
9822 } catch (const std::logic_error& e) {
9823 return -EINVAL;
9824 }
9825 return 0;
9826 }
9827
// Stage the destruction of osd.<id> into pending_inc: wipe its cephx /
// lockbox auth entities and its config-key store entries, then mark it
// DESTROYED with a zeroed uuid. Does NOT propose — the caller (e.g.
// `osd purge`) is responsible for proposing the pending map.
//
// Returns 0 on success (including the idempotent already-destroyed
// case), -ENOENT if the osd does not exist, or a negative error from
// auth validation. Requires paxos to be plugged by the caller.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  // the caller must have plugged paxos so our staged changes and theirs
  // are proposed as a single transaction.
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // "idempotent" here means the corresponding state is already gone, so
  // the matching do_osd_destroy() below can be skipped.
  bool idempotent_auth = false, idempotent_cks = false;

  // validate (but do not yet apply) removal of the osd's auth entities;
  // -ENOENT means they were already removed, which is fine.
  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  // same dance for the config-key (kv) entries; only -ENOENT is an
  // acceptable failure here.
  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  // both validations passed — from here on every step must succeed.
  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: mark destroyed and clear the uuid so the
  // id can be reused by a future `osd new`.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9899
// Stage a full purge of osd.<id>: remove it from crush, destroy its
// auth/config-key state (via prepare_command_osd_destroy), and remove
// it from the osdmap. Does NOT propose; requires paxos to be plugged
// and the osd to be down.
//
// Returns 0 on success, -ENOENT if the osd was already fully purged
// (idempotent replay), or a negative error if the crush removal or the
// destroy validation fails before any update is staged.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  // caller must have verified this; purging an up osd would corrupt state.
  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  // tracks whether this call might be a no-op replay of a previous purge
  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal (last arg false = don't apply yet).
  // -ENOENT means the osd is already gone from crush.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: destroy auth/config-key state and mark DESTROYED.
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      // destroy actually did work, so this cannot be a pure replay
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // nothing was left to do anywhere and the osd is gone: report replay.
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: apply the crush removal we validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9967
// Extract and validate the "pgid" argument from cmdmap. On success,
// pgid holds the parsed value and 0 is returned. On failure, ss holds a
// human-readable reason and -EINVAL (unparseable) or -ENOENT (pg does
// not exist in the committed osdmap) is returned.
//
// NOTE(review): `pgids` is taken *by value*, so the assignment at the
// bottom mutates a local copy and is never visible to the caller. If a
// caller relies on receiving the raw pgid string, this parameter likely
// needs to become a pointer/reference — confirm against call sites.
int OSDMonitor::parse_pgid(const cmdmap_t& cmdmap, stringstream &ss,
                           /* out */ pg_t &pgid, std::optional<string> pgids) {
  string pgidstr;
  if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
    ss << "unable to parse 'pgid' value '"
       << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
    return -EINVAL;
  }
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    return -EINVAL;
  }
  // validate against the committed map, not pending state
  if (!osdmap.pg_exists(pgid)) {
    ss << "pgid '" << pgid << "' does not exist";
    return -ENOENT;
  }
  if (pgids.has_value())
    pgids.value() = pgidstr;  // see NOTE(review) above: no effect for caller
  return 0;
}
9988
9989 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9990 const cmdmap_t& cmdmap)
9991 {
9992 op->mark_osdmon_event(__func__);
9993 auto m = op->get_req<MMonCommand>();
9994 bool ret = false;
9995 stringstream ss;
9996 string rs;
9997 bufferlist rdata;
9998 int err = 0;
9999
10000 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
10001 boost::scoped_ptr<Formatter> f(Formatter::create(format));
10002
10003 string prefix;
10004 cmd_getval(cmdmap, "prefix", prefix);
10005
10006 int64_t osdid;
10007 string osd_name;
10008 bool osdid_present = false;
10009 if (prefix != "osd pg-temp" &&
10010 prefix != "osd pg-upmap" &&
10011 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
10012 osdid_present = cmd_getval(cmdmap, "id", osdid);
10013 }
10014 if (osdid_present) {
10015 ostringstream oss;
10016 oss << "osd." << osdid;
10017 osd_name = oss.str();
10018 }
10019
10020 // Even if there's a pending state with changes that could affect
10021 // a command, considering that said state isn't yet committed, we
10022 // just don't care about those changes if the command currently being
10023 // handled acts as a no-op against the current committed state.
10024 // In a nutshell, we assume this command happens *before*.
10025 //
10026 // Let me make this clearer:
10027 //
10028 // - If we have only one client, and that client issues some
10029 // operation that would conflict with this operation but is
10030 // still on the pending state, then we would be sure that said
10031 // operation wouldn't have returned yet, so the client wouldn't
10032 // issue this operation (unless the client didn't wait for the
10033 // operation to finish, and that would be the client's own fault).
10034 //
10035 // - If we have more than one client, each client will observe
10036 // whatever is the state at the moment of the commit. So, if we
10037 // have two clients, one issuing an unlink and another issuing a
10038 // link, and if the link happens while the unlink is still on the
10039 // pending state, from the link's point-of-view this is a no-op.
10040 // If different clients are issuing conflicting operations and
10041 // they care about that, then the clients should make sure they
10042 // enforce some kind of concurrency mechanism -- from our
10043 // perspective that's what Douglas Adams would call an SEP.
10044 //
10045 // This should be used as a general guideline for most commands handled
10046 // in this function. Adapt as you see fit, but please bear in mind that
10047 // this is the expected behavior.
10048
10049
10050 if (prefix == "osd setcrushmap" ||
10051 (prefix == "osd crush set" && !osdid_present)) {
10052 if (pending_inc.crush.length()) {
10053 dout(10) << __func__ << " waiting for pending crush update " << dendl;
10054 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10055 return true;
10056 }
10057 dout(10) << "prepare_command setting new crush map" << dendl;
10058 bufferlist data(m->get_data());
10059 CrushWrapper crush;
10060 try {
10061 auto bl = data.cbegin();
10062 crush.decode(bl);
10063 }
10064 catch (const std::exception &e) {
10065 err = -EINVAL;
10066 ss << "Failed to parse crushmap: " << e.what();
10067 goto reply;
10068 }
10069
10070 int64_t prior_version = 0;
10071 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
10072 if (prior_version == osdmap.get_crush_version() - 1) {
10073 // see if we are a resend of the last update. this is imperfect
10074 // (multiple racing updaters may not both get reliable success)
10075 // but we expect crush updaters (via this interface) to be rare-ish.
10076 bufferlist current, proposed;
10077 osdmap.crush->encode(current, mon.get_quorum_con_features());
10078 crush.encode(proposed, mon.get_quorum_con_features());
10079 if (current.contents_equal(proposed)) {
10080 dout(10) << __func__
10081 << " proposed matches current and version equals previous"
10082 << dendl;
10083 err = 0;
10084 ss << osdmap.get_crush_version();
10085 goto reply;
10086 }
10087 }
10088 if (prior_version != osdmap.get_crush_version()) {
10089 err = -EPERM;
10090 ss << "prior_version " << prior_version << " != crush version "
10091 << osdmap.get_crush_version();
10092 goto reply;
10093 }
10094 }
10095
10096 if (!validate_crush_against_features(&crush, ss)) {
10097 err = -EINVAL;
10098 goto reply;
10099 }
10100
10101 err = osdmap.validate_crush_rules(&crush, &ss);
10102 if (err < 0) {
10103 goto reply;
10104 }
10105
10106 if (g_conf()->mon_osd_crush_smoke_test) {
10107 // sanity check: test some inputs to make sure this map isn't
10108 // totally broken
10109 dout(10) << " testing map" << dendl;
10110 stringstream ess;
10111 CrushTester tester(crush, ess);
10112 tester.set_min_x(0);
10113 tester.set_max_x(50);
10114 tester.set_num_rep(3); // arbitrary
10115 auto start = ceph::coarse_mono_clock::now();
10116 int r = tester.test_with_fork(cct, g_conf()->mon_lease);
10117 auto duration = ceph::coarse_mono_clock::now() - start;
10118 if (r < 0) {
10119 dout(10) << " tester.test_with_fork returns " << r
10120 << ": " << ess.str() << dendl;
10121 ss << "crush smoke test failed with " << r << ": " << ess.str();
10122 err = r;
10123 goto reply;
10124 }
10125 dout(10) << __func__ << " crush somke test duration: "
10126 << duration << ", result: " << ess.str() << dendl;
10127 }
10128
10129 pending_inc.crush = data;
10130 ss << osdmap.get_crush_version() + 1;
10131 goto update;
10132
10133 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10134 CrushWrapper newcrush = _get_pending_crush();
10135 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10136 int bid = -1 - b;
10137 if (newcrush.bucket_exists(bid) &&
10138 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10139 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10140 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10141 }
10142 }
10143 if (!validate_crush_against_features(&newcrush, ss)) {
10144 err = -EINVAL;
10145 goto reply;
10146 }
10147 pending_inc.crush.clear();
10148 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10149 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10150 get_last_committed() + 1));
10151 return true;
10152 } else if (prefix == "osd crush set-device-class") {
10153 string device_class;
10154 if (!cmd_getval(cmdmap, "class", device_class)) {
10155 err = -EINVAL; // no value!
10156 goto reply;
10157 }
10158
10159 bool stop = false;
10160 vector<string> idvec;
10161 cmd_getval(cmdmap, "ids", idvec);
10162 CrushWrapper newcrush = _get_pending_crush();
10163 set<int> updated;
10164 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10165 set<int> osds;
10166 // wildcard?
10167 if (j == 0 &&
10168 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10169 osdmap.get_all_osds(osds);
10170 stop = true;
10171 } else {
10172 // try traditional single osd way
10173 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10174 if (osd < 0) {
10175 // ss has reason for failure
10176 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10177 err = -EINVAL;
10178 continue;
10179 }
10180 osds.insert(osd);
10181 }
10182
10183 for (auto &osd : osds) {
10184 if (!osdmap.exists(osd)) {
10185 ss << "osd." << osd << " does not exist. ";
10186 continue;
10187 }
10188
10189 ostringstream oss;
10190 oss << "osd." << osd;
10191 string name = oss.str();
10192
10193 if (newcrush.get_max_devices() < osd + 1) {
10194 newcrush.set_max_devices(osd + 1);
10195 }
10196 string action;
10197 if (newcrush.item_exists(osd)) {
10198 action = "updating";
10199 } else {
10200 action = "creating";
10201 newcrush.set_item_name(osd, name);
10202 }
10203
10204 dout(5) << action << " crush item id " << osd << " name '" << name
10205 << "' device_class '" << device_class << "'"
10206 << dendl;
10207 err = newcrush.update_device_class(osd, device_class, name, &ss);
10208 if (err < 0) {
10209 goto reply;
10210 }
10211 if (err == 0 && !_have_pending_crush()) {
10212 if (!stop) {
10213 // for single osd only, wildcard makes too much noise
10214 ss << "set-device-class item id " << osd << " name '" << name
10215 << "' device_class '" << device_class << "': no change. ";
10216 }
10217 } else {
10218 updated.insert(osd);
10219 }
10220 }
10221 }
10222
10223 pending_inc.crush.clear();
10224 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10225 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10226 getline(ss, rs);
10227 wait_for_finished_proposal(
10228 op,
10229 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10230 return true;
10231 } else if (prefix == "osd crush rm-device-class") {
10232 bool stop = false;
10233 vector<string> idvec;
10234 cmd_getval(cmdmap, "ids", idvec);
10235 CrushWrapper newcrush = _get_pending_crush();
10236 set<int> updated;
10237
10238 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10239 set<int> osds;
10240
10241 // wildcard?
10242 if (j == 0 &&
10243 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10244 osdmap.get_all_osds(osds);
10245 stop = true;
10246 } else {
10247 // try traditional single osd way
10248 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10249 if (osd < 0) {
10250 // ss has reason for failure
10251 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10252 err = -EINVAL;
10253 goto reply;
10254 }
10255 osds.insert(osd);
10256 }
10257
10258 for (auto &osd : osds) {
10259 if (!osdmap.exists(osd)) {
10260 ss << "osd." << osd << " does not exist. ";
10261 continue;
10262 }
10263
10264 auto class_name = newcrush.get_item_class(osd);
10265 if (!class_name) {
10266 ss << "osd." << osd << " belongs to no class, ";
10267 continue;
10268 }
10269 // note that we do not verify if class_is_in_use here
10270 // in case the device is misclassified and user wants
10271 // to overridely reset...
10272
10273 err = newcrush.remove_device_class(cct, osd, &ss);
10274 if (err < 0) {
10275 // ss has reason for failure
10276 goto reply;
10277 }
10278 updated.insert(osd);
10279 }
10280 }
10281
10282 pending_inc.crush.clear();
10283 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10284 ss << "done removing class of osd(s): " << updated;
10285 getline(ss, rs);
10286 wait_for_finished_proposal(
10287 op,
10288 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10289 return true;
10290 } else if (prefix == "osd crush class create") {
10291 string device_class;
10292 if (!cmd_getval(cmdmap, "class", device_class)) {
10293 err = -EINVAL; // no value!
10294 goto reply;
10295 }
10296 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10297 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10298 << "luminous' before using crush device classes";
10299 err = -EPERM;
10300 goto reply;
10301 }
10302 if (!_have_pending_crush() &&
10303 _get_stable_crush().class_exists(device_class)) {
10304 ss << "class '" << device_class << "' already exists";
10305 goto reply;
10306 }
10307 CrushWrapper newcrush = _get_pending_crush();
10308 if (newcrush.class_exists(device_class)) {
10309 ss << "class '" << device_class << "' already exists";
10310 goto update;
10311 }
10312 int class_id = newcrush.get_or_create_class_id(device_class);
10313 pending_inc.crush.clear();
10314 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10315 ss << "created class " << device_class << " with id " << class_id
10316 << " to crush map";
10317 goto update;
10318 } else if (prefix == "osd crush class rm") {
10319 string device_class;
10320 if (!cmd_getval(cmdmap, "class", device_class)) {
10321 err = -EINVAL; // no value!
10322 goto reply;
10323 }
10324 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10325 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10326 << "luminous' before using crush device classes";
10327 err = -EPERM;
10328 goto reply;
10329 }
10330
10331 if (!osdmap.crush->class_exists(device_class)) {
10332 err = 0;
10333 goto reply;
10334 }
10335
10336 CrushWrapper newcrush = _get_pending_crush();
10337 if (!newcrush.class_exists(device_class)) {
10338 err = 0; // make command idempotent
10339 goto wait;
10340 }
10341 int class_id = newcrush.get_class_id(device_class);
10342 stringstream ts;
10343 if (newcrush.class_is_in_use(class_id, &ts)) {
10344 err = -EBUSY;
10345 ss << "class '" << device_class << "' " << ts.str();
10346 goto reply;
10347 }
10348
10349 // check if class is used by any erasure-code-profiles
10350 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10351 osdmap.get_erasure_code_profiles();
10352 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10353 #ifdef HAVE_STDLIB_MAP_SPLICING
10354 ec_profiles.merge(old_ec_profiles);
10355 #else
10356 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10357 make_move_iterator(end(old_ec_profiles)));
10358 #endif
10359 list<string> referenced_by;
10360 for (auto &i: ec_profiles) {
10361 for (auto &j: i.second) {
10362 if ("crush-device-class" == j.first && device_class == j.second) {
10363 referenced_by.push_back(i.first);
10364 }
10365 }
10366 }
10367 if (!referenced_by.empty()) {
10368 err = -EBUSY;
10369 ss << "class '" << device_class
10370 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10371 goto reply;
10372 }
10373
10374 set<int> osds;
10375 newcrush.get_devices_by_class(device_class, &osds);
10376 for (auto& p: osds) {
10377 err = newcrush.remove_device_class(cct, p, &ss);
10378 if (err < 0) {
10379 // ss has reason for failure
10380 goto reply;
10381 }
10382 }
10383
10384 if (osds.empty()) {
10385 // empty class, remove directly
10386 err = newcrush.remove_class_name(device_class);
10387 if (err < 0) {
10388 ss << "class '" << device_class << "' cannot be removed '"
10389 << cpp_strerror(err) << "'";
10390 goto reply;
10391 }
10392 }
10393
10394 pending_inc.crush.clear();
10395 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10396 ss << "removed class " << device_class << " with id " << class_id
10397 << " from crush map";
10398 goto update;
10399 } else if (prefix == "osd crush class rename") {
10400 string srcname, dstname;
10401 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10402 err = -EINVAL;
10403 goto reply;
10404 }
10405 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10406 err = -EINVAL;
10407 goto reply;
10408 }
10409
10410 CrushWrapper newcrush = _get_pending_crush();
10411 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10412 // suppose this is a replay and return success
10413 // so command is idempotent
10414 ss << "already renamed to '" << dstname << "'";
10415 err = 0;
10416 goto reply;
10417 }
10418
10419 err = newcrush.rename_class(srcname, dstname);
10420 if (err < 0) {
10421 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10422 << cpp_strerror(err);
10423 goto reply;
10424 }
10425
10426 pending_inc.crush.clear();
10427 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10428 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10429 goto update;
10430 } else if (prefix == "osd crush add-bucket") {
10431 // os crush add-bucket <name> <type>
10432 string name, typestr;
10433 vector<string> argvec;
10434 cmd_getval(cmdmap, "name", name);
10435 cmd_getval(cmdmap, "type", typestr);
10436 cmd_getval(cmdmap, "args", argvec);
10437 map<string,string> loc;
10438 if (!argvec.empty()) {
10439 CrushWrapper::parse_loc_map(argvec, &loc);
10440 dout(0) << "will create and move bucket '" << name
10441 << "' to location " << loc << dendl;
10442 }
10443
10444 if (!_have_pending_crush() &&
10445 _get_stable_crush().name_exists(name)) {
10446 ss << "bucket '" << name << "' already exists";
10447 goto reply;
10448 }
10449
10450 CrushWrapper newcrush = _get_pending_crush();
10451
10452 if (newcrush.name_exists(name)) {
10453 ss << "bucket '" << name << "' already exists";
10454 goto update;
10455 }
10456 int type = newcrush.get_type_id(typestr);
10457 if (type < 0) {
10458 ss << "type '" << typestr << "' does not exist";
10459 err = -EINVAL;
10460 goto reply;
10461 }
10462 if (type == 0) {
10463 ss << "type '" << typestr << "' is for devices, not buckets";
10464 err = -EINVAL;
10465 goto reply;
10466 }
10467 int bucketno;
10468 err = newcrush.add_bucket(0, 0,
10469 CRUSH_HASH_DEFAULT, type, 0, NULL,
10470 NULL, &bucketno);
10471 if (err < 0) {
10472 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10473 goto reply;
10474 }
10475 err = newcrush.set_item_name(bucketno, name);
10476 if (err < 0) {
10477 ss << "error setting bucket name to '" << name << "'";
10478 goto reply;
10479 }
10480
10481 if (!loc.empty()) {
10482 if (!newcrush.check_item_loc(cct, bucketno, loc,
10483 (int *)NULL)) {
10484 err = newcrush.move_bucket(cct, bucketno, loc);
10485 if (err < 0) {
10486 ss << "error moving bucket '" << name << "' to location " << loc;
10487 goto reply;
10488 }
10489 } else {
10490 ss << "no need to move item id " << bucketno << " name '" << name
10491 << "' to location " << loc << " in crush map";
10492 }
10493 }
10494
10495 pending_inc.crush.clear();
10496 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10497 if (loc.empty()) {
10498 ss << "added bucket " << name << " type " << typestr
10499 << " to crush map";
10500 } else {
10501 ss << "added bucket " << name << " type " << typestr
10502 << " to location " << loc;
10503 }
10504 goto update;
10505 } else if (prefix == "osd crush rename-bucket") {
10506 string srcname, dstname;
10507 cmd_getval(cmdmap, "srcname", srcname);
10508 cmd_getval(cmdmap, "dstname", dstname);
10509
10510 err = crush_rename_bucket(srcname, dstname, &ss);
10511 if (err == -EALREADY) // equivalent to success for idempotency
10512 err = 0;
10513 if (err)
10514 goto reply;
10515 else
10516 goto update;
10517 } else if (prefix == "osd crush weight-set create" ||
10518 prefix == "osd crush weight-set create-compat") {
10519 if (_have_pending_crush()) {
10520 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10521 goto wait;
10522 }
10523 CrushWrapper newcrush = _get_pending_crush();
10524 int64_t pool;
10525 int positions;
10526 if (newcrush.has_non_straw2_buckets()) {
10527 ss << "crush map contains one or more bucket(s) that are not straw2";
10528 err = -EPERM;
10529 goto reply;
10530 }
10531 if (prefix == "osd crush weight-set create") {
10532 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10533 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10534 ss << "require_min_compat_client "
10535 << osdmap.require_min_compat_client
10536 << " < luminous, which is required for per-pool weight-sets. "
10537 << "Try 'ceph osd set-require-min-compat-client luminous' "
10538 << "before using the new interface";
10539 err = -EPERM;
10540 goto reply;
10541 }
10542 string poolname, mode;
10543 cmd_getval(cmdmap, "pool", poolname);
10544 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10545 if (pool < 0) {
10546 ss << "pool '" << poolname << "' not found";
10547 err = -ENOENT;
10548 goto reply;
10549 }
10550 cmd_getval(cmdmap, "mode", mode);
10551 if (mode != "flat" && mode != "positional") {
10552 ss << "unrecognized weight-set mode '" << mode << "'";
10553 err = -EINVAL;
10554 goto reply;
10555 }
10556 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10557 } else {
10558 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10559 positions = 1;
10560 }
10561 if (!newcrush.create_choose_args(pool, positions)) {
10562 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10563 ss << "compat weight-set already created";
10564 } else {
10565 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10566 << "' already created";
10567 }
10568 goto reply;
10569 }
10570 pending_inc.crush.clear();
10571 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10572 goto update;
10573
10574 } else if (prefix == "osd crush weight-set rm" ||
10575 prefix == "osd crush weight-set rm-compat") {
10576 CrushWrapper newcrush = _get_pending_crush();
10577 int64_t pool;
10578 if (prefix == "osd crush weight-set rm") {
10579 string poolname;
10580 cmd_getval(cmdmap, "pool", poolname);
10581 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10582 if (pool < 0) {
10583 ss << "pool '" << poolname << "' not found";
10584 err = -ENOENT;
10585 goto reply;
10586 }
10587 } else {
10588 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10589 }
10590 newcrush.rm_choose_args(pool);
10591 pending_inc.crush.clear();
10592 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10593 goto update;
10594
10595 } else if (prefix == "osd crush weight-set reweight" ||
10596 prefix == "osd crush weight-set reweight-compat") {
10597 string poolname, item;
10598 vector<double> weight;
10599 cmd_getval(cmdmap, "pool", poolname);
10600 cmd_getval(cmdmap, "item", item);
10601 cmd_getval(cmdmap, "weight", weight);
10602 CrushWrapper newcrush = _get_pending_crush();
10603 int64_t pool;
10604 if (prefix == "osd crush weight-set reweight") {
10605 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10606 if (pool < 0) {
10607 ss << "pool '" << poolname << "' not found";
10608 err = -ENOENT;
10609 goto reply;
10610 }
10611 if (!newcrush.have_choose_args(pool)) {
10612 ss << "no weight-set for pool '" << poolname << "'";
10613 err = -ENOENT;
10614 goto reply;
10615 }
10616 auto arg_map = newcrush.choose_args_get(pool);
10617 int positions = newcrush.get_choose_args_positions(arg_map);
10618 if (weight.size() != (size_t)positions) {
10619 ss << "must specify exact " << positions << " weight values";
10620 err = -EINVAL;
10621 goto reply;
10622 }
10623 } else {
10624 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10625 if (!newcrush.have_choose_args(pool)) {
10626 ss << "no backward-compatible weight-set";
10627 err = -ENOENT;
10628 goto reply;
10629 }
10630 }
10631 if (!newcrush.name_exists(item)) {
10632 ss << "item '" << item << "' does not exist";
10633 err = -ENOENT;
10634 goto reply;
10635 }
10636 err = newcrush.choose_args_adjust_item_weightf(
10637 cct,
10638 newcrush.choose_args_get(pool),
10639 newcrush.get_item_id(item),
10640 weight,
10641 &ss);
10642 if (err < 0) {
10643 goto reply;
10644 }
10645 err = 0;
10646 pending_inc.crush.clear();
10647 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10648 goto update;
10649 } else if (osdid_present &&
10650 (prefix == "osd crush set" || prefix == "osd crush add")) {
10651 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10652 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10653 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10654
10655 if (!osdmap.exists(osdid)) {
10656 err = -ENOENT;
10657 ss << osd_name
10658 << " does not exist. Create it before updating the crush map";
10659 goto reply;
10660 }
10661
10662 double weight;
10663 if (!cmd_getval(cmdmap, "weight", weight)) {
10664 ss << "unable to parse weight value '"
10665 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10666 err = -EINVAL;
10667 goto reply;
10668 }
10669
10670 string args;
10671 vector<string> argvec;
10672 cmd_getval(cmdmap, "args", argvec);
10673 map<string,string> loc;
10674 CrushWrapper::parse_loc_map(argvec, &loc);
10675
10676 if (prefix == "osd crush set"
10677 && !_get_stable_crush().item_exists(osdid)) {
10678 err = -ENOENT;
10679 ss << "unable to set item id " << osdid << " name '" << osd_name
10680 << "' weight " << weight << " at location " << loc
10681 << ": does not exist";
10682 goto reply;
10683 }
10684
10685 dout(5) << "adding/updating crush item id " << osdid << " name '"
10686 << osd_name << "' weight " << weight << " at location "
10687 << loc << dendl;
10688 CrushWrapper newcrush = _get_pending_crush();
10689
10690 string action;
10691 if (prefix == "osd crush set" ||
10692 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10693 action = "set";
10694 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10695 } else {
10696 action = "add";
10697 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10698 if (err == 0)
10699 err = 1;
10700 }
10701
10702 if (err < 0)
10703 goto reply;
10704
10705 if (err == 0 && !_have_pending_crush()) {
10706 ss << action << " item id " << osdid << " name '" << osd_name
10707 << "' weight " << weight << " at location " << loc << ": no change";
10708 goto reply;
10709 }
10710
10711 pending_inc.crush.clear();
10712 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10713 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10714 << weight << " at location " << loc << " to crush map";
10715 getline(ss, rs);
10716 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10717 get_last_committed() + 1));
10718 return true;
10719
10720 } else if (prefix == "osd crush create-or-move") {
10721 do {
10722 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10723 if (!osdmap.exists(osdid)) {
10724 err = -ENOENT;
10725 ss << osd_name
10726 << " does not exist. create it before updating the crush map";
10727 goto reply;
10728 }
10729
10730 double weight;
10731 if (!cmd_getval(cmdmap, "weight", weight)) {
10732 ss << "unable to parse weight value '"
10733 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10734 err = -EINVAL;
10735 goto reply;
10736 }
10737
10738 string args;
10739 vector<string> argvec;
10740 cmd_getval(cmdmap, "args", argvec);
10741 map<string,string> loc;
10742 CrushWrapper::parse_loc_map(argvec, &loc);
10743
10744 dout(0) << "create-or-move crush item name '" << osd_name
10745 << "' initial_weight " << weight << " at location " << loc
10746 << dendl;
10747
10748 CrushWrapper newcrush = _get_pending_crush();
10749
10750 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10751 g_conf()->osd_crush_update_weight_set);
10752 if (err == 0) {
10753 ss << "create-or-move updated item name '" << osd_name
10754 << "' weight " << weight
10755 << " at location " << loc << " to crush map";
10756 break;
10757 }
10758 if (err > 0) {
10759 pending_inc.crush.clear();
10760 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10761 ss << "create-or-move updating item name '" << osd_name
10762 << "' weight " << weight
10763 << " at location " << loc << " to crush map";
10764 getline(ss, rs);
10765 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10766 get_last_committed() + 1));
10767 return true;
10768 }
10769 } while (false);
10770
10771 } else if (prefix == "osd crush move") {
10772 do {
10773 // osd crush move <name> <loc1> [<loc2> ...]
10774 string name;
10775 vector<string> argvec;
10776 cmd_getval(cmdmap, "name", name);
10777 cmd_getval(cmdmap, "args", argvec);
10778 map<string,string> loc;
10779 CrushWrapper::parse_loc_map(argvec, &loc);
10780
10781 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10782 CrushWrapper newcrush = _get_pending_crush();
10783
10784 if (!newcrush.name_exists(name)) {
10785 err = -ENOENT;
10786 ss << "item " << name << " does not exist";
10787 break;
10788 }
10789 int id = newcrush.get_item_id(name);
10790
10791 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10792 if (id >= 0) {
10793 err = newcrush.create_or_move_item(
10794 cct, id, 0, name, loc,
10795 g_conf()->osd_crush_update_weight_set);
10796 } else {
10797 err = newcrush.move_bucket(cct, id, loc);
10798 }
10799 if (err >= 0) {
10800 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10801 pending_inc.crush.clear();
10802 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10803 getline(ss, rs);
10804 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10805 get_last_committed() + 1));
10806 return true;
10807 }
10808 } else {
10809 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10810 err = 0;
10811 }
10812 } while (false);
10813 } else if (prefix == "osd crush swap-bucket") {
10814 string source, dest;
10815 cmd_getval(cmdmap, "source", source);
10816 cmd_getval(cmdmap, "dest", dest);
10817
10818 bool force = false;
10819 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10820
10821 CrushWrapper newcrush = _get_pending_crush();
10822 if (!newcrush.name_exists(source)) {
10823 ss << "source item " << source << " does not exist";
10824 err = -ENOENT;
10825 goto reply;
10826 }
10827 if (!newcrush.name_exists(dest)) {
10828 ss << "dest item " << dest << " does not exist";
10829 err = -ENOENT;
10830 goto reply;
10831 }
10832 int sid = newcrush.get_item_id(source);
10833 int did = newcrush.get_item_id(dest);
10834 int sparent;
10835 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10836 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10837 err = -EPERM;
10838 goto reply;
10839 }
10840 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10841 !force) {
10842 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10843 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10844 << "; pass --yes-i-really-mean-it to proceed anyway";
10845 err = -EPERM;
10846 goto reply;
10847 }
10848 int r = newcrush.swap_bucket(cct, sid, did);
10849 if (r < 0) {
10850 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10851 err = r;
10852 goto reply;
10853 }
10854 ss << "swapped bucket of " << source << " to " << dest;
10855 pending_inc.crush.clear();
10856 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10857 wait_for_finished_proposal(op,
10858 new Monitor::C_Command(mon, op, err, ss.str(),
10859 get_last_committed() + 1));
10860 return true;
10861 } else if (prefix == "osd crush link") {
10862 // osd crush link <name> <loc1> [<loc2> ...]
10863 string name;
10864 cmd_getval(cmdmap, "name", name);
10865 vector<string> argvec;
10866 cmd_getval(cmdmap, "args", argvec);
10867 map<string,string> loc;
10868 CrushWrapper::parse_loc_map(argvec, &loc);
10869
10870 // Need an explicit check for name_exists because get_item_id returns
10871 // 0 on unfound.
10872 int id = osdmap.crush->get_item_id(name);
10873 if (!osdmap.crush->name_exists(name)) {
10874 err = -ENOENT;
10875 ss << "item " << name << " does not exist";
10876 goto reply;
10877 } else {
10878 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10879 }
10880 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10881 ss << "no need to move item id " << id << " name '" << name
10882 << "' to location " << loc << " in crush map";
10883 err = 0;
10884 goto reply;
10885 }
10886
10887 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10888 CrushWrapper newcrush = _get_pending_crush();
10889
10890 if (!newcrush.name_exists(name)) {
10891 err = -ENOENT;
10892 ss << "item " << name << " does not exist";
10893 goto reply;
10894 } else {
10895 int id = newcrush.get_item_id(name);
10896 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10897 err = newcrush.link_bucket(cct, id, loc);
10898 if (err >= 0) {
10899 ss << "linked item id " << id << " name '" << name
10900 << "' to location " << loc << " in crush map";
10901 pending_inc.crush.clear();
10902 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10903 } else {
10904 ss << "cannot link item id " << id << " name '" << name
10905 << "' to location " << loc;
10906 goto reply;
10907 }
10908 } else {
10909 ss << "no need to move item id " << id << " name '" << name
10910 << "' to location " << loc << " in crush map";
10911 err = 0;
10912 }
10913 }
10914 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10915 get_last_committed() + 1));
10916 return true;
10917 } else if (prefix == "osd crush rm" ||
10918 prefix == "osd crush remove" ||
10919 prefix == "osd crush unlink") {
10920 do {
10921 // osd crush rm <id> [ancestor]
10922 CrushWrapper newcrush = _get_pending_crush();
10923
10924 string name;
10925 cmd_getval(cmdmap, "name", name);
10926
10927 if (!osdmap.crush->name_exists(name)) {
10928 err = 0;
10929 ss << "device '" << name << "' does not appear in the crush map";
10930 break;
10931 }
10932 if (!newcrush.name_exists(name)) {
10933 err = 0;
10934 ss << "device '" << name << "' does not appear in the crush map";
10935 getline(ss, rs);
10936 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10937 get_last_committed() + 1));
10938 return true;
10939 }
10940 int id = newcrush.get_item_id(name);
10941 int ancestor = 0;
10942
10943 bool unlink_only = prefix == "osd crush unlink";
10944 string ancestor_str;
10945 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10946 if (!newcrush.name_exists(ancestor_str)) {
10947 err = -ENOENT;
10948 ss << "ancestor item '" << ancestor_str
10949 << "' does not appear in the crush map";
10950 break;
10951 }
10952 ancestor = newcrush.get_item_id(ancestor_str);
10953 }
10954
10955 err = prepare_command_osd_crush_remove(
10956 newcrush,
10957 id, ancestor,
10958 (ancestor < 0), unlink_only);
10959
10960 if (err == -ENOENT) {
10961 ss << "item " << id << " does not appear in that position";
10962 err = 0;
10963 break;
10964 }
10965 if (err == 0) {
10966 if (!unlink_only)
10967 pending_inc.new_crush_node_flags[id] = 0;
10968 ss << "removed item id " << id << " name '" << name << "' from crush map";
10969 getline(ss, rs);
10970 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10971 get_last_committed() + 1));
10972 return true;
10973 }
10974 } while (false);
10975
10976 } else if (prefix == "osd crush reweight-all") {
10977 CrushWrapper newcrush = _get_pending_crush();
10978
10979 newcrush.reweight(cct);
10980 pending_inc.crush.clear();
10981 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10982 ss << "reweighted crush hierarchy";
10983 getline(ss, rs);
10984 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10985 get_last_committed() + 1));
10986 return true;
10987 } else if (prefix == "osd crush reweight") {
10988 // osd crush reweight <name> <weight>
10989 CrushWrapper newcrush = _get_pending_crush();
10990
10991 string name;
10992 cmd_getval(cmdmap, "name", name);
10993 if (!newcrush.name_exists(name)) {
10994 err = -ENOENT;
10995 ss << "device '" << name << "' does not appear in the crush map";
10996 goto reply;
10997 }
10998
10999 int id = newcrush.get_item_id(name);
11000 if (id < 0) {
11001 ss << "device '" << name << "' is not a leaf in the crush map";
11002 err = -EINVAL;
11003 goto reply;
11004 }
11005 double w;
11006 if (!cmd_getval(cmdmap, "weight", w)) {
11007 ss << "unable to parse weight value '"
11008 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11009 err = -EINVAL;
11010 goto reply;
11011 }
11012
11013 err = newcrush.adjust_item_weightf(cct, id, w,
11014 g_conf()->osd_crush_update_weight_set);
11015 if (err < 0)
11016 goto reply;
11017 pending_inc.crush.clear();
11018 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11019 ss << "reweighted item id " << id << " name '" << name << "' to " << w
11020 << " in crush map";
11021 getline(ss, rs);
11022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11023 get_last_committed() + 1));
11024 return true;
11025 } else if (prefix == "osd crush reweight-subtree") {
11026 // osd crush reweight <name> <weight>
11027 CrushWrapper newcrush = _get_pending_crush();
11028
11029 string name;
11030 cmd_getval(cmdmap, "name", name);
11031 if (!newcrush.name_exists(name)) {
11032 err = -ENOENT;
11033 ss << "device '" << name << "' does not appear in the crush map";
11034 goto reply;
11035 }
11036
11037 int id = newcrush.get_item_id(name);
11038 if (id >= 0) {
11039 ss << "device '" << name << "' is not a subtree in the crush map";
11040 err = -EINVAL;
11041 goto reply;
11042 }
11043 double w;
11044 if (!cmd_getval(cmdmap, "weight", w)) {
11045 ss << "unable to parse weight value '"
11046 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11047 err = -EINVAL;
11048 goto reply;
11049 }
11050
11051 err = newcrush.adjust_subtree_weightf(cct, id, w,
11052 g_conf()->osd_crush_update_weight_set);
11053 if (err < 0)
11054 goto reply;
11055 pending_inc.crush.clear();
11056 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11057 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
11058 << " in crush map";
11059 getline(ss, rs);
11060 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11061 get_last_committed() + 1));
11062 return true;
11063 } else if (prefix == "osd crush tunables") {
11064 CrushWrapper newcrush = _get_pending_crush();
11065
11066 err = 0;
11067 string profile;
11068 cmd_getval(cmdmap, "profile", profile);
11069 if (profile == "legacy" || profile == "argonaut") {
11070 newcrush.set_tunables_legacy();
11071 } else if (profile == "bobtail") {
11072 newcrush.set_tunables_bobtail();
11073 } else if (profile == "firefly") {
11074 newcrush.set_tunables_firefly();
11075 } else if (profile == "hammer") {
11076 newcrush.set_tunables_hammer();
11077 } else if (profile == "jewel") {
11078 newcrush.set_tunables_jewel();
11079 } else if (profile == "optimal") {
11080 newcrush.set_tunables_optimal();
11081 } else if (profile == "default") {
11082 newcrush.set_tunables_default();
11083 } else {
11084 ss << "unrecognized profile '" << profile << "'";
11085 err = -EINVAL;
11086 goto reply;
11087 }
11088
11089 if (!validate_crush_against_features(&newcrush, ss)) {
11090 err = -EINVAL;
11091 goto reply;
11092 }
11093
11094 pending_inc.crush.clear();
11095 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11096 ss << "adjusted tunables profile to " << profile;
11097 getline(ss, rs);
11098 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11099 get_last_committed() + 1));
11100 return true;
11101 } else if (prefix == "osd crush set-tunable") {
11102 CrushWrapper newcrush = _get_pending_crush();
11103
11104 err = 0;
11105 string tunable;
11106 cmd_getval(cmdmap, "tunable", tunable);
11107
11108 int64_t value = -1;
11109 if (!cmd_getval(cmdmap, "value", value)) {
11110 err = -EINVAL;
11111 ss << "failed to parse integer value "
11112 << cmd_vartype_stringify(cmdmap.at("value"));
11113 goto reply;
11114 }
11115
11116 if (tunable == "straw_calc_version") {
11117 if (value != 0 && value != 1) {
11118 ss << "value must be 0 or 1; got " << value;
11119 err = -EINVAL;
11120 goto reply;
11121 }
11122 newcrush.set_straw_calc_version(value);
11123 } else {
11124 ss << "unrecognized tunable '" << tunable << "'";
11125 err = -EINVAL;
11126 goto reply;
11127 }
11128
11129 if (!validate_crush_against_features(&newcrush, ss)) {
11130 err = -EINVAL;
11131 goto reply;
11132 }
11133
11134 pending_inc.crush.clear();
11135 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11136 ss << "adjusted tunable " << tunable << " to " << value;
11137 getline(ss, rs);
11138 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11139 get_last_committed() + 1));
11140 return true;
11141
11142 } else if (prefix == "osd crush rule create-simple") {
11143 string name, root, type, mode;
11144 cmd_getval(cmdmap, "name", name);
11145 cmd_getval(cmdmap, "root", root);
11146 cmd_getval(cmdmap, "type", type);
11147 cmd_getval(cmdmap, "mode", mode);
11148 if (mode == "")
11149 mode = "firstn";
11150
11151 if (osdmap.crush->rule_exists(name)) {
11152 // The name is uniquely associated to a ruleid and the rule it contains
11153 // From the user point of view, the rule is more meaningfull.
11154 ss << "rule " << name << " already exists";
11155 err = 0;
11156 goto reply;
11157 }
11158
11159 CrushWrapper newcrush = _get_pending_crush();
11160
11161 if (newcrush.rule_exists(name)) {
11162 // The name is uniquely associated to a ruleid and the rule it contains
11163 // From the user point of view, the rule is more meaningfull.
11164 ss << "rule " << name << " already exists";
11165 err = 0;
11166 } else {
11167 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11168 pg_pool_t::TYPE_REPLICATED, &ss);
11169 if (ruleno < 0) {
11170 err = ruleno;
11171 goto reply;
11172 }
11173
11174 pending_inc.crush.clear();
11175 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11176 }
11177 getline(ss, rs);
11178 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11179 get_last_committed() + 1));
11180 return true;
11181
11182 } else if (prefix == "osd crush rule create-replicated") {
11183 string name, root, type, device_class;
11184 cmd_getval(cmdmap, "name", name);
11185 cmd_getval(cmdmap, "root", root);
11186 cmd_getval(cmdmap, "type", type);
11187 cmd_getval(cmdmap, "class", device_class);
11188
11189 if (osdmap.crush->rule_exists(name)) {
11190 // The name is uniquely associated to a ruleid and the rule it contains
11191 // From the user point of view, the rule is more meaningfull.
11192 ss << "rule " << name << " already exists";
11193 err = 0;
11194 goto reply;
11195 }
11196
11197 CrushWrapper newcrush = _get_pending_crush();
11198
11199 if (newcrush.rule_exists(name)) {
11200 // The name is uniquely associated to a ruleid and the rule it contains
11201 // From the user point of view, the rule is more meaningfull.
11202 ss << "rule " << name << " already exists";
11203 err = 0;
11204 } else {
11205 int ruleno = newcrush.add_simple_rule(
11206 name, root, type, device_class,
11207 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11208 if (ruleno < 0) {
11209 err = ruleno;
11210 goto reply;
11211 }
11212
11213 pending_inc.crush.clear();
11214 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11215 }
11216 getline(ss, rs);
11217 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11218 get_last_committed() + 1));
11219 return true;
11220
11221 } else if (prefix == "osd erasure-code-profile rm") {
11222 string name;
11223 cmd_getval(cmdmap, "name", name);
11224
11225 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11226 goto wait;
11227
11228 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11229 err = -EBUSY;
11230 goto reply;
11231 }
11232
11233 if (osdmap.has_erasure_code_profile(name) ||
11234 pending_inc.new_erasure_code_profiles.count(name)) {
11235 if (osdmap.has_erasure_code_profile(name)) {
11236 pending_inc.old_erasure_code_profiles.push_back(name);
11237 } else {
11238 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11239 pending_inc.new_erasure_code_profiles.erase(name);
11240 }
11241
11242 getline(ss, rs);
11243 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11244 get_last_committed() + 1));
11245 return true;
11246 } else {
11247 ss << "erasure-code-profile " << name << " does not exist";
11248 err = 0;
11249 goto reply;
11250 }
11251
11252 } else if (prefix == "osd erasure-code-profile set") {
11253 string name;
11254 cmd_getval(cmdmap, "name", name);
11255 vector<string> profile;
11256 cmd_getval(cmdmap, "profile", profile);
11257
11258 bool force = false;
11259 cmd_getval(cmdmap, "force", force);
11260
11261 map<string,string> profile_map;
11262 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11263 if (err)
11264 goto reply;
11265 if (auto found = profile_map.find("crush-failure-domain");
11266 found != profile_map.end()) {
11267 const auto& failure_domain = found->second;
11268 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11269 if (failure_domain_type < 0) {
11270 ss << "erasure-code-profile " << profile_map
11271 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11272 err = -EINVAL;
11273 goto reply;
11274 }
11275 }
11276
11277 if (profile_map.find("plugin") == profile_map.end()) {
11278 ss << "erasure-code-profile " << profile_map
11279 << " must contain a plugin entry" << std::endl;
11280 err = -EINVAL;
11281 goto reply;
11282 }
11283 string plugin = profile_map["plugin"];
11284
11285 if (pending_inc.has_erasure_code_profile(name)) {
11286 dout(20) << "erasure code profile " << name << " try again" << dendl;
11287 goto wait;
11288 } else {
11289 err = normalize_profile(name, profile_map, force, &ss);
11290 if (err)
11291 goto reply;
11292
11293 if (osdmap.has_erasure_code_profile(name)) {
11294 ErasureCodeProfile existing_profile_map =
11295 osdmap.get_erasure_code_profile(name);
11296 err = normalize_profile(name, existing_profile_map, force, &ss);
11297 if (err)
11298 goto reply;
11299
11300 if (existing_profile_map == profile_map) {
11301 err = 0;
11302 goto reply;
11303 }
11304 if (!force) {
11305 err = -EPERM;
11306 ss << "will not override erasure code profile " << name
11307 << " because the existing profile "
11308 << existing_profile_map
11309 << " is different from the proposed profile "
11310 << profile_map;
11311 goto reply;
11312 }
11313 }
11314
11315 dout(20) << "erasure code profile set " << name << "="
11316 << profile_map << dendl;
11317 pending_inc.set_erasure_code_profile(name, profile_map);
11318 }
11319
11320 getline(ss, rs);
11321 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11322 get_last_committed() + 1));
11323 return true;
11324
11325 } else if (prefix == "osd crush rule create-erasure") {
11326 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11327 if (err == -EAGAIN)
11328 goto wait;
11329 if (err)
11330 goto reply;
11331 string name, poolstr;
11332 cmd_getval(cmdmap, "name", name);
11333 string profile;
11334 cmd_getval(cmdmap, "profile", profile);
11335 if (profile == "")
11336 profile = "default";
11337 if (profile == "default") {
11338 if (!osdmap.has_erasure_code_profile(profile)) {
11339 if (pending_inc.has_erasure_code_profile(profile)) {
11340 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11341 goto wait;
11342 }
11343
11344 map<string,string> profile_map;
11345 err = osdmap.get_erasure_code_profile_default(cct,
11346 profile_map,
11347 &ss);
11348 if (err)
11349 goto reply;
11350 err = normalize_profile(name, profile_map, true, &ss);
11351 if (err)
11352 goto reply;
11353 dout(20) << "erasure code profile set " << profile << "="
11354 << profile_map << dendl;
11355 pending_inc.set_erasure_code_profile(profile, profile_map);
11356 goto wait;
11357 }
11358 }
11359
11360 int rule;
11361 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11362 if (err < 0) {
11363 switch(err) {
11364 case -EEXIST: // return immediately
11365 ss << "rule " << name << " already exists";
11366 err = 0;
11367 goto reply;
11368 break;
11369 case -EALREADY: // wait for pending to be proposed
11370 ss << "rule " << name << " already exists";
11371 err = 0;
11372 break;
11373 default: // non recoverable error
11374 goto reply;
11375 break;
11376 }
11377 } else {
11378 ss << "created rule " << name << " at " << rule;
11379 }
11380
11381 getline(ss, rs);
11382 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11383 get_last_committed() + 1));
11384 return true;
11385
11386 } else if (prefix == "osd crush rule rm") {
11387 string name;
11388 cmd_getval(cmdmap, "name", name);
11389
11390 if (!osdmap.crush->rule_exists(name)) {
11391 ss << "rule " << name << " does not exist";
11392 err = 0;
11393 goto reply;
11394 }
11395
11396 CrushWrapper newcrush = _get_pending_crush();
11397
11398 if (!newcrush.rule_exists(name)) {
11399 ss << "rule " << name << " does not exist";
11400 err = 0;
11401 } else {
11402 int ruleno = newcrush.get_rule_id(name);
11403 ceph_assert(ruleno >= 0);
11404
11405 // make sure it is not in use.
11406 // FIXME: this is ok in some situations, but let's not bother with that
11407 // complexity now.
11408 if (osdmap.crush_rule_in_use(ruleno)) {
11409 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11410 err = -EBUSY;
11411 goto reply;
11412 }
11413
11414 err = newcrush.remove_rule(ruleno);
11415 if (err < 0) {
11416 goto reply;
11417 }
11418
11419 pending_inc.crush.clear();
11420 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11421 }
11422 getline(ss, rs);
11423 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11424 get_last_committed() + 1));
11425 return true;
11426
11427 } else if (prefix == "osd crush rule rename") {
11428 string srcname;
11429 string dstname;
11430 cmd_getval(cmdmap, "srcname", srcname);
11431 cmd_getval(cmdmap, "dstname", dstname);
11432 if (srcname.empty() || dstname.empty()) {
11433 ss << "must specify both source rule name and destination rule name";
11434 err = -EINVAL;
11435 goto reply;
11436 }
11437 if (srcname == dstname) {
11438 ss << "destination rule name is equal to source rule name";
11439 err = 0;
11440 goto reply;
11441 }
11442
11443 CrushWrapper newcrush = _get_pending_crush();
11444 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11445 // srcname does not exist and dstname already exists
11446 // suppose this is a replay and return success
11447 // (so this command is idempotent)
11448 ss << "already renamed to '" << dstname << "'";
11449 err = 0;
11450 goto reply;
11451 }
11452
11453 err = newcrush.rename_rule(srcname, dstname, &ss);
11454 if (err < 0) {
11455 // ss has reason for failure
11456 goto reply;
11457 }
11458 pending_inc.crush.clear();
11459 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11460 getline(ss, rs);
11461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11462 get_last_committed() + 1));
11463 return true;
11464
11465 } else if (prefix == "osd setmaxosd") {
11466 int64_t newmax;
11467 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11468 ss << "unable to parse 'newmax' value '"
11469 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11470 err = -EINVAL;
11471 goto reply;
11472 }
11473
11474 if (newmax > g_conf()->mon_max_osd) {
11475 err = -ERANGE;
11476 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11477 << g_conf()->mon_max_osd << ")";
11478 goto reply;
11479 }
11480
11481 // Don't allow shrinking OSD number as this will cause data loss
11482 // and may cause kernel crashes.
11483 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11484 if (newmax < osdmap.get_max_osd()) {
11485 // Check if the OSDs exist between current max and new value.
11486 // If there are any OSDs exist, then don't allow shrinking number
11487 // of OSDs.
11488 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11489 if (osdmap.exists(i)) {
11490 err = -EBUSY;
11491 ss << "cannot shrink max_osd to " << newmax
11492 << " because osd." << i << " (and possibly others) still in use";
11493 goto reply;
11494 }
11495 }
11496 }
11497
11498 pending_inc.new_max_osd = newmax;
11499 ss << "set new max_osd = " << pending_inc.new_max_osd;
11500 getline(ss, rs);
11501 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11502 get_last_committed() + 1));
11503 return true;
11504
11505 } else if (prefix == "osd set-full-ratio" ||
11506 prefix == "osd set-backfillfull-ratio" ||
11507 prefix == "osd set-nearfull-ratio") {
11508 double n;
11509 if (!cmd_getval(cmdmap, "ratio", n)) {
11510 ss << "unable to parse 'ratio' value '"
11511 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11512 err = -EINVAL;
11513 goto reply;
11514 }
11515 if (prefix == "osd set-full-ratio")
11516 pending_inc.new_full_ratio = n;
11517 else if (prefix == "osd set-backfillfull-ratio")
11518 pending_inc.new_backfillfull_ratio = n;
11519 else if (prefix == "osd set-nearfull-ratio")
11520 pending_inc.new_nearfull_ratio = n;
11521 ss << prefix << " " << n;
11522 getline(ss, rs);
11523 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11524 get_last_committed() + 1));
11525 return true;
11526 } else if (prefix == "osd set-require-min-compat-client") {
11527 string v;
11528 cmd_getval(cmdmap, "version", v);
11529 ceph_release_t vno = ceph_release_from_name(v);
11530 if (!vno) {
11531 ss << "version " << v << " is not recognized";
11532 err = -EINVAL;
11533 goto reply;
11534 }
11535 OSDMap newmap;
11536 newmap.deepish_copy_from(osdmap);
11537 newmap.apply_incremental(pending_inc);
11538 newmap.require_min_compat_client = vno;
11539 auto mvno = newmap.get_min_compat_client();
11540 if (vno < mvno) {
11541 ss << "osdmap current utilizes features that require " << mvno
11542 << "; cannot set require_min_compat_client below that to " << vno;
11543 err = -EPERM;
11544 goto reply;
11545 }
11546 bool sure = false;
11547 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11548 if (!sure) {
11549 FeatureMap m;
11550 mon.get_combined_feature_map(&m);
11551 uint64_t features = ceph_release_features(to_integer<int>(vno));
11552 bool first = true;
11553 bool ok = true;
11554 for (int type : {
11555 CEPH_ENTITY_TYPE_CLIENT,
11556 CEPH_ENTITY_TYPE_MDS,
11557 CEPH_ENTITY_TYPE_MGR }) {
11558 auto p = m.m.find(type);
11559 if (p == m.m.end()) {
11560 continue;
11561 }
11562 for (auto& q : p->second) {
11563 uint64_t missing = ~q.first & features;
11564 if (missing) {
11565 if (first) {
11566 ss << "cannot set require_min_compat_client to " << v << ": ";
11567 } else {
11568 ss << "; ";
11569 }
11570 first = false;
11571 ss << q.second << " connected " << ceph_entity_type_name(type)
11572 << "(s) look like " << ceph_release_name(
11573 ceph_release_from_features(q.first))
11574 << " (missing 0x" << std::hex << missing << std::dec << ")";
11575 ok = false;
11576 }
11577 }
11578 }
11579 if (!ok) {
11580 ss << "; add --yes-i-really-mean-it to do it anyway";
11581 err = -EPERM;
11582 goto reply;
11583 }
11584 }
11585 ss << "set require_min_compat_client to " << vno;
11586 pending_inc.new_require_min_compat_client = vno;
11587 getline(ss, rs);
11588 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11589 get_last_committed() + 1));
11590 return true;
11591 } else if (prefix == "osd pause") {
11592 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11593
11594 } else if (prefix == "osd unpause") {
11595 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11596
11597 } else if (prefix == "osd set") {
11598 bool sure = false;
11599 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11600
11601 string key;
11602 cmd_getval(cmdmap, "key", key);
11603 if (key == "pause")
11604 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11605 else if (key == "noup")
11606 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11607 else if (key == "nodown")
11608 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11609 else if (key == "noout")
11610 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11611 else if (key == "noin")
11612 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11613 else if (key == "nobackfill")
11614 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11615 else if (key == "norebalance")
11616 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11617 else if (key == "norecover")
11618 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11619 else if (key == "noscrub")
11620 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11621 else if (key == "nodeep-scrub")
11622 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11623 else if (key == "notieragent")
11624 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11625 else if (key == "nosnaptrim")
11626 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11627 else if (key == "pglog_hardlimit") {
11628 if (!osdmap.get_num_up_osds() && !sure) {
11629 ss << "Not advisable to continue since no OSDs are up. Pass "
11630 << "--yes-i-really-mean-it if you really wish to continue.";
11631 err = -EPERM;
11632 goto reply;
11633 }
11634 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11635 // we are reusing a jewel feature bit that was retired in luminous.
11636 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11637 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11638 || sure)) {
11639 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11640 } else {
11641 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11642 err = -EPERM;
11643 goto reply;
11644 }
11645 } else {
11646 ss << "unrecognized flag '" << key << "'";
11647 err = -EINVAL;
11648 }
11649
11650 } else if (prefix == "osd unset") {
11651 string key;
11652 cmd_getval(cmdmap, "key", key);
11653 if (key == "pause")
11654 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11655 else if (key == "noup")
11656 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11657 else if (key == "nodown")
11658 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11659 else if (key == "noout")
11660 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11661 else if (key == "noin")
11662 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11663 else if (key == "nobackfill")
11664 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11665 else if (key == "norebalance")
11666 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11667 else if (key == "norecover")
11668 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11669 else if (key == "noscrub")
11670 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11671 else if (key == "nodeep-scrub")
11672 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11673 else if (key == "notieragent")
11674 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11675 else if (key == "nosnaptrim")
11676 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11677 else {
11678 ss << "unrecognized flag '" << key << "'";
11679 err = -EINVAL;
11680 }
11681
11682 } else if (prefix == "osd require-osd-release") {
11683 string release;
11684 cmd_getval(cmdmap, "release", release);
11685 bool sure = false;
11686 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11687 ceph_release_t rel = ceph_release_from_name(release.c_str());
11688 if (!rel) {
11689 ss << "unrecognized release " << release;
11690 err = -EINVAL;
11691 goto reply;
11692 }
11693 if (rel == osdmap.require_osd_release) {
11694 // idempotent
11695 err = 0;
11696 goto reply;
11697 }
11698 if (osdmap.require_osd_release < ceph_release_t::pacific && !sure) {
11699 ss << "Not advisable to continue since current 'require_osd_release' "
11700 << "refers to a very old Ceph release. Pass "
11701 << "--yes-i-really-mean-it if you really wish to continue.";
11702 err = -EPERM;
11703 goto reply;
11704 }
11705 if (!osdmap.get_num_up_osds() && !sure) {
11706 ss << "Not advisable to continue since no OSDs are up. Pass "
11707 << "--yes-i-really-mean-it if you really wish to continue.";
11708 err = -EPERM;
11709 goto reply;
11710 }
11711 if (rel == ceph_release_t::pacific) {
11712 if (!mon.monmap->get_required_features().contains_all(
11713 ceph::features::mon::FEATURE_PACIFIC)) {
11714 ss << "not all mons are pacific";
11715 err = -EPERM;
11716 goto reply;
11717 }
11718 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11719 && !sure) {
11720 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11721 err = -EPERM;
11722 goto reply;
11723 }
11724 } else if (rel == ceph_release_t::quincy) {
11725 if (!mon.monmap->get_required_features().contains_all(
11726 ceph::features::mon::FEATURE_QUINCY)) {
11727 ss << "not all mons are quincy";
11728 err = -EPERM;
11729 goto reply;
11730 }
11731 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
11732 && !sure) {
11733 ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11734 err = -EPERM;
11735 goto reply;
11736 }
11737 } else if (rel == ceph_release_t::reef) {
11738 if (!mon.monmap->get_required_features().contains_all(
11739 ceph::features::mon::FEATURE_REEF)) {
11740 ss << "not all mons are reef";
11741 err = -EPERM;
11742 goto reply;
11743 }
11744 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_REEF))
11745 && !sure) {
11746 ss << "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
11747 err = -EPERM;
11748 goto reply;
11749 }
11750 } else {
11751 ss << "not supported for this release";
11752 err = -EPERM;
11753 goto reply;
11754 }
11755 if (rel < osdmap.require_osd_release) {
11756 ss << "require_osd_release cannot be lowered once it has been set";
11757 err = -EPERM;
11758 goto reply;
11759 }
11760 pending_inc.new_require_osd_release = rel;
11761 goto update;
11762 } else if (prefix == "osd down" ||
11763 prefix == "osd out" ||
11764 prefix == "osd in" ||
11765 prefix == "osd rm" ||
11766 prefix == "osd stop") {
11767
11768 bool any = false;
11769 bool stop = false;
11770 bool verbose = true;
11771 bool definitely_dead = false;
11772
11773 vector<string> idvec;
11774 cmd_getval(cmdmap, "ids", idvec);
11775 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11776 derr << "definitely_dead " << (int)definitely_dead << dendl;
11777 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11778 set<int> osds;
11779
11780 // wildcard?
11781 if (j == 0 &&
11782 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11783 if (prefix == "osd in") {
11784 // touch out osds only
11785 osdmap.get_out_existing_osds(osds);
11786 } else {
11787 osdmap.get_all_osds(osds);
11788 }
11789 stop = true;
11790 verbose = false; // so the output is less noisy.
11791 } else {
11792 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11793 if (osd < 0) {
11794 ss << "invalid osd id" << osd;
11795 err = -EINVAL;
11796 continue;
11797 } else if (!osdmap.exists(osd)) {
11798 ss << "osd." << osd << " does not exist. ";
11799 continue;
11800 }
11801
11802 osds.insert(osd);
11803 }
11804
11805 for (auto &osd : osds) {
11806 if (prefix == "osd down") {
11807 if (osdmap.is_down(osd)) {
11808 if (verbose)
11809 ss << "osd." << osd << " is already down. ";
11810 } else {
11811 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11812 ss << "marked down osd." << osd << ". ";
11813 any = true;
11814 }
11815 if (definitely_dead) {
11816 if (!pending_inc.new_xinfo.count(osd)) {
11817 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11818 }
11819 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11820 any = true;
11821 }
11822 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11823 }
11824 } else if (prefix == "osd out") {
11825 if (osdmap.is_out(osd)) {
11826 if (verbose)
11827 ss << "osd." << osd << " is already out. ";
11828 } else {
11829 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11830 if (osdmap.osd_weight[osd]) {
11831 if (pending_inc.new_xinfo.count(osd) == 0) {
11832 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11833 }
11834 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11835 }
11836 ss << "marked out osd." << osd << ". ";
11837 std::ostringstream msg;
11838 msg << "Client " << op->get_session()->entity_name
11839 << " marked osd." << osd << " out";
11840 if (osdmap.is_up(osd)) {
11841 msg << ", while it was still marked up";
11842 } else {
11843 auto period = ceph_clock_now() - down_pending_out[osd];
11844 msg << ", after it was down for " << int(period.sec())
11845 << " seconds";
11846 }
11847
11848 mon.clog->info() << msg.str();
11849 any = true;
11850 }
11851 } else if (prefix == "osd in") {
11852 if (osdmap.is_in(osd)) {
11853 if (verbose)
11854 ss << "osd." << osd << " is already in. ";
11855 } else {
11856 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11857 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11858 if (pending_inc.new_xinfo.count(osd) == 0) {
11859 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11860 }
11861 pending_inc.new_xinfo[osd].old_weight = 0;
11862 } else {
11863 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11864 }
11865 ss << "marked in osd." << osd << ". ";
11866 any = true;
11867 }
11868 } else if (prefix == "osd rm") {
11869 err = prepare_command_osd_remove(osd);
11870
11871 if (err == -EBUSY) {
11872 if (any)
11873 ss << ", ";
11874 ss << "osd." << osd << " is still up; must be down before removal. ";
11875 } else {
11876 ceph_assert(err == 0);
11877 if (any) {
11878 ss << ", osd." << osd;
11879 } else {
11880 ss << "removed osd." << osd;
11881 }
11882 any = true;
11883 }
11884 } else if (prefix == "osd stop") {
11885 if (osdmap.is_stop(osd)) {
11886 if (verbose)
11887 ss << "osd." << osd << " is already stopped. ";
11888 } else if (osdmap.is_down(osd)) {
11889 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11890 ss << "stop down osd." << osd << ". ";
11891 any = true;
11892 } else {
11893 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11894 ss << "stop osd." << osd << ". ";
11895 any = true;
11896 }
11897 }
11898 }
11899 }
11900 if (any) {
11901 getline(ss, rs);
11902 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11903 get_last_committed() + 1));
11904 return true;
11905 }
11906 } else if (prefix == "osd set-group" ||
11907 prefix == "osd unset-group" ||
11908 prefix == "osd add-noup" ||
11909 prefix == "osd add-nodown" ||
11910 prefix == "osd add-noin" ||
11911 prefix == "osd add-noout" ||
11912 prefix == "osd rm-noup" ||
11913 prefix == "osd rm-nodown" ||
11914 prefix == "osd rm-noin" ||
11915 prefix == "osd rm-noout") {
11916 bool do_set = prefix == "osd set-group" ||
11917 prefix.find("add") != string::npos;
11918 string flag_str;
11919 unsigned flags = 0;
11920 vector<string> who;
11921 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11922 cmd_getval(cmdmap, "flags", flag_str);
11923 cmd_getval(cmdmap, "who", who);
11924 vector<string> raw_flags;
11925 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11926 for (auto& f : raw_flags) {
11927 if (f == "noup")
11928 flags |= CEPH_OSD_NOUP;
11929 else if (f == "nodown")
11930 flags |= CEPH_OSD_NODOWN;
11931 else if (f == "noin")
11932 flags |= CEPH_OSD_NOIN;
11933 else if (f == "noout")
11934 flags |= CEPH_OSD_NOOUT;
11935 else {
11936 ss << "unrecognized flag '" << f << "', must be one of "
11937 << "{noup,nodown,noin,noout}";
11938 err = -EINVAL;
11939 goto reply;
11940 }
11941 }
11942 } else {
11943 cmd_getval(cmdmap, "ids", who);
11944 if (prefix.find("noup") != string::npos)
11945 flags = CEPH_OSD_NOUP;
11946 else if (prefix.find("nodown") != string::npos)
11947 flags = CEPH_OSD_NODOWN;
11948 else if (prefix.find("noin") != string::npos)
11949 flags = CEPH_OSD_NOIN;
11950 else if (prefix.find("noout") != string::npos)
11951 flags = CEPH_OSD_NOOUT;
11952 else
11953 ceph_assert(0 == "Unreachable!");
11954 }
11955 if (flags == 0) {
11956 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11957 err = -EINVAL;
11958 goto reply;
11959 }
11960 if (who.empty()) {
11961 ss << "must specify at least one or more targets to set/unset";
11962 err = -EINVAL;
11963 goto reply;
11964 }
11965 set<int> osds;
11966 set<int> crush_nodes;
11967 set<int> device_classes;
11968 for (auto& w : who) {
11969 if (w == "any" || w == "all" || w == "*") {
11970 osdmap.get_all_osds(osds);
11971 break;
11972 }
11973 std::stringstream ts;
11974 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11975 osds.insert(osd);
11976 } else if (osdmap.crush->name_exists(w)) {
11977 crush_nodes.insert(osdmap.crush->get_item_id(w));
11978 } else if (osdmap.crush->class_exists(w)) {
11979 device_classes.insert(osdmap.crush->get_class_id(w));
11980 } else {
11981 ss << "unable to parse osd id or crush node or device class: "
11982 << "\"" << w << "\". ";
11983 }
11984 }
11985 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11986 // ss has reason for failure
11987 err = -EINVAL;
11988 goto reply;
11989 }
11990 bool any = false;
11991 for (auto osd : osds) {
11992 if (!osdmap.exists(osd)) {
11993 ss << "osd." << osd << " does not exist. ";
11994 continue;
11995 }
11996 if (do_set) {
11997 if (flags & CEPH_OSD_NOUP) {
11998 any |= osdmap.is_noup_by_osd(osd) ?
11999 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
12000 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
12001 }
12002 if (flags & CEPH_OSD_NODOWN) {
12003 any |= osdmap.is_nodown_by_osd(osd) ?
12004 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
12005 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
12006 }
12007 if (flags & CEPH_OSD_NOIN) {
12008 any |= osdmap.is_noin_by_osd(osd) ?
12009 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
12010 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
12011 }
12012 if (flags & CEPH_OSD_NOOUT) {
12013 any |= osdmap.is_noout_by_osd(osd) ?
12014 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
12015 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
12016 }
12017 } else {
12018 if (flags & CEPH_OSD_NOUP) {
12019 any |= osdmap.is_noup_by_osd(osd) ?
12020 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
12021 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
12022 }
12023 if (flags & CEPH_OSD_NODOWN) {
12024 any |= osdmap.is_nodown_by_osd(osd) ?
12025 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
12026 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
12027 }
12028 if (flags & CEPH_OSD_NOIN) {
12029 any |= osdmap.is_noin_by_osd(osd) ?
12030 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
12031 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
12032 }
12033 if (flags & CEPH_OSD_NOOUT) {
12034 any |= osdmap.is_noout_by_osd(osd) ?
12035 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
12036 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
12037 }
12038 }
12039 }
12040 for (auto& id : crush_nodes) {
12041 auto old_flags = osdmap.get_crush_node_flags(id);
12042 auto& pending_flags = pending_inc.new_crush_node_flags[id];
12043 pending_flags |= old_flags; // adopt existing flags first!
12044 if (do_set) {
12045 pending_flags |= flags;
12046 } else {
12047 pending_flags &= ~flags;
12048 }
12049 any = true;
12050 }
12051 for (auto& id : device_classes) {
12052 auto old_flags = osdmap.get_device_class_flags(id);
12053 auto& pending_flags = pending_inc.new_device_class_flags[id];
12054 pending_flags |= old_flags;
12055 if (do_set) {
12056 pending_flags |= flags;
12057 } else {
12058 pending_flags &= ~flags;
12059 }
12060 any = true;
12061 }
12062 if (any) {
12063 getline(ss, rs);
12064 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
12065 get_last_committed() + 1));
12066 return true;
12067 }
12068 } else if (prefix == "osd pg-temp") {
12069 pg_t pgid;
12070 err = parse_pgid(cmdmap, ss, pgid);
12071 if (err < 0)
12072 goto reply;
12073 if (pending_inc.new_pg_temp.count(pgid)) {
12074 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
12075 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12076 return true;
12077 }
12078
12079 vector<int64_t> id_vec;
12080 vector<int32_t> new_pg_temp;
12081 cmd_getval(cmdmap, "id", id_vec);
12082 if (id_vec.empty()) {
12083 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
12084 ss << "done cleaning up pg_temp of " << pgid;
12085 goto update;
12086 }
12087 for (auto osd : id_vec) {
12088 if (!osdmap.exists(osd)) {
12089 ss << "osd." << osd << " does not exist";
12090 err = -ENOENT;
12091 goto reply;
12092 }
12093 new_pg_temp.push_back(osd);
12094 }
12095
12096 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12097 if ((int)new_pg_temp.size() < pool_min_size) {
12098 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
12099 << pool_min_size << ")";
12100 err = -EINVAL;
12101 goto reply;
12102 }
12103
12104 int pool_size = osdmap.get_pg_pool_size(pgid);
12105 if ((int)new_pg_temp.size() > pool_size) {
12106 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
12107 << pool_size << ")";
12108 err = -EINVAL;
12109 goto reply;
12110 }
12111
12112 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
12113 new_pg_temp.begin(), new_pg_temp.end());
12114 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
12115 goto update;
12116 } else if (prefix == "osd primary-temp" ||
12117 prefix == "osd rm-primary-temp") {
12118 pg_t pgid;
12119 err = parse_pgid(cmdmap, ss, pgid);
12120 if (err < 0)
12121 goto reply;
12122
12123 int64_t osd;
12124 if (prefix == "osd primary-temp") {
12125 if (!cmd_getval(cmdmap, "id", osd)) {
12126 ss << "unable to parse 'id' value '"
12127 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12128 err = -EINVAL;
12129 goto reply;
12130 }
12131 if (!osdmap.exists(osd)) {
12132 ss << "osd." << osd << " does not exist";
12133 err = -ENOENT;
12134 goto reply;
12135 }
12136 }
12137 else if (prefix == "osd rm-primary-temp") {
12138 osd = -1;
12139 }
12140 else {
12141 ceph_assert(0 == "Unreachable!");
12142 }
12143
12144 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12145 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12146 ss << "require_min_compat_client "
12147 << osdmap.require_min_compat_client
12148 << " < firefly, which is required for primary-temp";
12149 err = -EPERM;
12150 goto reply;
12151 }
12152
12153 pending_inc.new_primary_temp[pgid] = osd;
12154 ss << "set " << pgid << " primary_temp mapping to " << osd;
12155 goto update;
12156 } else if (prefix == "pg repeer") {
12157 pg_t pgid;
12158 err = parse_pgid(cmdmap, ss, pgid);
12159 if (err < 0)
12160 goto reply;
12161 vector<int> acting;
12162 int primary;
12163 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12164 if (primary < 0) {
12165 err = -EAGAIN;
12166 ss << "pg currently has no primary";
12167 goto reply;
12168 }
12169 if (acting.size() > 1) {
12170 // map to just primary; it will map back to what it wants
12171 pending_inc.new_pg_temp[pgid] = { primary };
12172 } else {
12173 // hmm, pick another arbitrary osd to induce a change. Note
12174 // that this won't work if there is only one suitable OSD in the cluster.
12175 int i;
12176 bool done = false;
12177 for (i = 0; i < osdmap.get_max_osd(); ++i) {
12178 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
12179 continue;
12180 }
12181 pending_inc.new_pg_temp[pgid] = { primary, i };
12182 done = true;
12183 break;
12184 }
12185 if (!done) {
12186 err = -EAGAIN;
12187 ss << "not enough up OSDs in the cluster to force repeer";
12188 goto reply;
12189 }
12190 }
12191 goto update;
12192 } else if (prefix == "osd pg-upmap" ||
12193 prefix == "osd rm-pg-upmap" ||
12194 prefix == "osd pg-upmap-items" ||
12195 prefix == "osd rm-pg-upmap-items" ||
12196 prefix == "osd pg-upmap-primary" ||
12197 prefix == "osd rm-pg-upmap-primary") {
12198 enum {
12199 OP_PG_UPMAP,
12200 OP_RM_PG_UPMAP,
12201 OP_PG_UPMAP_ITEMS,
12202 OP_RM_PG_UPMAP_ITEMS,
12203 OP_PG_UPMAP_PRIMARY,
12204 OP_RM_PG_UPMAP_PRIMARY,
12205 } upmap_option;
12206
12207 if (prefix == "osd pg-upmap") {
12208 upmap_option = OP_PG_UPMAP;
12209 } else if (prefix == "osd rm-pg-upmap") {
12210 upmap_option = OP_RM_PG_UPMAP;
12211 } else if (prefix == "osd pg-upmap-items") {
12212 upmap_option = OP_PG_UPMAP_ITEMS;
12213 } else if (prefix == "osd rm-pg-upmap-items") {
12214 upmap_option = OP_RM_PG_UPMAP_ITEMS;
12215 } else if (prefix == "osd pg-upmap-primary") {
12216 upmap_option = OP_PG_UPMAP_PRIMARY;
12217 } else if (prefix == "osd rm-pg-upmap-primary") {
12218 upmap_option = OP_RM_PG_UPMAP_PRIMARY;
12219 } else {
12220 ceph_abort_msg("invalid upmap option");
12221 }
12222
12223 ceph_release_t min_release = ceph_release_t::unknown;
12224 string feature_name = "unknown";
12225 switch (upmap_option) {
12226 case OP_PG_UPMAP: // fall through
12227 case OP_RM_PG_UPMAP: // fall through
12228 case OP_PG_UPMAP_ITEMS: // fall through
12229 case OP_RM_PG_UPMAP_ITEMS:
12230 min_release = ceph_release_t::luminous;
12231 feature_name = "pg-upmap";
12232 break;
12233
12234 case OP_PG_UPMAP_PRIMARY: // fall through
12235 case OP_RM_PG_UPMAP_PRIMARY:
12236 min_release = ceph_release_t::reef;
12237 feature_name = "pg-upmap-primary";
12238 break;
12239
12240 default:
12241 ceph_abort_msg("invalid upmap option");
12242 }
12243 uint64_t min_feature = CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
12244 string min_release_name = ceph_release_name(static_cast<int>(min_release));
12245
12246 if (osdmap.require_min_compat_client < min_release) {
12247 ss << "min_compat_client "
12248 << osdmap.require_min_compat_client
12249 << " < " << min_release_name << ", which is required for " << feature_name << ". "
12250 << "Try 'ceph osd set-require-min-compat-client " << min_release_name << "' "
12251 << "before using the new interface";
12252 err = -EPERM;
12253 goto reply;
12254 }
12255
12256 //TODO: Should I add feature and test for upmap-primary?
12257 err = check_cluster_features(min_feature, ss);
12258 if (err == -EAGAIN)
12259 goto wait;
12260 if (err < 0)
12261 goto reply;
12262 pg_t pgid;
12263 err = parse_pgid(cmdmap, ss, pgid);
12264 if (err < 0)
12265 goto reply;
12266 if (pending_inc.old_pools.count(pgid.pool())) {
12267 ss << "pool of " << pgid << " is pending removal";
12268 err = -ENOENT;
12269 getline(ss, rs);
12270 wait_for_finished_proposal(op,
12271 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
12272 return true;
12273 }
12274
12275 // check pending upmap changes
12276 switch (upmap_option) {
12277 case OP_PG_UPMAP: // fall through
12278 case OP_RM_PG_UPMAP:
12279 if (pending_inc.new_pg_upmap.count(pgid) ||
12280 pending_inc.old_pg_upmap.count(pgid)) {
12281 dout(10) << __func__ << " waiting for pending update on "
12282 << pgid << dendl;
12283 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12284 return true;
12285 }
12286 break;
12287
12288 case OP_PG_UPMAP_PRIMARY: // fall through
12289 case OP_RM_PG_UPMAP_PRIMARY:
12290 {
12291 const pg_pool_t *pt = osdmap.get_pg_pool(pgid.pool());
12292 if (! pt->is_replicated()) {
12293 ss << "pg-upmap-primary is only supported for replicated pools";
12294 err = -EINVAL;
12295 goto reply;
12296 }
12297 }
12298 // fall through
12299 case OP_PG_UPMAP_ITEMS: // fall through
12300 case OP_RM_PG_UPMAP_ITEMS: // fall through
12301 if (pending_inc.new_pg_upmap_items.count(pgid) ||
12302 pending_inc.old_pg_upmap_items.count(pgid)) {
12303 dout(10) << __func__ << " waiting for pending update on "
12304 << pgid << dendl;
12305 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12306 return true;
12307 }
12308 break;
12309
12310 default:
12311 ceph_abort_msg("invalid upmap option");
12312 }
12313
12314 switch (upmap_option) {
12315 case OP_PG_UPMAP:
12316 {
12317 vector<int64_t> id_vec;
12318 if (!cmd_getval(cmdmap, "id", id_vec)) {
12319 ss << "unable to parse 'id' value(s) '"
12320 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12321 err = -EINVAL;
12322 goto reply;
12323 }
12324
12325 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12326 if ((int)id_vec.size() < pool_min_size) {
12327 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
12328 << pool_min_size << ")";
12329 err = -EINVAL;
12330 goto reply;
12331 }
12332
12333 int pool_size = osdmap.get_pg_pool_size(pgid);
12334 if ((int)id_vec.size() > pool_size) {
12335 ss << "num of osds (" << id_vec.size() <<") > pool size ("
12336 << pool_size << ")";
12337 err = -EINVAL;
12338 goto reply;
12339 }
12340
12341 vector<int32_t> new_pg_upmap;
12342 for (auto osd : id_vec) {
12343 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
12344 ss << "osd." << osd << " does not exist";
12345 err = -ENOENT;
12346 goto reply;
12347 }
12348 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
12349 if (it != new_pg_upmap.end()) {
12350 ss << "osd." << osd << " already exists, ";
12351 continue;
12352 }
12353 new_pg_upmap.push_back(osd);
12354 }
12355
12356 if (new_pg_upmap.empty()) {
12357 ss << "no valid upmap items(pairs) is specified";
12358 err = -EINVAL;
12359 goto reply;
12360 }
12361
12362 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
12363 new_pg_upmap.begin(), new_pg_upmap.end());
12364 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
12365 }
12366 break;
12367
12368 case OP_RM_PG_UPMAP:
12369 {
12370 pending_inc.old_pg_upmap.insert(pgid);
12371 ss << "clear " << pgid << " pg_upmap mapping";
12372 }
12373 break;
12374
12375 case OP_PG_UPMAP_ITEMS:
12376 {
12377 vector<int64_t> id_vec;
12378 if (!cmd_getval(cmdmap, "id", id_vec)) {
12379 ss << "unable to parse 'id' value(s) '"
12380 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12381 err = -EINVAL;
12382 goto reply;
12383 }
12384
12385 if (id_vec.size() % 2) {
12386 ss << "you must specify pairs of osd ids to be remapped";
12387 err = -EINVAL;
12388 goto reply;
12389 }
12390
12391 int pool_size = osdmap.get_pg_pool_size(pgid);
12392 if ((int)(id_vec.size() / 2) > pool_size) {
12393 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
12394 << pool_size << ")";
12395 err = -EINVAL;
12396 goto reply;
12397 }
12398
12399 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
12400 ostringstream items;
12401 items << "[";
12402 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
12403 int from = *p++;
12404 int to = *p;
12405 if (from == to) {
12406 ss << "from osd." << from << " == to osd." << to << ", ";
12407 continue;
12408 }
12409 if (!osdmap.exists(from)) {
12410 ss << "osd." << from << " does not exist";
12411 err = -ENOENT;
12412 goto reply;
12413 }
12414 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
12415 ss << "osd." << to << " does not exist";
12416 err = -ENOENT;
12417 goto reply;
12418 }
12419 pair<int32_t,int32_t> entry = make_pair(from, to);
12420 auto it = std::find(new_pg_upmap_items.begin(),
12421 new_pg_upmap_items.end(), entry);
12422 if (it != new_pg_upmap_items.end()) {
12423 ss << "osd." << from << " -> osd." << to << " already exists, ";
12424 continue;
12425 }
12426 new_pg_upmap_items.push_back(entry);
12427 items << from << "->" << to << ",";
12428 }
12429 string out(items.str());
12430 out.resize(out.size() - 1); // drop last ','
12431 out += "]";
12432
12433 if (new_pg_upmap_items.empty()) {
12434 ss << "no valid upmap items(pairs) is specified";
12435 err = -EINVAL;
12436 goto reply;
12437 }
12438
12439 pending_inc.new_pg_upmap_items[pgid] =
12440 mempool::osdmap::vector<pair<int32_t,int32_t>>(
12441 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
12442 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
12443 }
12444 break;
12445
12446 case OP_RM_PG_UPMAP_ITEMS:
12447 {
12448 pending_inc.old_pg_upmap_items.insert(pgid);
12449 ss << "clear " << pgid << " pg_upmap_items mapping";
12450 }
12451 break;
12452
12453 case OP_PG_UPMAP_PRIMARY:
12454 {
12455 int64_t id;
12456 if (!cmd_getval(cmdmap, "id", id)) {
12457 ss << "invalid osd id value '"
12458 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12459 err = -EINVAL;
12460 goto reply;
12461 }
12462 if (id != CRUSH_ITEM_NONE && !osdmap.exists(id)) {
12463 ss << "osd." << id << " does not exist";
12464 err = -ENOENT;
12465 goto reply;
12466 }
12467 vector<int> acting;
12468 int primary;
12469 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12470 if (id == primary) {
12471 ss << "osd." << id << " is already primary for pg " << pgid;
12472 err = -EINVAL;
12473 goto reply;
12474 }
12475 int found_idx = 0;
12476 for (int i = 1 ; i < (int)acting.size(); i++) { // skip 0 on purpose
12477 if (acting[i] == id) {
12478 found_idx = i;
12479 break;
12480 }
12481 }
12482 if (found_idx == 0) {
12483 ss << "osd." << id << " is not in acting set for pg " << pgid;
12484 err = -EINVAL;
12485 goto reply;
12486 }
12487 vector<int> new_acting(acting);
12488 new_acting[found_idx] = new_acting[0];
12489 new_acting[0] = id;
12490 int pool_size = osdmap.get_pg_pool_size(pgid);
12491 if (osdmap.crush->verify_upmap(cct, osdmap.get_pg_pool_crush_rule(pgid),
12492 pool_size, new_acting) >= 0) {
12493 ss << "change primary for pg " << pgid << " to osd." << id;
12494 }
12495 else {
12496 ss << "can't change primary for pg " << pgid << " to osd." << id
12497 << " - illegal pg after the change";
12498 err = -EINVAL;
12499 goto reply;
12500 }
12501 pending_inc.new_pg_upmap_primary[pgid] = id;
12502 //TO-REMOVE:
12503 ldout(cct, 20) << "pg " << pgid << ": set pg_upmap_primary to " << id << dendl;
12504 }
12505 break;
12506
12507 case OP_RM_PG_UPMAP_PRIMARY:
12508 {
12509 pending_inc.old_pg_upmap_primary.insert(pgid);
12510 ss << "clear " << pgid << " pg_upmap_primary mapping";
12511 }
12512 break;
12513
12514 default:
12515 ceph_abort_msg("invalid upmap option");
12516 }
12517
12518 goto update;
12519 } else if (prefix == "osd primary-affinity") {
12520 int64_t id;
12521 if (!cmd_getval(cmdmap, "id", id)) {
12522 ss << "invalid osd id value '"
12523 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12524 err = -EINVAL;
12525 goto reply;
12526 }
12527 double w;
12528 if (!cmd_getval(cmdmap, "weight", w)) {
12529 ss << "unable to parse 'weight' value '"
12530 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12531 err = -EINVAL;
12532 goto reply;
12533 }
12534 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12535 if (ww < 0L) {
12536 ss << "weight must be >= 0";
12537 err = -EINVAL;
12538 goto reply;
12539 }
12540 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12541 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12542 ss << "require_min_compat_client "
12543 << osdmap.require_min_compat_client
12544 << " < firefly, which is required for primary-affinity";
12545 err = -EPERM;
12546 goto reply;
12547 }
12548 if (osdmap.exists(id)) {
12549 pending_inc.new_primary_affinity[id] = ww;
12550 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12551 getline(ss, rs);
12552 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12553 get_last_committed() + 1));
12554 return true;
12555 } else {
12556 ss << "osd." << id << " does not exist";
12557 err = -ENOENT;
12558 goto reply;
12559 }
12560 } else if (prefix == "osd reweight") {
12561 int64_t id;
12562 if (!cmd_getval(cmdmap, "id", id)) {
12563 ss << "unable to parse osd id value '"
12564 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12565 err = -EINVAL;
12566 goto reply;
12567 }
12568 double w;
12569 if (!cmd_getval(cmdmap, "weight", w)) {
12570 ss << "unable to parse weight value '"
12571 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12572 err = -EINVAL;
12573 goto reply;
12574 }
12575 long ww = (int)((double)CEPH_OSD_IN*w);
12576 if (ww < 0L) {
12577 ss << "weight must be >= 0";
12578 err = -EINVAL;
12579 goto reply;
12580 }
12581 if (osdmap.exists(id)) {
12582 pending_inc.new_weight[id] = ww;
12583 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12584 getline(ss, rs);
12585 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12586 get_last_committed() + 1));
12587 return true;
12588 } else {
12589 ss << "osd." << id << " does not exist";
12590 err = -ENOENT;
12591 goto reply;
12592 }
12593 } else if (prefix == "osd reweightn") {
12594 map<int32_t, uint32_t> weights;
12595 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12596 if (err) {
12597 ss << "unable to parse 'weights' value '"
12598 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12599 goto reply;
12600 }
12601 pending_inc.new_weight.insert(weights.begin(), weights.end());
12602 wait_for_finished_proposal(
12603 op,
12604 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12605 return true;
12606 } else if (prefix == "osd lost") {
12607 int64_t id;
12608 if (!cmd_getval(cmdmap, "id", id)) {
12609 ss << "unable to parse osd id value '"
12610 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12611 err = -EINVAL;
12612 goto reply;
12613 }
12614 bool sure = false;
12615 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12616 if (!sure) {
12617 ss << "are you SURE? this might mean real, permanent data loss. pass "
12618 "--yes-i-really-mean-it if you really do.";
12619 err = -EPERM;
12620 goto reply;
12621 } else if (!osdmap.exists(id)) {
12622 ss << "osd." << id << " does not exist";
12623 err = -ENOENT;
12624 goto reply;
12625 } else if (!osdmap.is_down(id)) {
12626 ss << "osd." << id << " is not down";
12627 err = -EBUSY;
12628 goto reply;
12629 } else {
12630 epoch_t e = osdmap.get_info(id).down_at;
12631 pending_inc.new_lost[id] = e;
12632 ss << "marked osd lost in epoch " << e;
12633 getline(ss, rs);
12634 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12635 get_last_committed() + 1));
12636 return true;
12637 }
12638
12639 } else if (prefix == "osd destroy-actual" ||
12640 prefix == "osd purge-actual" ||
12641 prefix == "osd purge-new") {
12642 /* Destroying an OSD means that we don't expect to further make use of
12643 * the OSDs data (which may even become unreadable after this operation),
12644 * and that we are okay with scrubbing all its cephx keys and config-key
12645 * data (which may include lockbox keys, thus rendering the osd's data
12646 * unreadable).
12647 *
12648 * The OSD will not be removed. Instead, we will mark it as destroyed,
12649 * such that a subsequent call to `create` will not reuse the osd id.
12650 * This will play into being able to recreate the OSD, at the same
12651 * crush location, with minimal data movement.
12652 */
12653
12654 // make sure authmon is writeable.
12655 if (!mon.authmon()->is_writeable()) {
12656 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12657 << "osd destroy" << dendl;
12658 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12659 return false;
12660 }
12661
12662 int64_t id;
12663 if (!cmd_getval(cmdmap, "id", id)) {
12664 auto p = cmdmap.find("id");
12665 if (p == cmdmap.end()) {
12666 ss << "no osd id specified";
12667 } else {
12668 ss << "unable to parse osd id value '"
12669 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12670 }
12671 err = -EINVAL;
12672 goto reply;
12673 }
12674
12675 bool is_destroy = (prefix == "osd destroy-actual");
12676 if (!is_destroy) {
12677 ceph_assert("osd purge-actual" == prefix ||
12678 "osd purge-new" == prefix);
12679 }
12680
12681 bool sure = false;
12682 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12683 if (!sure) {
12684 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12685 << "This will mean real, permanent data loss, as well "
12686 << "as deletion of cephx and lockbox keys. "
12687 << "Pass --yes-i-really-mean-it if you really do.";
12688 err = -EPERM;
12689 goto reply;
12690 } else if (!osdmap.exists(id)) {
12691 ss << "osd." << id << " does not exist";
12692 err = 0; // idempotent
12693 goto reply;
12694 } else if (osdmap.is_up(id)) {
12695 ss << "osd." << id << " is not `down`.";
12696 err = -EBUSY;
12697 goto reply;
12698 } else if (is_destroy && osdmap.is_destroyed(id)) {
12699 ss << "destroyed osd." << id;
12700 err = 0;
12701 goto reply;
12702 }
12703
12704 if (prefix == "osd purge-new" &&
12705 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12706 ss << "osd." << id << " is not new";
12707 err = -EPERM;
12708 goto reply;
12709 }
12710
12711 bool goto_reply = false;
12712
12713 paxos.plug();
12714 if (is_destroy) {
12715 err = prepare_command_osd_destroy(id, ss);
12716 // we checked above that it should exist.
12717 ceph_assert(err != -ENOENT);
12718 } else {
12719 err = prepare_command_osd_purge(id, ss);
12720 if (err == -ENOENT) {
12721 err = 0;
12722 ss << "osd." << id << " does not exist.";
12723 goto_reply = true;
12724 }
12725 }
12726 paxos.unplug();
12727
12728 if (err < 0 || goto_reply) {
12729 goto reply;
12730 }
12731
12732 if (is_destroy) {
12733 ss << "destroyed osd." << id;
12734 } else {
12735 ss << "purged osd." << id;
12736 }
12737
12738 getline(ss, rs);
12739 wait_for_finished_proposal(op,
12740 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12741 force_immediate_propose();
12742 return true;
12743
12744 } else if (prefix == "osd new") {
12745
12746 // make sure authmon is writeable.
12747 if (!mon.authmon()->is_writeable()) {
12748 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12749 << "osd new" << dendl;
12750 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12751 return false;
12752 }
12753
12754 // make sure kvmon is writeable.
12755 if (!mon.kvmon()->is_writeable()) {
12756 dout(10) << __func__ << " waiting for kv mon to be writeable for "
12757 << "osd new" << dendl;
12758 mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12759 return false;
12760 }
12761
12762 map<string,string> param_map;
12763
12764 bufferlist bl = m->get_data();
12765 string param_json = bl.to_str();
12766 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12767
12768 err = get_json_str_map(param_json, ss, &param_map);
12769 if (err < 0)
12770 goto reply;
12771
12772 dout(20) << __func__ << " osd new params " << param_map << dendl;
12773
12774 paxos.plug();
12775 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12776 paxos.unplug();
12777
12778 if (err < 0) {
12779 goto reply;
12780 }
12781
12782 if (f) {
12783 f->flush(rdata);
12784 } else {
12785 rdata.append(ss);
12786 }
12787
12788 if (err == EEXIST) {
12789 // idempotent operation
12790 err = 0;
12791 goto reply;
12792 }
12793
12794 wait_for_finished_proposal(op,
12795 new Monitor::C_Command(mon, op, 0, rs, rdata,
12796 get_last_committed() + 1));
12797 force_immediate_propose();
12798 return true;
12799
12800 } else if (prefix == "osd create") {
12801
12802 // optional id provided?
12803 int64_t id = -1, cmd_id = -1;
12804 if (cmd_getval(cmdmap, "id", cmd_id)) {
12805 if (cmd_id < 0) {
12806 ss << "invalid osd id value '" << cmd_id << "'";
12807 err = -EINVAL;
12808 goto reply;
12809 }
12810 dout(10) << " osd create got id " << cmd_id << dendl;
12811 }
12812
12813 uuid_d uuid;
12814 string uuidstr;
12815 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12816 if (!uuid.parse(uuidstr.c_str())) {
12817 ss << "invalid uuid value '" << uuidstr << "'";
12818 err = -EINVAL;
12819 goto reply;
12820 }
12821 // we only care about the id if we also have the uuid, to
12822 // ensure the operation's idempotency.
12823 id = cmd_id;
12824 }
12825
12826 int32_t new_id = -1;
12827 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12828 if (err < 0) {
12829 if (err == -EAGAIN) {
12830 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12831 return true;
12832 }
12833 // a check has failed; reply to the user.
12834 goto reply;
12835
12836 } else if (err == EEXIST) {
12837 // this is an idempotent operation; we can go ahead and reply.
12838 if (f) {
12839 f->open_object_section("created_osd");
12840 f->dump_int("osdid", new_id);
12841 f->close_section();
12842 f->flush(rdata);
12843 } else {
12844 ss << new_id;
12845 rdata.append(ss);
12846 }
12847 err = 0;
12848 goto reply;
12849 }
12850
12851 string empty_device_class;
12852 do_osd_create(id, uuid, empty_device_class, &new_id);
12853
12854 if (f) {
12855 f->open_object_section("created_osd");
12856 f->dump_int("osdid", new_id);
12857 f->close_section();
12858 f->flush(rdata);
12859 } else {
12860 ss << new_id;
12861 rdata.append(ss);
12862 }
12863 wait_for_finished_proposal(op,
12864 new Monitor::C_Command(mon, op, 0, rs, rdata,
12865 get_last_committed() + 1));
12866 return true;
12867
12868 } else if (prefix == "osd blocklist clear" ||
12869 prefix == "osd blacklist clear") {
12870 pending_inc.new_blocklist.clear();
12871 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12872 std::list<std::pair<entity_addr_t,utime_t > > range_b;
12873 osdmap.get_blocklist(&blocklist, &range_b);
12874 for (const auto &entry : blocklist) {
12875 pending_inc.old_blocklist.push_back(entry.first);
12876 }
12877 for (const auto &entry : range_b) {
12878 pending_inc.old_range_blocklist.push_back(entry.first);
12879 }
12880 ss << " removed all blocklist entries";
12881 getline(ss, rs);
12882 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12883 get_last_committed() + 1));
12884 return true;
12885 } else if (prefix == "osd blocklist" ||
12886 prefix == "osd blacklist") {
12887 string addrstr, rangestr;
12888 bool range = false;
12889 cmd_getval(cmdmap, "addr", addrstr);
12890 if (cmd_getval(cmdmap, "range", rangestr)) {
12891 if (rangestr == "range") {
12892 range = true;
12893 } else {
12894 ss << "Did you mean to specify \"osd blocklist range\"?";
12895 err = -EINVAL;
12896 goto reply;
12897 }
12898 }
12899 entity_addr_t addr;
12900 if (!addr.parse(addrstr)) {
12901 ss << "unable to parse address " << addrstr;
12902 err = -EINVAL;
12903 goto reply;
12904 }
12905 else {
12906 if (range) {
12907 if (!addr.maybe_cidr()) {
12908 ss << "You specified a range command, but " << addr
12909 << " does not parse as a CIDR range";
12910 err = -EINVAL;
12911 goto reply;
12912 }
12913 addr.type = entity_addr_t::TYPE_CIDR;
12914 err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
12915 if (err) {
12916 goto reply;
12917 }
12918 if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
12919 (addr.is_ipv6() && addr.get_nonce() > 128)) {
12920 ss << "Too many bits in range for that protocol!";
12921 err = -EINVAL;
12922 goto reply;
12923 }
12924 } else {
12925 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12926 // always blocklist type ANY
12927 addr.set_type(entity_addr_t::TYPE_ANY);
12928 } else {
12929 addr.set_type(entity_addr_t::TYPE_LEGACY);
12930 }
12931 }
12932
12933 string blocklistop;
12934 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12935 cmd_getval(cmdmap, "blacklistop", blocklistop);
12936 }
12937 if (blocklistop == "add") {
12938 utime_t expires = ceph_clock_now();
12939 // default one hour
12940 double d = cmd_getval_or<double>(cmdmap, "expire",
12941 g_conf()->mon_osd_blocklist_default_expire);
12942 expires += d;
12943
12944 auto add_to_pending_blocklists = [](auto& nb, auto& ob,
12945 const auto& addr,
12946 const auto& expires) {
12947 nb[addr] = expires;
12948 // cancel any pending un-blocklisting request too
12949 auto it = std::find(ob.begin(),
12950 ob.end(), addr);
12951 if (it != ob.end()) {
12952 ob.erase(it);
12953 }
12954 };
12955 if (range) {
12956 add_to_pending_blocklists(pending_inc.new_range_blocklist,
12957 pending_inc.old_range_blocklist,
12958 addr, expires);
12959
12960 } else {
12961 add_to_pending_blocklists(pending_inc.new_blocklist,
12962 pending_inc.old_blocklist,
12963 addr, expires);
12964 }
12965
12966 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12967 getline(ss, rs);
12968 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12969 get_last_committed() + 1));
12970 return true;
12971 } else if (blocklistop == "rm") {
12972 auto rm_from_pending_blocklists = [](const auto& addr,
12973 auto& blocklist,
12974 auto& ob, auto& pb) {
12975 if (blocklist.count(addr)) {
12976 ob.push_back(addr);
12977 return true;
12978 } else if (pb.count(addr)) {
12979 pb.erase(addr);
12980 return true;
12981 }
12982 return false;
12983 };
12984 if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
12985 pending_inc.old_blocklist,
12986 pending_inc.new_blocklist)) ||
12987 (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
12988 pending_inc.old_range_blocklist,
12989 pending_inc.new_range_blocklist))) {
12990 ss << "un-blocklisting " << addr;
12991 getline(ss, rs);
12992 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12993 get_last_committed() + 1));
12994 return true;
12995 }
12996 ss << addr << " isn't blocklisted";
12997 err = 0;
12998 goto reply;
12999 }
13000 }
13001 } else if (prefix == "osd pool mksnap") {
13002 string poolstr;
13003 cmd_getval(cmdmap, "pool", poolstr);
13004 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13005 if (pool < 0) {
13006 ss << "unrecognized pool '" << poolstr << "'";
13007 err = -ENOENT;
13008 goto reply;
13009 }
13010 string snapname;
13011 cmd_getval(cmdmap, "snap", snapname);
13012 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13013 if (p->is_unmanaged_snaps_mode()) {
13014 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13015 err = -EINVAL;
13016 goto reply;
13017 } else if (p->snap_exists(snapname.c_str())) {
13018 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13019 err = 0;
13020 goto reply;
13021 } else if (p->is_tier()) {
13022 ss << "pool " << poolstr << " is a cache tier";
13023 err = -EINVAL;
13024 goto reply;
13025 }
13026 pg_pool_t *pp = 0;
13027 if (pending_inc.new_pools.count(pool))
13028 pp = &pending_inc.new_pools[pool];
13029 if (!pp) {
13030 pp = &pending_inc.new_pools[pool];
13031 *pp = *p;
13032 }
13033 if (pp->snap_exists(snapname.c_str())) {
13034 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13035 } else {
13036 if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(pool)) {
13037 dout(20) << "pool-level snapshots have been disabled for pools "
13038 "attached to an fs - poolid:" << pool << dendl;
13039 err = -EOPNOTSUPP;
13040 goto reply;
13041 }
13042 pp->add_snap(snapname.c_str(), ceph_clock_now());
13043 pp->set_snap_epoch(pending_inc.epoch);
13044 ss << "created pool " << poolstr << " snap " << snapname;
13045 }
13046 getline(ss, rs);
13047 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13048 get_last_committed() + 1));
13049 return true;
13050 } else if (prefix == "osd pool rmsnap") {
13051 string poolstr;
13052 cmd_getval(cmdmap, "pool", poolstr);
13053 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13054 if (pool < 0) {
13055 ss << "unrecognized pool '" << poolstr << "'";
13056 err = -ENOENT;
13057 goto reply;
13058 }
13059 string snapname;
13060 cmd_getval(cmdmap, "snap", snapname);
13061 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13062 if (p->is_unmanaged_snaps_mode()) {
13063 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13064 err = -EINVAL;
13065 goto reply;
13066 } else if (!p->snap_exists(snapname.c_str())) {
13067 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
13068 err = 0;
13069 goto reply;
13070 }
13071 pg_pool_t *pp = 0;
13072 if (pending_inc.new_pools.count(pool))
13073 pp = &pending_inc.new_pools[pool];
13074 if (!pp) {
13075 pp = &pending_inc.new_pools[pool];
13076 *pp = *p;
13077 }
13078 snapid_t sn = pp->snap_exists(snapname.c_str());
13079 if (sn) {
13080 pp->remove_snap(sn);
13081 pp->set_snap_epoch(pending_inc.epoch);
13082 ss << "removed pool " << poolstr << " snap " << snapname;
13083 } else {
13084 ss << "already removed pool " << poolstr << " snap " << snapname;
13085 }
13086 getline(ss, rs);
13087 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13088 get_last_committed() + 1));
13089 return true;
13090 } else if (prefix == "osd pool create") {
13091 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
13092 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
13093 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
13094 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
13095 string pool_type_str;
13096 cmd_getval(cmdmap, "pool_type", pool_type_str);
13097 if (pool_type_str.empty())
13098 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
13099
13100 string poolstr;
13101 cmd_getval(cmdmap, "pool", poolstr);
13102 bool confirm = false;
13103 //confirmation may be set to true only by internal operations.
13104 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13105 if (poolstr[0] == '.' && !confirm) {
13106 ss << "pool names beginning with . are not allowed";
13107 err = 0;
13108 goto reply;
13109 }
13110 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13111 if (pool_id >= 0) {
13112 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13113 if (pool_type_str != p->get_type_name()) {
13114 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
13115 err = -EINVAL;
13116 } else {
13117 ss << "pool '" << poolstr << "' already exists";
13118 err = 0;
13119 }
13120 goto reply;
13121 }
13122
13123 int pool_type;
13124 if (pool_type_str == "replicated") {
13125 pool_type = pg_pool_t::TYPE_REPLICATED;
13126 } else if (pool_type_str == "erasure") {
13127 pool_type = pg_pool_t::TYPE_ERASURE;
13128 } else {
13129 ss << "unknown pool type '" << pool_type_str << "'";
13130 err = -EINVAL;
13131 goto reply;
13132 }
13133
13134 bool implicit_rule_creation = false;
13135 int64_t expected_num_objects = 0;
13136 string rule_name;
13137 cmd_getval(cmdmap, "rule", rule_name);
13138 string erasure_code_profile;
13139 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
13140
13141 if (pool_type == pg_pool_t::TYPE_ERASURE) {
13142 if (erasure_code_profile == "")
13143 erasure_code_profile = "default";
13144 //handle the erasure code profile
13145 if (erasure_code_profile == "default") {
13146 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
13147 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
13148 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
13149 goto wait;
13150 }
13151
13152 map<string,string> profile_map;
13153 err = osdmap.get_erasure_code_profile_default(cct,
13154 profile_map,
13155 &ss);
13156 if (err)
13157 goto reply;
13158 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
13159 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
13160 goto wait;
13161 }
13162 }
13163 if (rule_name == "") {
13164 implicit_rule_creation = true;
13165 if (erasure_code_profile == "default") {
13166 rule_name = "erasure-code";
13167 } else {
13168 dout(1) << "implicitly use rule named after the pool: "
13169 << poolstr << dendl;
13170 rule_name = poolstr;
13171 }
13172 }
13173 expected_num_objects =
13174 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13175 } else {
13176 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13177 // and put expected_num_objects to rule field
13178 if (erasure_code_profile != "") { // cmd is from CLI
13179 if (rule_name != "") {
13180 string interr;
13181 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
13182 if (interr.length()) {
13183 ss << "error parsing integer value '" << rule_name << "': " << interr;
13184 err = -EINVAL;
13185 goto reply;
13186 }
13187 }
13188 rule_name = erasure_code_profile;
13189 } else { // cmd is well-formed
13190 expected_num_objects =
13191 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13192 }
13193 }
13194
13195 if (!implicit_rule_creation && rule_name != "") {
13196 int rule;
13197 err = get_crush_rule(rule_name, &rule, &ss);
13198 if (err == -EAGAIN) {
13199 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13200 return true;
13201 }
13202 if (err)
13203 goto reply;
13204 }
13205
13206 if (expected_num_objects < 0) {
13207 ss << "'expected_num_objects' must be non-negative";
13208 err = -EINVAL;
13209 goto reply;
13210 }
13211
13212 set<int32_t> osds;
13213 osdmap.get_all_osds(osds);
13214 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
13215 string type;
13216 if (!get_osd_objectstore_type(osd, &type)) {
13217 return type == "filestore";
13218 } else {
13219 return false;
13220 }
13221 });
13222
13223 if (has_filestore_osd &&
13224 expected_num_objects > 0 &&
13225 cct->_conf->filestore_merge_threshold > 0) {
13226 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13227 err = -EINVAL;
13228 goto reply;
13229 }
13230
13231 if (has_filestore_osd &&
13232 expected_num_objects == 0 &&
13233 cct->_conf->filestore_merge_threshold < 0) {
13234 int osds = osdmap.get_num_osds();
13235 bool sure = false;
13236 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13237 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
13238 ss << "For better initial performance on pools expected to store a "
13239 << "large number of objects, consider supplying the "
13240 << "expected_num_objects parameter when creating the pool."
13241 << " Pass --yes-i-really-mean-it to ignore it";
13242 err = -EPERM;
13243 goto reply;
13244 }
13245 }
13246
13247 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
13248 FastReadType fast_read = FAST_READ_DEFAULT;
13249 if (fast_read_param == 0)
13250 fast_read = FAST_READ_OFF;
13251 else if (fast_read_param > 0)
13252 fast_read = FAST_READ_ON;
13253
13254 int64_t repl_size = 0;
13255 cmd_getval(cmdmap, "size", repl_size);
13256 int64_t target_size_bytes = 0;
13257 double target_size_ratio = 0.0;
13258 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
13259 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
13260
13261 string pg_autoscale_mode;
13262 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
13263
13264 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
13265
13266 bool crimson = cmd_getval_or<bool>(cmdmap, "crimson", false) ||
13267 cct->_conf.get_val<bool>("osd_pool_default_crimson");
13268
13269 err = prepare_new_pool(poolstr,
13270 -1, // default crush rule
13271 rule_name,
13272 pg_num, pgp_num, pg_num_min, pg_num_max,
13273 repl_size, target_size_bytes, target_size_ratio,
13274 erasure_code_profile, pool_type,
13275 (uint64_t)expected_num_objects,
13276 fast_read,
13277 pg_autoscale_mode,
13278 bulk,
13279 crimson,
13280 &ss);
13281 if (err < 0) {
13282 switch(err) {
13283 case -EEXIST:
13284 ss << "pool '" << poolstr << "' already exists";
13285 break;
13286 case -EAGAIN:
13287 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13288 return true;
13289 case -ERANGE:
13290 goto reply;
13291 default:
13292 goto reply;
13293 break;
13294 }
13295 } else {
13296 ss << "pool '" << poolstr << "' created";
13297 }
13298 getline(ss, rs);
13299 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13300 get_last_committed() + 1));
13301 return true;
13302
13303 } else if (prefix == "osd pool delete" ||
13304 prefix == "osd pool rm") {
13305 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13306 string poolstr, poolstr2, sure;
13307 cmd_getval(cmdmap, "pool", poolstr);
13308 cmd_getval(cmdmap, "pool2", poolstr2);
13309 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13310 if (pool < 0) {
13311 ss << "pool '" << poolstr << "' does not exist";
13312 err = 0;
13313 goto reply;
13314 }
13315
13316 bool force_no_fake = false;
13317 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13318 bool force = false;
13319 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13320 if (poolstr2 != poolstr ||
13321 (!force && !force_no_fake)) {
13322 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13323 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13324 << "followed by --yes-i-really-really-mean-it.";
13325 err = -EPERM;
13326 goto reply;
13327 }
13328 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13329 if (err == -EAGAIN) {
13330 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13331 return true;
13332 }
13333 if (err < 0)
13334 goto reply;
13335 goto update;
13336 } else if (prefix == "osd pool rename") {
13337 string srcpoolstr, destpoolstr;
13338 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13339 cmd_getval(cmdmap, "destpool", destpoolstr);
13340 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13341 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13342 bool confirm = false;
13343 //confirmation may be set to true only by internal operations.
13344 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13345 if (destpoolstr[0] == '.' && !confirm) {
13346 ss << "pool names beginning with . are not allowed";
13347 err = 0;
13348 goto reply;
13349 }
13350 if (pool_src < 0) {
13351 if (pool_dst >= 0) {
13352 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13353 // of operations, assume this rename succeeded, as it is not changing
13354 // the current state. Make sure we output something understandable
13355 // for whoever is issuing the command, if they are paying attention,
13356 // in case it was not intentional; or to avoid a "wtf?" and a bug
13357 // report in case it was intentional, while expecting a failure.
13358 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13359 << destpoolstr << "' does -- assuming successful rename";
13360 err = 0;
13361 } else {
13362 ss << "unrecognized pool '" << srcpoolstr << "'";
13363 err = -ENOENT;
13364 }
13365 goto reply;
13366 } else if (pool_dst >= 0) {
13367 // source pool exists and so does the destination pool
13368 ss << "pool '" << destpoolstr << "' already exists";
13369 err = -EEXIST;
13370 goto reply;
13371 }
13372
13373 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13374 if (ret == 0) {
13375 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13376 } else {
13377 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13378 << cpp_strerror(ret);
13379 }
13380 getline(ss, rs);
13381 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13382 get_last_committed() + 1));
13383 return true;
13384
13385 } else if (prefix == "osd pool set") {
13386 err = prepare_command_pool_set(cmdmap, ss);
13387 if (err == -EAGAIN)
13388 goto wait;
13389 if (err < 0)
13390 goto reply;
13391
13392 getline(ss, rs);
13393 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13394 get_last_committed() + 1));
13395 return true;
13396 } else if (prefix == "osd tier add") {
13397 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13398 if (err == -EAGAIN)
13399 goto wait;
13400 if (err)
13401 goto reply;
13402 string poolstr;
13403 cmd_getval(cmdmap, "pool", poolstr);
13404 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13405 if (pool_id < 0) {
13406 ss << "unrecognized pool '" << poolstr << "'";
13407 err = -ENOENT;
13408 goto reply;
13409 }
13410 string tierpoolstr;
13411 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13412 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13413 if (tierpool_id < 0) {
13414 ss << "unrecognized pool '" << tierpoolstr << "'";
13415 err = -ENOENT;
13416 goto reply;
13417 }
13418 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13419 ceph_assert(p);
13420 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13421 ceph_assert(tp);
13422
13423 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13424 goto reply;
13425 }
13426
13427 // make sure new tier is empty
13428 bool force_nonempty = false;
13429 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13430 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13431 if (pstats && pstats->stats.sum.num_objects != 0 &&
13432 !force_nonempty) {
13433 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13434 err = -ENOTEMPTY;
13435 goto reply;
13436 }
13437 if (tp->is_erasure()) {
13438 ss << "tier pool '" << tierpoolstr
13439 << "' is an ec pool, which cannot be a tier";
13440 err = -ENOTSUP;
13441 goto reply;
13442 }
13443 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13444 (!force_nonempty ||
13445 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13446 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13447 err = -ENOTEMPTY;
13448 goto reply;
13449 }
13450 // go
13451 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13452 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13453 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13454 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13455 return true;
13456 }
13457 np->tiers.insert(tierpool_id);
13458 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13459 ntp->tier_of = pool_id;
13460 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13462 get_last_committed() + 1));
13463 return true;
13464 } else if (prefix == "osd tier remove" ||
13465 prefix == "osd tier rm") {
13466 string poolstr;
13467 cmd_getval(cmdmap, "pool", poolstr);
13468 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13469 if (pool_id < 0) {
13470 ss << "unrecognized pool '" << poolstr << "'";
13471 err = -ENOENT;
13472 goto reply;
13473 }
13474 string tierpoolstr;
13475 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13476 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13477 if (tierpool_id < 0) {
13478 ss << "unrecognized pool '" << tierpoolstr << "'";
13479 err = -ENOENT;
13480 goto reply;
13481 }
13482 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13483 ceph_assert(p);
13484 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13485 ceph_assert(tp);
13486
13487 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13488 goto reply;
13489 }
13490
13491 if (p->tiers.count(tierpool_id) == 0) {
13492 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13493 err = 0;
13494 goto reply;
13495 }
13496 if (tp->tier_of != pool_id) {
13497 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13498 << osdmap.get_pool_name(tp->tier_of) << "': "
13499 // be scary about it; this is an inconsistency and bells must go off
13500 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13501 err = -EINVAL;
13502 goto reply;
13503 }
13504 if (p->read_tier == tierpool_id) {
13505 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13506 err = -EBUSY;
13507 goto reply;
13508 }
13509 // go
13510 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13511 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13512 if (np->tiers.count(tierpool_id) == 0 ||
13513 ntp->tier_of != pool_id ||
13514 np->read_tier == tierpool_id) {
13515 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13516 return true;
13517 }
13518 np->tiers.erase(tierpool_id);
13519 ntp->clear_tier();
13520 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13521 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13522 get_last_committed() + 1));
13523 return true;
13524 } else if (prefix == "osd tier set-overlay") {
13525 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13526 if (err == -EAGAIN)
13527 goto wait;
13528 if (err)
13529 goto reply;
13530 string poolstr;
13531 cmd_getval(cmdmap, "pool", poolstr);
13532 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13533 if (pool_id < 0) {
13534 ss << "unrecognized pool '" << poolstr << "'";
13535 err = -ENOENT;
13536 goto reply;
13537 }
13538 string overlaypoolstr;
13539 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13540 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13541 if (overlaypool_id < 0) {
13542 ss << "unrecognized pool '" << overlaypoolstr << "'";
13543 err = -ENOENT;
13544 goto reply;
13545 }
13546 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13547 ceph_assert(p);
13548 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13549 ceph_assert(overlay_p);
13550 if (p->tiers.count(overlaypool_id) == 0) {
13551 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13552 err = -EINVAL;
13553 goto reply;
13554 }
13555 if (p->read_tier == overlaypool_id) {
13556 err = 0;
13557 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13558 goto reply;
13559 }
13560 if (p->has_read_tier()) {
13561 ss << "pool '" << poolstr << "' has overlay '"
13562 << osdmap.get_pool_name(p->read_tier)
13563 << "'; please remove-overlay first";
13564 err = -EINVAL;
13565 goto reply;
13566 }
13567
13568 // go
13569 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13570 np->read_tier = overlaypool_id;
13571 np->write_tier = overlaypool_id;
13572 np->set_last_force_op_resend(pending_inc.epoch);
13573 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13574 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13575 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13576 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13577 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13578 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13579 get_last_committed() + 1));
13580 return true;
13581 } else if (prefix == "osd tier remove-overlay" ||
13582 prefix == "osd tier rm-overlay") {
13583 string poolstr;
13584 cmd_getval(cmdmap, "pool", poolstr);
13585 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13586 if (pool_id < 0) {
13587 ss << "unrecognized pool '" << poolstr << "'";
13588 err = -ENOENT;
13589 goto reply;
13590 }
13591 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13592 ceph_assert(p);
13593 if (!p->has_read_tier()) {
13594 err = 0;
13595 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13596 goto reply;
13597 }
13598
13599 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13600 goto reply;
13601 }
13602
13603 // go
13604 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13605 if (np->has_read_tier()) {
13606 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13607 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13608 nop->set_last_force_op_resend(pending_inc.epoch);
13609 }
13610 if (np->has_write_tier()) {
13611 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13612 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13613 nop->set_last_force_op_resend(pending_inc.epoch);
13614 }
13615 np->clear_read_tier();
13616 np->clear_write_tier();
13617 np->set_last_force_op_resend(pending_inc.epoch);
13618 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13619 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13620 get_last_committed() + 1));
13621 return true;
13622 } else if (prefix == "osd tier cache-mode") {
13623 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13624 if (err == -EAGAIN)
13625 goto wait;
13626 if (err)
13627 goto reply;
13628 string poolstr;
13629 cmd_getval(cmdmap, "pool", poolstr);
13630 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13631 if (pool_id < 0) {
13632 ss << "unrecognized pool '" << poolstr << "'";
13633 err = -ENOENT;
13634 goto reply;
13635 }
13636 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13637 ceph_assert(p);
13638 if (!p->is_tier()) {
13639 ss << "pool '" << poolstr << "' is not a tier";
13640 err = -EINVAL;
13641 goto reply;
13642 }
13643 string modestr;
13644 cmd_getval(cmdmap, "mode", modestr);
13645 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13646 if (int(mode) < 0) {
13647 ss << "'" << modestr << "' is not a valid cache mode";
13648 err = -EINVAL;
13649 goto reply;
13650 }
13651
13652 bool sure = false;
13653 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13654
13655 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13656 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13657 ss << "'" << modestr << "' is no longer a supported cache mode";
13658 err = -EPERM;
13659 goto reply;
13660 }
13661 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13662 mode != pg_pool_t::CACHEMODE_NONE &&
13663 mode != pg_pool_t::CACHEMODE_PROXY &&
13664 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13665 !sure) {
13666 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13667 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13668 err = -EPERM;
13669 goto reply;
13670 }
13671
13672 // pool already has this cache-mode set and there are no pending changes
13673 if (p->cache_mode == mode &&
13674 (pending_inc.new_pools.count(pool_id) == 0 ||
13675 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13676 ss << "set cache-mode for pool '" << poolstr << "'"
13677 << " to " << pg_pool_t::get_cache_mode_name(mode);
13678 err = 0;
13679 goto reply;
13680 }
13681
13682 /* Mode description:
13683 *
13684 * none: No cache-mode defined
13685 * forward: Forward all reads and writes to base pool [removed]
13686 * writeback: Cache writes, promote reads from base pool
13687 * readonly: Forward writes to base pool
13688 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13689 * proxy: Proxy all reads and writes to base pool
13690 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13691 *
13692 * Hence, these are the allowed transitions:
13693 *
13694 * none -> any
13695 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13696 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13697 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13698 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13699 * writeback -> readproxy || proxy
13700 * readonly -> any
13701 */
13702
13703 // We check if the transition is valid against the current pool mode, as
13704 // it is the only committed state thus far. We will blantly squash
13705 // whatever mode is on the pending state.
13706
13707 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13708 (mode != pg_pool_t::CACHEMODE_PROXY &&
13709 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13710 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13711 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13712 << "' pool; only '"
13713 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
13714 << "','"
13715 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13716 << "' allowed.";
13717 err = -EINVAL;
13718 goto reply;
13719 }
13720 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13721 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13722 mode != pg_pool_t::CACHEMODE_PROXY &&
13723 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13724
13725 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13726 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13727 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13728
13729 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13730 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13731 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13732
13733 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13734 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13735 mode != pg_pool_t::CACHEMODE_PROXY &&
13736 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13737
13738 const pool_stat_t* pstats =
13739 mon.mgrstatmon()->get_pool_stat(pool_id);
13740
13741 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13742 ss << "unable to set cache-mode '"
13743 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13744 << "': dirty objects found";
13745 err = -EBUSY;
13746 goto reply;
13747 }
13748 }
13749 // go
13750 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13751 np->cache_mode = mode;
13752 // set this both when moving to and from cache_mode NONE. this is to
13753 // capture legacy pools that were set up before this flag existed.
13754 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13755 ss << "set cache-mode for pool '" << poolstr
13756 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13757 if (mode == pg_pool_t::CACHEMODE_NONE) {
13758 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13759 ceph_assert(base_pool);
13760 if (base_pool->read_tier == pool_id ||
13761 base_pool->write_tier == pool_id)
13762 ss <<" (WARNING: pool is still configured as read or write tier)";
13763 }
13764 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13765 get_last_committed() + 1));
13766 return true;
13767 } else if (prefix == "osd tier add-cache") {
13768 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13769 if (err == -EAGAIN)
13770 goto wait;
13771 if (err)
13772 goto reply;
13773 string poolstr;
13774 cmd_getval(cmdmap, "pool", poolstr);
13775 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13776 if (pool_id < 0) {
13777 ss << "unrecognized pool '" << poolstr << "'";
13778 err = -ENOENT;
13779 goto reply;
13780 }
13781 string tierpoolstr;
13782 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13783 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13784 if (tierpool_id < 0) {
13785 ss << "unrecognized pool '" << tierpoolstr << "'";
13786 err = -ENOENT;
13787 goto reply;
13788 }
13789 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13790 ceph_assert(p);
13791 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13792 ceph_assert(tp);
13793
13794 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13795 goto reply;
13796 }
13797
13798 int64_t size = 0;
13799 if (!cmd_getval(cmdmap, "size", size)) {
13800 ss << "unable to parse 'size' value '"
13801 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13802 err = -EINVAL;
13803 goto reply;
13804 }
13805 // make sure new tier is empty
13806 const pool_stat_t *pstats =
13807 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13808 if (pstats && pstats->stats.sum.num_objects != 0) {
13809 ss << "tier pool '" << tierpoolstr << "' is not empty";
13810 err = -ENOTEMPTY;
13811 goto reply;
13812 }
13813 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13814 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13815 if (int(mode) < 0) {
13816 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13817 err = -EINVAL;
13818 goto reply;
13819 }
13820 HitSet::Params hsp;
13821 auto& cache_hit_set_type =
13822 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13823 if (cache_hit_set_type == "bloom") {
13824 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13825 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13826 hsp = HitSet::Params(bsp);
13827 } else if (cache_hit_set_type == "explicit_hash") {
13828 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13829 } else if (cache_hit_set_type == "explicit_object") {
13830 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13831 } else {
13832 ss << "osd tier cache default hit set type '"
13833 << cache_hit_set_type << "' is not a known type";
13834 err = -EINVAL;
13835 goto reply;
13836 }
13837 // go
13838 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13839 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13840 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13841 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13842 return true;
13843 }
13844 np->tiers.insert(tierpool_id);
13845 np->read_tier = np->write_tier = tierpool_id;
13846 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13847 np->set_last_force_op_resend(pending_inc.epoch);
13848 ntp->set_last_force_op_resend(pending_inc.epoch);
13849 ntp->tier_of = pool_id;
13850 ntp->cache_mode = mode;
13851 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13852 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13853 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13854 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13855 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13856 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13857 ntp->hit_set_params = hsp;
13858 ntp->target_max_bytes = size;
13859 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13860 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13861 get_last_committed() + 1));
13862 return true;
13863 } else if (prefix == "osd pool set-quota") {
13864 string poolstr;
13865 cmd_getval(cmdmap, "pool", poolstr);
13866 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13867 if (pool_id < 0) {
13868 ss << "unrecognized pool '" << poolstr << "'";
13869 err = -ENOENT;
13870 goto reply;
13871 }
13872
13873 string field;
13874 cmd_getval(cmdmap, "field", field);
13875 if (field != "max_objects" && field != "max_bytes") {
13876 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13877 err = -EINVAL;
13878 goto reply;
13879 }
13880
13881 // val could contain unit designations, so we treat as a string
13882 string val;
13883 cmd_getval(cmdmap, "val", val);
13884 string tss;
13885 int64_t value;
13886 if (field == "max_objects") {
13887 value = strict_si_cast<uint64_t>(val, &tss);
13888 } else if (field == "max_bytes") {
13889 value = strict_iecstrtoll(val, &tss);
13890 } else {
13891 ceph_abort_msg("unrecognized option");
13892 }
13893 if (!tss.empty()) {
13894 ss << "error parsing value '" << val << "': " << tss;
13895 err = -EINVAL;
13896 goto reply;
13897 }
13898
13899 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13900 if (field == "max_objects") {
13901 pi->quota_max_objects = value;
13902 } else if (field == "max_bytes") {
13903 pi->quota_max_bytes = value;
13904 } else {
13905 ceph_abort_msg("unrecognized option");
13906 }
13907 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13908 rs = ss.str();
13909 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13910 get_last_committed() + 1));
13911 return true;
13912 } else if (prefix == "osd pool application enable" ||
13913 prefix == "osd pool application disable" ||
13914 prefix == "osd pool application set" ||
13915 prefix == "osd pool application rm") {
13916 err = prepare_command_pool_application(prefix, cmdmap, ss);
13917 if (err == -EAGAIN) {
13918 goto wait;
13919 } else if (err < 0) {
13920 goto reply;
13921 } else {
13922 goto update;
13923 }
13924 } else if (prefix == "osd force-create-pg") {
13925 pg_t pgid;
13926 string pgidstr;
13927 err = parse_pgid(cmdmap, ss, pgid, pgidstr);
13928 if (err < 0)
13929 goto reply;
13930 bool sure = false;
13931 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13932 if (!sure) {
13933 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13934 << "that the cluster will give up ever trying to recover the lost data. Do this "
13935 << "only if you are certain that all copies of the PG are in fact lost and you are "
13936 << "willing to accept that the data is permanently destroyed. Pass "
13937 << "--yes-i-really-mean-it to proceed.";
13938 err = -EPERM;
13939 goto reply;
13940 }
13941 bool creating_now;
13942 {
13943 std::lock_guard<std::mutex> l(creating_pgs_lock);
13944 auto emplaced = creating_pgs.pgs.emplace(
13945 pgid,
13946 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13947 ceph_clock_now()));
13948 creating_now = emplaced.second;
13949 }
13950 if (creating_now) {
13951 ss << "pg " << pgidstr << " now creating, ok";
13952 // set the pool's CREATING flag so that (1) the osd won't ignore our
13953 // create message and (2) we won't propose any future pg_num changes
13954 // until after the PG has been instantiated.
13955 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13956 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13957 }
13958 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13959 err = 0;
13960 goto update;
13961 } else {
13962 ss << "pg " << pgid << " already creating";
13963 err = 0;
13964 goto reply;
13965 }
13966 } else if (prefix == "osd force_healthy_stretch_mode") {
13967 bool sure = false;
13968 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13969 if (!sure) {
13970 ss << "This command will require peering across multiple CRUSH buckets "
13971 "(probably two data centers or availability zones?) and may result in PGs "
13972 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13973 err = -EPERM;
13974 goto reply;
13975 }
13976 try_end_recovery_stretch_mode(true);
13977 ss << "Triggering healthy stretch mode";
13978 err = 0;
13979 goto reply;
13980 } else if (prefix == "osd force_recovery_stretch_mode") {
13981 bool sure = false;
13982 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13983 if (!sure) {
13984 ss << "This command will increase pool sizes to try and spread them "
13985 "across multiple CRUSH buckets (probably two data centers or "
13986 "availability zones?) and should have happened automatically"
13987 "Pass --yes-i-really-mean-it to proceed.";
13988 err = -EPERM;
13989 goto reply;
13990 }
13991 mon.go_recovery_stretch_mode();
13992 ss << "Triggering recovery stretch mode";
13993 err = 0;
13994 goto reply;
13995 } else if (prefix == "osd set-allow-crimson") {
13996
13997 bool sure = false;
13998 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13999
14000 bool experimental_enabled =
14001 g_ceph_context->check_experimental_feature_enabled("crimson");
14002 if (!sure || !experimental_enabled) {
14003 ss << "This command will allow usage of crimson-osd osd daemons. "
14004 << "crimson-osd is not considered stable and will likely cause "
14005 << "crashes or data corruption. At this time, crimson-osd is mainly "
14006 << "useful for performance evaluation, testing, and development. "
14007 << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
14008 << "the experimental features config. This setting is irrevocable.";
14009 err = -EPERM;
14010 goto reply;
14011 }
14012
14013 err = 0;
14014 if (osdmap.get_allow_crimson()) {
14015 goto reply;
14016 } else {
14017 pending_inc.set_allow_crimson();
14018 goto update;
14019 }
14020 } else {
14021 err = -EINVAL;
14022 }
14023
14024 reply:
14025 getline(ss, rs);
14026 if (err < 0 && rs.length() == 0)
14027 rs = cpp_strerror(err);
14028 mon.reply_command(op, err, rs, rdata, get_last_committed());
14029 return ret;
14030
14031 update:
14032 getline(ss, rs);
14033 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
14034 get_last_committed() + 1));
14035 return true;
14036
14037 wait:
14038 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14039 return true;
14040 }
14041
14042 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
14043 {
14044 op->mark_osdmon_event(__func__);
14045
14046 auto m = op->get_req<MPoolOp>();
14047 MonSession *session = op->get_session();
14048 if (!session) {
14049 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14050 return true;
14051 }
14052
14053 switch (m->op) {
14054 case POOL_OP_CREATE_UNMANAGED_SNAP:
14055 case POOL_OP_DELETE_UNMANAGED_SNAP:
14056 {
14057 const std::string* pool_name = nullptr;
14058 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
14059 if (pg_pool != nullptr) {
14060 pool_name = &osdmap.get_pool_name(m->pool);
14061 }
14062
14063 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
14064 session->entity_name, session->caps,
14065 session->get_peer_socket_addr(),
14066 pool_name)) {
14067 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
14068 << "privileges. message: " << *m << std::endl
14069 << "caps: " << session->caps << dendl;
14070 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14071 return true;
14072 }
14073 }
14074 break;
14075 default:
14076 if (!session->is_capable("osd", MON_CAP_W)) {
14077 dout(0) << "got pool op from entity with insufficient privileges. "
14078 << "message: " << *m << std::endl
14079 << "caps: " << session->caps << dendl;
14080 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14081 return true;
14082 }
14083 break;
14084 }
14085
14086 return false;
14087 }
14088
14089 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
14090 {
14091 op->mark_osdmon_event(__func__);
14092 auto m = op->get_req<MPoolOp>();
14093
14094 if (enforce_pool_op_caps(op)) {
14095 return true;
14096 }
14097
14098 if (m->fsid != mon.monmap->fsid) {
14099 dout(0) << __func__ << " drop message on fsid " << m->fsid
14100 << " != " << mon.monmap->fsid << " for " << *m << dendl;
14101 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14102 return true;
14103 }
14104
14105 if (m->op == POOL_OP_CREATE)
14106 return preprocess_pool_op_create(op);
14107
14108 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
14109 if (p == nullptr) {
14110 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
14111 if (m->op == POOL_OP_DELETE) {
14112 _pool_op_reply(op, 0, osdmap.get_epoch());
14113 } else {
14114 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
14115 }
14116 return true;
14117 }
14118
14119 // check if the snap and snapname exist
14120 bool snap_exists = false;
14121 if (p->snap_exists(m->name.c_str()))
14122 snap_exists = true;
14123
14124 switch (m->op) {
14125 case POOL_OP_CREATE_SNAP:
14126 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
14127 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14128 return true;
14129 }
14130 if (snap_exists) {
14131 _pool_op_reply(op, 0, osdmap.get_epoch());
14132 return true;
14133 }
14134 return false;
14135 case POOL_OP_CREATE_UNMANAGED_SNAP:
14136 if (p->is_pool_snaps_mode()) {
14137 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14138 return true;
14139 }
14140 return false;
14141 case POOL_OP_DELETE_SNAP:
14142 if (p->is_unmanaged_snaps_mode()) {
14143 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14144 return true;
14145 }
14146 if (!snap_exists) {
14147 _pool_op_reply(op, 0, osdmap.get_epoch());
14148 return true;
14149 }
14150 return false;
14151 case POOL_OP_DELETE_UNMANAGED_SNAP:
14152 if (p->is_pool_snaps_mode()) {
14153 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14154 return true;
14155 }
14156 if (_is_removed_snap(m->pool, m->snapid)) {
14157 _pool_op_reply(op, 0, osdmap.get_epoch());
14158 return true;
14159 }
14160 return false;
14161 case POOL_OP_DELETE:
14162 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
14163 _pool_op_reply(op, 0, osdmap.get_epoch());
14164 return true;
14165 }
14166 return false;
14167 case POOL_OP_AUID_CHANGE:
14168 return false;
14169 default:
14170 ceph_abort();
14171 break;
14172 }
14173
14174 return false;
14175 }
14176
14177 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
14178 {
14179 if (!osdmap.have_pg_pool(pool)) {
14180 dout(10) << __func__ << " pool " << pool << " snap " << snap
14181 << " - pool dne" << dendl;
14182 return true;
14183 }
14184 if (osdmap.in_removed_snaps_queue(pool, snap)) {
14185 dout(10) << __func__ << " pool " << pool << " snap " << snap
14186 << " - in osdmap removed_snaps_queue" << dendl;
14187 return true;
14188 }
14189 snapid_t begin, end;
14190 int r = lookup_purged_snap(pool, snap, &begin, &end);
14191 if (r == 0) {
14192 dout(10) << __func__ << " pool " << pool << " snap " << snap
14193 << " - purged, [" << begin << "," << end << ")" << dendl;
14194 return true;
14195 }
14196 return false;
14197 }
14198
14199 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
14200 {
14201 if (pending_inc.old_pools.count(pool)) {
14202 dout(10) << __func__ << " pool " << pool << " snap " << snap
14203 << " - pool pending deletion" << dendl;
14204 return true;
14205 }
14206 if (pending_inc.in_new_removed_snaps(pool, snap)) {
14207 dout(10) << __func__ << " pool " << pool << " snap " << snap
14208 << " - in pending new_removed_snaps" << dendl;
14209 return true;
14210 }
14211 return false;
14212 }
14213
14214 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
14215 {
14216 op->mark_osdmon_event(__func__);
14217 auto m = op->get_req<MPoolOp>();
14218 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
14219 if (pool >= 0) {
14220 _pool_op_reply(op, 0, osdmap.get_epoch());
14221 return true;
14222 }
14223
14224 return false;
14225 }
14226
/**
 * Apply a (non-create, non-delete) pool op to the pending incremental map.
 *
 * Handles pool-managed and unmanaged snapshot create/delete. Replies
 * immediately (returning false) on validation failures or idempotent
 * no-ops caught here; otherwise stages the change in pending_inc and
 * defers the reply until the proposal commits (returning true).
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  // pool create/delete have dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // snapshot creation via this path is refused for pools attached to a
  // CephFS filesystem (the MDS owns snapshots there)
  if (m->op == POOL_OP_CREATE_SNAP ||
      m->op == POOL_OP_CREATE_UNMANAGED_SNAP) {
    if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(m->pool)) {
      dout(20) << "monitor-managed snapshots have been disabled for pools "
	  " attached to an fs - pool:" << m->pool << dendl;
      _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
      return false;
    }
  }

  // First pass: validate against the *committed* pool state; reply early
  // on errors and idempotent cases.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is a no-op
      // success (ret stays 0); otherwise break to stage the change below
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any already-pending update so
  // multiple ops in one proposal window compose correctly
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check
  // against the *projected* state, which may differ from the committed one
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: actually mutate the projected pool info.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      // snap_exists() returns the snapid (0 when absent)
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus clusters need the legacy removed_snaps encoding
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is the reply payload for the client
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's snap_seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with ret and any payload) once the pending map commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14391
14392 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14393 {
14394 op->mark_osdmon_event(__func__);
14395 int err = prepare_new_pool(op);
14396 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14397 return true;
14398 }
14399
14400 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14401 ostream *ss)
14402 {
14403 const string& poolstr = osdmap.get_pool_name(pool_id);
14404
14405 // If the Pool is in use by CephFS, refuse to delete it
14406 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14407 if (pending_fsmap.pool_in_use(pool_id)) {
14408 *ss << "pool '" << poolstr << "' is in use by CephFS";
14409 return -EBUSY;
14410 }
14411
14412 if (pool.tier_of >= 0) {
14413 *ss << "pool '" << poolstr << "' is a tier of '"
14414 << osdmap.get_pool_name(pool.tier_of) << "'";
14415 return -EBUSY;
14416 }
14417 if (!pool.tiers.empty()) {
14418 *ss << "pool '" << poolstr << "' has tiers";
14419 for(auto tier : pool.tiers) {
14420 *ss << " " << osdmap.get_pool_name(tier);
14421 }
14422 return -EBUSY;
14423 }
14424
14425 if (!g_conf()->mon_allow_pool_delete) {
14426 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14427 return -EPERM;
14428 }
14429
14430 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14431 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14432 return -EPERM;
14433 }
14434
14435 *ss << "pool '" << poolstr << "' removed";
14436 return 0;
14437 }
14438
14439 /**
14440 * Check if it is safe to add a tier to a base pool
14441 *
14442 * @return
14443 * True if the operation should proceed, false if we should abort here
14444 * (abort doesn't necessarily mean error, could be idempotency)
14445 */
14446 bool OSDMonitor::_check_become_tier(
14447 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14448 const int64_t base_pool_id, const pg_pool_t *base_pool,
14449 int *err,
14450 ostream *ss) const
14451 {
14452 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14453 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14454
14455 if (tier_pool->is_crimson()) {
14456 *ss << "pool '" << tier_pool_name << "' is a crimson pool, tiering "
14457 << "features are not supported";
14458 *err = -EINVAL;
14459 return false;
14460 }
14461 if (base_pool->is_crimson()) {
14462 *ss << "pool '" << base_pool_name << "' is a crimson pool, tiering "
14463 << "features are not supported";
14464 *err = -EINVAL;
14465 return false;
14466 }
14467
14468 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14469 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14470 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14471 *err = -EBUSY;
14472 return false;
14473 }
14474
14475 if (base_pool->tiers.count(tier_pool_id)) {
14476 ceph_assert(tier_pool->tier_of == base_pool_id);
14477 *err = 0;
14478 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14479 << base_pool_name << "'";
14480 return false;
14481 }
14482
14483 if (base_pool->is_tier()) {
14484 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14485 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14486 << "multiple tiers are not yet supported.";
14487 *err = -EINVAL;
14488 return false;
14489 }
14490
14491 if (tier_pool->has_tiers()) {
14492 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14493 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14494 it != tier_pool->tiers.end(); ++it)
14495 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14496 *ss << " multiple tiers are not yet supported.";
14497 *err = -EINVAL;
14498 return false;
14499 }
14500
14501 if (tier_pool->is_tier()) {
14502 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14503 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14504 *err = -EINVAL;
14505 return false;
14506 }
14507
14508 *err = 0;
14509 return true;
14510 }
14511
14512
14513 /**
14514 * Check if it is safe to remove a tier from this base pool
14515 *
14516 * @return
14517 * True if the operation should proceed, false if we should abort here
14518 * (abort doesn't necessarily mean error, could be idempotency)
14519 */
14520 bool OSDMonitor::_check_remove_tier(
14521 const int64_t base_pool_id, const pg_pool_t *base_pool,
14522 const pg_pool_t *tier_pool,
14523 int *err, ostream *ss) const
14524 {
14525 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14526
14527 // Apply CephFS-specific checks
14528 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14529 if (pending_fsmap.pool_in_use(base_pool_id)) {
14530 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14531 // If the underlying pool is erasure coded and does not allow EC
14532 // overwrites, we can't permit the removal of the replicated tier that
14533 // CephFS relies on to access it
14534 *ss << "pool '" << base_pool_name <<
14535 "' does not allow EC overwrites and is in use by CephFS"
14536 " via its tier";
14537 *err = -EBUSY;
14538 return false;
14539 }
14540
14541 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14542 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14543 "tier is still in use as a writeback cache. Change the cache "
14544 "mode and flush the cache before removing it";
14545 *err = -EBUSY;
14546 return false;
14547 }
14548 }
14549
14550 *err = 0;
14551 return true;
14552 }
14553
// Queue deletion of a pool in pending_inc, together with cleanup of all
// osdmap state that references it (pg_temp, primary_temp, pg_upmap,
// pg_upmap_items and crush choose_args).
// Returns 0 on success or no-op, -EAGAIN if pending state needs to be
// proposed first, or a negative error from _check_remove_pool().
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  if (pending_inc.old_pools.count(pool)) {
    // idempotent: deletion already queued in this epoch
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    // "fake" deletion: keep the pool's data but rename it out of the way
    // so an administrator can still recover it later.
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      // an empty pg_temp entry in the incremental erases the mapping
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      // -1 in the incremental clears the primary_temp mapping
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush = _get_pending_crush();
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14668
14669 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14670 {
14671 dout(10) << "_prepare_rename_pool " << pool << dendl;
14672 if (pending_inc.old_pools.count(pool)) {
14673 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14674 return -ENOENT;
14675 }
14676 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14677 p != pending_inc.new_pool_names.end();
14678 ++p) {
14679 if (p->second == newname && p->first != pool) {
14680 return -EEXIST;
14681 }
14682 }
14683
14684 pending_inc.new_pool_names[pool] = newname;
14685 return 0;
14686 }
14687
14688 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14689 {
14690 op->mark_osdmon_event(__func__);
14691 auto m = op->get_req<MPoolOp>();
14692 ostringstream ss;
14693 int ret = _prepare_remove_pool(m->pool, &ss, false);
14694 if (ret == -EAGAIN) {
14695 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14696 return true;
14697 }
14698 if (ret < 0)
14699 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14700 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14701 pending_inc.epoch));
14702 return true;
14703 }
14704
14705 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14706 int ret, epoch_t epoch, bufferlist *blp)
14707 {
14708 op->mark_osdmon_event(__func__);
14709 auto m = op->get_req<MPoolOp>();
14710 dout(20) << "_pool_op_reply " << ret << dendl;
14711 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14712 ret, epoch, get_last_committed(), blp);
14713 mon.send_reply(op, reply);
14714 }
14715
// Rescale per-pool "recovery_priority" values that fall outside the
// supported [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] range so they
// fit, preserving relative ordering; updated pools go into pending_inc.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // first pass: find the extremes across all pools that set the option
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio = 0;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // second pass: scale each out-of-range priority proportionally
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;  // working copy; committed via pending_inc

    int64_t prio = 0;
    pool.opts.get(key, &prio);  // pools without the option keep prio == 0
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      // prio == 0 or already within the representable side of the range
      continue;
    }
    if (n == 0) {
      // priority scaled down to nothing; drop the option entirely
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}
14767
14768 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14769 int *errcode,
14770 set<pg_pool_t*>* pools,
14771 const string& new_crush_rule)
14772 {
14773 dout(20) << __func__ << dendl;
14774 *okay = false;
14775 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14776 if (new_crush_rule_result < 0) {
14777 ss << "unrecognized crush rule " << new_crush_rule_result;
14778 *errcode = new_crush_rule_result;
14779 return;
14780 }
14781 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14782 for (const auto& pooli : osdmap.pools) {
14783 int64_t poolid = pooli.first;
14784 const pg_pool_t *p = &pooli.second;
14785 if (!p->is_replicated()) {
14786 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14787 *errcode = -EINVAL;
14788 return;
14789 }
14790 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14791 if ((p->get_size() != default_size ||
14792 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14793 (p->get_crush_rule() != new_rule)) {
14794 ss << "we currently require stretch mode pools start out with the"
14795 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14796 *errcode = -EINVAL;
14797 return;
14798 }
14799 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14800 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14801 // the attempt may fail and then we have these pool updates...but they won't do anything
14802 // if there is a failure, so if it's hard to change the interface, no need to bother
14803 pools->insert(pp);
14804 }
14805 *okay = true;
14806 return;
14807 }
14808
14809 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14810 int *errcode, bool commit,
14811 const string& dividing_bucket,
14812 uint32_t bucket_count,
14813 const set<pg_pool_t*>& pools,
14814 const string& new_crush_rule)
14815 {
14816 dout(20) << __func__ << dendl;
14817 *okay = false;
14818 CrushWrapper crush = _get_pending_crush();
14819 int dividing_id = -1;
14820 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14821 !type_id.has_value()) {
14822 ss << dividing_bucket << " is not a valid crush bucket type";
14823 *errcode = -ENOENT;
14824 ceph_assert(!commit);
14825 return;
14826 } else {
14827 dividing_id = *type_id;
14828 }
14829 vector<int> subtrees;
14830 crush.get_subtree_of_type(dividing_id, &subtrees);
14831 if (subtrees.size() != 2) {
14832 ss << "there are " << subtrees.size() << dividing_bucket
14833 << "'s in the cluster but stretch mode currently only works with 2!";
14834 *errcode = -EINVAL;
14835 ceph_assert(!commit || subtrees.size() == 2);
14836 return;
14837 }
14838
14839 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14840 if (new_crush_rule_result < 0) {
14841 ss << "unrecognized crush rule " << new_crush_rule;
14842 *errcode = new_crush_rule_result;
14843 ceph_assert(!commit || (new_crush_rule_result > 0));
14844 return;
14845 }
14846 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14847
14848 int weight1 = crush.get_item_weight(subtrees[0]);
14849 int weight2 = crush.get_item_weight(subtrees[1]);
14850 if (weight1 != weight2) {
14851 // TODO: I'm really not sure this is a good idea?
14852 ss << "the 2 " << dividing_bucket
14853 << "instances in the cluster have differing weights "
14854 << weight1 << " and " << weight2
14855 <<" but stretch mode currently requires they be the same!";
14856 *errcode = -EINVAL;
14857 ceph_assert(!commit || (weight1 == weight2));
14858 return;
14859 }
14860 if (bucket_count != 2) {
14861 ss << "currently we only support 2-site stretch clusters!";
14862 *errcode = -EINVAL;
14863 ceph_assert(!commit || bucket_count == 2);
14864 return;
14865 }
14866 // TODO: check CRUSH rules for pools so that we are appropriately divided
14867 if (commit) {
14868 for (auto pool : pools) {
14869 pool->crush_rule = new_rule;
14870 pool->peering_crush_bucket_count = bucket_count;
14871 pool->peering_crush_bucket_target = bucket_count;
14872 pool->peering_crush_bucket_barrier = dividing_id;
14873 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14874 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14875 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14876 }
14877 pending_inc.change_stretch_mode = true;
14878 pending_inc.stretch_mode_enabled = true;
14879 pending_inc.new_stretch_bucket_count = bucket_count;
14880 pending_inc.new_degraded_stretch_mode = 0;
14881 pending_inc.new_stretch_mode_bucket = dividing_id;
14882 }
14883 *okay = true;
14884 return;
14885 }
14886
14887 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14888 set<int> *really_down_buckets,
14889 set<string> *really_down_mons)
14890 {
14891 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14892 ceph_assert(is_readable());
14893 if (dead_buckets.empty()) return false;
14894 set<int> down_cache;
14895 bool really_down = false;
14896 for (auto dbi : dead_buckets) {
14897 const string& bucket_name = dbi.first;
14898 ceph_assert(osdmap.crush->name_exists(bucket_name));
14899 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14900 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14901 << " to see if OSDs are also down" << dendl;
14902 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14903 if (subtree_down) {
14904 dout(20) << "subtree is down!" << dendl;
14905 really_down = true;
14906 really_down_buckets->insert(bucket_id);
14907 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14908 }
14909 }
14910 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14911 << " and mons " << *really_down_mons << " are really down" << dendl;
14912 return really_down;
14913 }
14914
14915 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14916 const set<string>& live_zones)
14917 {
14918 dout(20) << __func__ << dendl;
14919 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14920 // update the general OSDMap changes
14921 pending_inc.change_stretch_mode = true;
14922 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14923 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14924 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14925 ceph_assert(new_site_count == 1); // stretch count 2!
14926 pending_inc.new_degraded_stretch_mode = new_site_count;
14927 pending_inc.new_recovering_stretch_mode = 0;
14928 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14929
14930 // and then apply them to all the pg_pool_ts
14931 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14932 const string& remaining_site_name = *(live_zones.begin());
14933 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14934 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14935 for (auto pgi : osdmap.pools) {
14936 if (pgi.second.peering_crush_bucket_count) {
14937 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14938 newp.peering_crush_bucket_count = new_site_count;
14939 newp.peering_crush_mandatory_member = remaining_site;
14940 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14941 newp.set_last_force_op_resend(pending_inc.epoch);
14942 }
14943 }
14944 propose_pending();
14945 }
14946
14947 void OSDMonitor::trigger_recovery_stretch_mode()
14948 {
14949 dout(20) << __func__ << dendl;
14950 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14951 pending_inc.change_stretch_mode = true;
14952 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14953 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14954 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14955 pending_inc.new_recovering_stretch_mode = 1;
14956 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14957
14958 for (auto pgi : osdmap.pools) {
14959 if (pgi.second.peering_crush_bucket_count) {
14960 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14961 newp.set_last_force_op_resend(pending_inc.epoch);
14962 }
14963 }
14964 propose_pending();
14965 }
14966
// Note that we have entered degraded stretch mode: clear the recovery
// timestamp so a later recovery phase starts its wait from scratch.
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14971
// Note that we have entered recovering stretch mode: stamp the start time
// (used by try_end_recovery_stretch_mode() to enforce a minimum wait),
// but only if it is not already set.
void OSDMonitor::set_recovery_stretch_mode()
{
  if (stretch_recovery_triggered.is_zero()) {
    stretch_recovery_triggered = ceph_clock_now();
  }
}
14978
// Note that stretch mode is healthy again: clear the recovery timestamp.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14983
14984 void OSDMonitor::notify_new_pg_digest()
14985 {
14986 dout(20) << __func__ << dendl;
14987 if (!stretch_recovery_triggered.is_zero()) {
14988 try_end_recovery_stretch_mode(false);
14989 }
14990 }
14991
14992 struct CMonExitRecovery : public Context {
14993 OSDMonitor *m;
14994 bool force;
14995 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14996 void finish(int r) {
14997 m->try_end_recovery_stretch_mode(force);
14998 }
14999 };
15000
// Attempt to leave recovering stretch mode and return to healthy.  Only
// the leader acts; requires readable state (otherwise re-queues itself via
// CMonExitRecovery).  Unless force is set, a minimum wait must have
// elapsed since recovery started and PG stats must show no degraded,
// inactive, or unknown PGs.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // retry once our state is readable again
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // equivalent to: now - trigger_time > min_wait (or force)
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // need fresh PG stats; retry when mgrstat is readable
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced objects are tolerated; they don't block leaving recovery
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
15030
15031 void OSDMonitor::trigger_healthy_stretch_mode()
15032 {
15033 ceph_assert(is_writeable());
15034 stretch_recovery_triggered.set_from_double(0);
15035 pending_inc.change_stretch_mode = true;
15036 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
15037 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
15038 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
15039 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
15040 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
15041 for (auto pgi : osdmap.pools) {
15042 if (pgi.second.peering_crush_bucket_count) {
15043 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
15044 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
15045 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
15046 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
15047 newp.set_last_force_op_resend(pending_inc.epoch);
15048 }
15049 }
15050 propose_pending();
15051 }