]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MRoute.h"
57 #include "messages/MMonGetPurgedSnaps.h"
58 #include "messages/MMonGetPurgedSnapsReply.h"
59
60 #include "common/TextTable.h"
61 #include "common/Timer.h"
62 #include "common/ceph_argparse.h"
63 #include "common/perf_counters.h"
64 #include "common/PriorityCache.h"
65 #include "common/strtol.h"
66 #include "common/numa.h"
67
68 #include "common/config.h"
69 #include "common/errno.h"
70
71 #include "erasure-code/ErasureCodePlugin.h"
72 #include "compressor/Compressor.h"
73 #include "common/Checksummer.h"
74
75 #include "include/compat.h"
76 #include "include/ceph_assert.h"
77 #include "include/stringify.h"
78 #include "include/util.h"
79 #include "common/cmdparse.h"
80 #include "include/str_list.h"
81 #include "include/str_map.h"
82 #include "include/scope_guard.h"
83 #include "perfglue/heap_profiler.h"
84
85 #include "auth/cephx/CephxKeyServer.h"
86 #include "osd/OSDCap.h"
87
88 #include "json_spirit/json_spirit_reader.h"
89
90 #include <boost/algorithm/string/predicate.hpp>
91
92 using std::dec;
93 using std::hex;
94 using std::list;
95 using std::map;
96 using std::make_pair;
97 using std::ostringstream;
98 using std::pair;
99 using std::set;
100 using std::string;
101 using std::stringstream;
102 using std::to_string;
103 using std::vector;
104
105 using ceph::bufferlist;
106 using ceph::decode;
107 using ceph::encode;
108 using ceph::ErasureCodeInterfaceRef;
109 using ceph::ErasureCodePluginRegistry;
110 using ceph::ErasureCodeProfile;
111 using ceph::Formatter;
112 using ceph::JSONFormatter;
113 using ceph::make_message;
114
115 #define dout_subsys ceph_subsys_mon
116 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
117 static const string OSD_METADATA_PREFIX("osd_metadata");
118 static const string OSD_SNAP_PREFIX("osd_snap");
119
120 /*
121
122 OSD snapshot metadata
123 ---------------------
124
125 -- starting with mimic, removed in octopus --
126
127 "removed_epoch_%llu_%08lx" % (pool, epoch)
128 -> interval_set<snapid_t>
129
130 "removed_snap_%llu_%016llx" % (pool, last_snap)
131 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
132
133
134 -- starting with mimic --
135
136 "purged_snap_%llu_%016llx" % (pool, last_snap)
137 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
138
139  - note that the {removed,purged}_snap put the last snap in the key so
140 that we can use forward iteration only to search for an epoch in an
141 interval. e.g., to test if epoch N is removed/purged, we'll find a key
142 >= N that either does or doesn't contain the given snap.
143
144
145 -- starting with octopus --
146
147 "purged_epoch_%08lx" % epoch
148 -> map<int64_t,interval_set<snapid_t>>
149
150 */
151 using namespace TOPNSPC::common;
152 namespace {
153
// Adapter that exposes an OSDMonitor-owned osdmap cache to the
// PriorityCache manager (pcm) so its memory budget can be balanced
// against the rocksdb cache.  Subclasses supply the used-byte count and
// a display name.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;
  // bytes requested/assigned per priority level (only PRI1 is used today)
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // size granted by the last commit_cache_size()
  double cache_ratio = 0;       // share of the total budget assigned to us

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // bytes currently consumed by the underlying cache (subclass-specific)
  virtual uint64_t _get_used_bytes() const = 0;

  // How many more bytes we would like at priority 'pri'.  All items are
  // accounted at PRI1; other priorities return -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // only ask for the shortfall, never a negative amount
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // total bytes assigned across every priority level
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // round the assigned total to a pcm chunk and remember it as committed
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
      get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // bin-based aging is not implemented for these caches; these are
  // required no-ops of the PriCache interface
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  virtual string get_cache_name() const = 0;
};
227
228 struct IncCache : public OSDMemCache {
229 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
230
231 virtual uint64_t _get_used_bytes() const {
232 return osdmon->inc_osd_cache.get_bytes();
233 }
234
235 virtual string get_cache_name() const {
236 return "OSDMap Inc Cache";
237 }
238
239 uint64_t _get_num_osdmaps() const {
240 return osdmon->inc_osd_cache.get_size();
241 }
242 };
243
244 struct FullCache : public OSDMemCache {
245 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
246
247 virtual uint64_t _get_used_bytes() const {
248 return osdmon->full_osd_cache.get_bytes();
249 }
250
251 virtual string get_cache_name() const {
252 return "OSDMap Full Cache";
253 }
254
255 uint64_t _get_num_osdmaps() const {
256 return osdmon->full_osd_cache.get_size();
257 }
258 };
259
260 std::shared_ptr<IncCache> inc_cache;
261 std::shared_ptr<FullCache> full_cache;
262
263 const uint32_t MAX_POOL_APPLICATIONS = 4;
264 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
265 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
266
267 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
268 // Note: this doesn't include support for the application tag match
269 if ((grant.spec.allow & OSD_CAP_W) != 0) {
270 auto& match = grant.match;
271 if (match.is_match_all()) {
272 return true;
273 } else if (pool_name != nullptr &&
274 !match.pool_namespace.pool_name.empty() &&
275 match.pool_namespace.pool_name == *pool_name) {
276 return true;
277 }
278 }
279 return false;
280 }
281
// Decide whether 'entity_name' may perform unmanaged-snapshot pool ops.
// Permission is granted if either (a) the entity's mon caps allow the
// "osd pool op unmanaged-snap" command (optionally restricted to
// 'pool_name'), or (b) its OSD caps from the auth db grant write access
// to the pool (or to all pools).  pool_name == nullptr means the pool
// does not exist, which requires an unrestricted capability.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // fall back to inspecting the entity's OSD caps stored in the auth db
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand to concrete grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
349
350 } // anonymous namespace
351
// Record that PG 'ps' of this pool reported 'last_epoch_clean'.  Keeps a
// per-PG epoch vector, maintains 'floor' (the minimum over all PGs) and
// advances 'next_missing' (the first PG that has not reported yet; an
// entry of 0 means "not reported").
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
				 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
					std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this PG was the first unreported one: scan forward to find the next
  // PG that still has no report
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
385
386 void LastEpochClean::remove_pool(uint64_t pool)
387 {
388 report_by_pool.erase(pool);
389 }
390
391 void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
392 epoch_t last_epoch_clean)
393 {
394 auto& lec = report_by_pool[pg.pool()];
395 return lec.report(pg_num, pg.ps(), last_epoch_clean);
396 }
397
398 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
399 {
400 auto floor = latest.get_epoch();
401 for (auto& pool : latest.get_pools()) {
402 auto reported = report_by_pool.find(pool.first);
403 if (reported == report_by_pool.end()) {
404 return 0;
405 }
406 if (reported->second.next_missing < pool.second.get_pg_num()) {
407 return 0;
408 }
409 if (reported->second.floor < floor) {
410 floor = reported->second.floor;
411 }
412 }
413 return floor;
414 }
415
416 void LastEpochClean::dump(Formatter *f) const
417 {
418 f->open_array_section("per_pool");
419
420 for (auto& [pool, lec] : report_by_pool) {
421 f->open_object_section("pool");
422 f->dump_unsigned("poolid", pool);
423 f->dump_unsigned("floor", lec.floor);
424 f->close_section();
425 }
426
427 f->close_section();
428 }
429
// Completion context fired when a background osdmap->PG mapping job
// finishes; on success it refreshes the creating-PGs state and notifies
// subscribers waiting on PG creations.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the mapping job was kicked off (for timing the log line)
  epoch_t epoch;  // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
447
448 #undef dout_prefix
449 #define dout_prefix _prefix(_dout, mon, osdmap)
450 static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
451 return *_dout << "mon." << mon.name << "@" << mon.rank
452 << "(" << mon.get_state_name()
453 << ").osd e" << osdmap.get_epoch() << " ";
454 }
455
// Construct the OSD monitor service: size the inc/full osdmap LRU
// caches, register the pcm cache wrappers, and subscribe to config
// changes for cache-related options.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // fall back to fixed-size LRU caches without pcm auto-tuning
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
479
480 const char **OSDMonitor::get_tracked_conf_keys() const
481 {
482 static const char* KEYS[] = {
483 "mon_memory_target",
484 "mon_memory_autotune",
485 "rocksdb_cache_size",
486 NULL
487 };
488 return KEYS;
489 }
490
// Config-observer callback: react to runtime changes of the keys listed
// in get_tracked_conf_keys() by toggling autotuning and/or resizing the
// pcm-managed caches.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
512
// Enable or disable pcm-driven cache autotuning to match the current
// value of mon_memory_autotune; updates the mon_memory_autotune member
// to reflect what actually took effect.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
534
// Apply new mon_memory_target / rocksdb_cache_size values to the
// priority cache manager.  Returns 0 on success, -EINVAL if the new
// values are invalid or the cache ratios cannot be recomputed (in which
// case the previous sizes are restored).
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation when computing the max
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
594
// Read the initial cache-sizing options; when autotuning is requested,
// validate them and prime the inc/full LRU caches with the minimum byte
// budget.  Returns -EINVAL on invalid sizes (caller falls back to
// fixed-size caches), 0 otherwise.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // NOTE(review): reuses the osd_memory_* options for the mon-side
    // base/fragmentation estimates
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
619
620 bool OSDMonitor::_have_pending_crush()
621 {
622 return pending_inc.crush.length() > 0;
623 }
624
625 CrushWrapper &OSDMonitor::_get_stable_crush()
626 {
627 return *osdmap.crush;
628 }
629
630 CrushWrapper OSDMonitor::_get_pending_crush()
631 {
632 bufferlist bl;
633 if (pending_inc.crush.length())
634 bl = pending_inc.crush;
635 else
636 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
637
638 auto p = bl.cbegin();
639 CrushWrapper crush;
640 crush.decode(p);
641 return crush;
642 }
643
// Build the very first osdmap (epoch 1) for a new cluster and stage it
// in pending_inc as a full map: either decode an mkfs-provided osdmap
// from the store or build a simple default one, then apply default
// flags, full-ratio settings and release requirements.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios configured as percentages (>1.0) are converted to fractions
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_reef")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
      derr << __func__ << " mon_debug_no_require_reef and quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    } else {
      derr << __func__ << " mon_debug_no_require_reef=true" << dendl;
      newmap.require_osd_release = ceph_release_t::quincy;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::reef;
  }

  // abort on an unparseable min-compat-client option; a bad value here
  // would silently mis-gate client features
  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
702
703 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
704 {
705 s.insert(service_name);
706 s.insert(OSD_PG_CREATING_PREFIX);
707 s.insert(OSD_METADATA_PREFIX);
708 s.insert(OSD_SNAP_PREFIX);
709 }
710
// Bring the in-memory osdmap up to date with what paxos has committed:
// locate/load the newest stored full map, then apply each newer
// incremental (persisting re-encoded full maps as we go, and resetting
// to the primary's canonical encoding on CRC mismatch), and finally
// refresh derived state: down->pending-out tracking, subscriptions,
// messenger features, mapping job and stretch-mode transitions.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // remembered so the stretch-mode logic below can tell whether OSDs came up
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // jump ahead to the newest stored full map instead of replaying
    // every incremental from our current epoch
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
	dout(10) << __func__
		 << " Error while registering osdmon caches with pcm."
		 << " Proceeding without cache auto tuning."
		 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE(review): the statements between each dout(20) and its
	// *_dout << dendl appear to live inside the scope opened by the
	// dout macro (hence two distinct 'jf' locals) — confirm against
	// dout_impl before touching this
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long catch-up doesn't build one giant txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd);
      }
    }
    for (auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
	// manually marked out, so drop it
	osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // refresh the down -> pending-out tracking for the new map
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
	marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
	mon.set_degraded_stretch_mode();
	dout(20) << "prev_num_up_osd: " << prev_num_up_osd << dendl;
	dout(20) << "osdmap.num_up_osd: " << osdmap.num_up_osd << dendl;
	dout(20) << "osdmap.num_osd: " << osdmap.num_osd << dendl;
	dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") << dendl;
	// enough OSDs came back up: consider switching to recovery mode
	if (prev_num_up_osd < osdmap.num_up_osd &&
	    (osdmap.num_up_osd / (double)osdmap.num_osd) >
	    cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") &&
	    mon.dead_mon_buckets.size() == 0) {
	  // TODO: This works for 2-site clusters when the OSD maps are appropriately
	  // trimmed and everything is "normal" but not if you have a lot of out OSDs
	  // you're ignoring or in some really degenerate failure cases

	  dout(10) << "Enabling recovery stretch mode in this map" << dendl;
	  mon.go_recovery_stretch_mode();
	}
      } else {
	mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
	(!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
988
// Create the PriorityCache manager and register the rocksdb, inc and
// full caches with it so their memory is balanced within
// mon_memory_target.  Returns 0 on success, -EINVAL on bad sizes, no
// rocksdb priority cache, or unset-table ratios.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
1038
1039 int OSDMonitor::_set_cache_ratios()
1040 {
1041 double old_cache_kv_ratio = cache_kv_ratio;
1042
1043 // Set the cache ratios for kv(rocksdb), inc and full caches
1044 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
1045 if (cache_kv_ratio >= 1.0) {
1046 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
1047 << ") must be in range [0,<1.0]."
1048 << dendl;
1049 cache_kv_ratio = old_cache_kv_ratio;
1050 return -EINVAL;
1051 }
1052 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
1053 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
1054 inc_cache->set_cache_ratio(cache_inc_ratio);
1055 full_cache->set_cache_ratio(cache_full_ratio);
1056
1057 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
1058 << " inc ratio " << cache_inc_ratio
1059 << " full ratio " << cache_full_ratio
1060 << dendl;
1061 return 0;
1062 }
1063
1064 void OSDMonitor::start_mapping()
1065 {
1066 // initiate mapping job
1067 if (mapping_job) {
1068 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1069 << dendl;
1070 mapping_job->abort();
1071 }
1072 if (!osdmap.get_pools().empty()) {
1073 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1074 mapping_job = mapping.start_update(osdmap, mapper,
1075 g_conf()->mon_osd_mapping_pgs_per_chunk);
1076 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1077 << " at " << fin->start << dendl;
1078 mapping_job->set_finish_event(fin);
1079 } else {
1080 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1081 mapping_job = nullptr;
1082 }
1083 }
1084
1085 void OSDMonitor::update_msgr_features()
1086 {
1087 const int types[] = {
1088 entity_name_t::TYPE_OSD,
1089 entity_name_t::TYPE_CLIENT,
1090 entity_name_t::TYPE_MDS,
1091 entity_name_t::TYPE_MON
1092 };
1093 for (int type : types) {
1094 uint64_t mask;
1095 uint64_t features = osdmap.get_features(type, &mask);
1096 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1097 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1098 ceph::net::Policy p = mon.messenger->get_policy(type);
1099 p.features_required = (p.features_required & ~mask) | features;
1100 mon.messenger->set_policy(type, p);
1101 }
1102 }
1103 }
1104
1105 void OSDMonitor::on_active()
1106 {
1107 update_logger();
1108
1109 if (mon.is_leader()) {
1110 mon.clog->debug() << "osdmap " << osdmap;
1111 if (!priority_convert) {
1112 // Only do this once at start-up
1113 convert_pool_priorities();
1114 priority_convert = true;
1115 }
1116 } else {
1117 list<MonOpRequestRef> ls;
1118 take_all_failures(ls);
1119 while (!ls.empty()) {
1120 MonOpRequestRef op = ls.front();
1121 op->mark_osdmon_event(__func__);
1122 dispatch(op);
1123 ls.pop_front();
1124 }
1125 }
1126 start_mapping();
1127 }
1128
void OSDMonitor::on_restart()
{
  // Forget per-osd report timestamps; they will be repopulated as new
  // reports arrive after the restart.
  last_osd_report.clear();
}
1133
1134 void OSDMonitor::on_shutdown()
1135 {
1136 dout(10) << __func__ << dendl;
1137 if (mapping_job) {
1138 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1139 << dendl;
1140 mapping_job->abort();
1141 }
1142
1143 // discard failure info, waiters
1144 list<MonOpRequestRef> ls;
1145 take_all_failures(ls);
1146 ls.clear();
1147 }
1148
1149 void OSDMonitor::update_logger()
1150 {
1151 dout(10) << "update_logger" << dendl;
1152
1153 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1154 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1155 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1156 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1157 }
1158
1159 void OSDMonitor::create_pending()
1160 {
1161 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1162 pending_inc.fsid = mon.monmap->fsid;
1163 pending_metadata.clear();
1164 pending_metadata_rm.clear();
1165 pending_pseudo_purged_snaps.clear();
1166
1167 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1168
1169 // safety checks (this shouldn't really happen)
1170 {
1171 if (osdmap.backfillfull_ratio <= 0) {
1172 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1173 if (pending_inc.new_backfillfull_ratio > 1.0)
1174 pending_inc.new_backfillfull_ratio /= 100;
1175 dout(1) << __func__ << " setting backfillfull_ratio = "
1176 << pending_inc.new_backfillfull_ratio << dendl;
1177 }
1178 if (osdmap.full_ratio <= 0) {
1179 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1180 if (pending_inc.new_full_ratio > 1.0)
1181 pending_inc.new_full_ratio /= 100;
1182 dout(1) << __func__ << " setting full_ratio = "
1183 << pending_inc.new_full_ratio << dendl;
1184 }
1185 if (osdmap.nearfull_ratio <= 0) {
1186 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1187 if (pending_inc.new_nearfull_ratio > 1.0)
1188 pending_inc.new_nearfull_ratio /= 100;
1189 dout(1) << __func__ << " setting nearfull_ratio = "
1190 << pending_inc.new_nearfull_ratio << dendl;
1191 }
1192 }
1193 }
1194
/**
 * Fold the effects of the pending incremental into a private copy of
 * the creating-pgs bookkeeping and return it (the caller encodes it).
 *
 * @param inc the pending incremental about to be committed
 * @param nextmap the osdmap as it will look once @p inc is applied
 * @return updated creating_pgs_t snapshot for this epoch
 */
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // copy under the lock, then do the heavy work on the copy
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // queue creations for pools in the current map ...
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    // ... and for pools added by this incremental
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // drop bookkeeping for pools removed by this incremental
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  // move pgs from the per-pool queue into the in-flight set, up to the
  // configured cap on concurrently-creating pgs
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // take as many pg ids as fit under the cap from this pool's range
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        std::stringstream debug;
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          // a new interval started: stamp the history fields that
          // changed with this epoch
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1381
// Pre-populate pending_inc with pg_temp entries so acting sets carry
// over across the upcoming osdmap change.  Depending on how widespread
// the change is, prime either every pg ("all") or only the pgs mapped
// to the affected osds; both paths are bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    // a new crush map can remap anything; consider every pg
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    // newly-up osds can take over mappings anywhere
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // CEPH_OSD_UP in new_state toggles the up flag; together with
    // is_up() this selects osds going down (encode_pending logs these
    // same entries as DOWN)
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs toward this osd from anywhere
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the per-osd workload; if it would touch a large fraction
    // of all pgs anyway, fall back to priming everything
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // build the post-incremental map so we can compare current vs. next
  // mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg via the parallel mapper, bounded by the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    // prime only pgs mapped to the affected osds, checking the clock
    // every `chunk` pgs to respect the same time budget
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          // already primed via another affected osd
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1484
// Add a pg_temp entry to pending_inc pinning @p pgid to its current
// acting set if its mapping would otherwise change in @p next; skips
// cases where a temp mapping cannot help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    // still being created; nothing to preserve yet
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping, from the background mapping job's results
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending (next) map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // priming an empty acting vector clears the pg_temp instead
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // serialized with other primers running in the parallel mapper
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1532
1533 /**
1534 * @note receiving a transaction in this function gives a fair amount of
1535 * freedom to the service implementation if it does need it. It shouldn't.
1536 */
1537 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1538 {
1539 dout(10) << "encode_pending e " << pending_inc.epoch
1540 << dendl;
1541
1542 if (do_prune(t)) {
1543 dout(1) << __func__ << " osdmap full prune encoded e"
1544 << pending_inc.epoch << dendl;
1545 }
1546
1547 // finalize up pending_inc
1548 pending_inc.modified = ceph_clock_now();
1549
1550 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1551 ceph_assert(r == 0);
1552
1553 if (mapping_job) {
1554 if (!mapping_job->is_done()) {
1555 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1556 << mapping_job.get() << " did not complete, "
1557 << mapping_job->shards << " left" << dendl;
1558 mapping_job->abort();
1559 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1560 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1561 << mapping_job.get() << " is prior epoch "
1562 << mapping.get_epoch() << dendl;
1563 } else {
1564 if (g_conf()->mon_osd_prime_pg_temp) {
1565 maybe_prime_pg_temp();
1566 }
1567 }
1568 } else if (g_conf()->mon_osd_prime_pg_temp) {
1569 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1570 << dendl;
1571 }
1572 mapping_job.reset();
1573
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1576 auto p = pending_inc.new_state.begin();
1577 while (p != pending_inc.new_state.end()) {
1578 if (p->second == 0) {
1579 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1580 p = pending_inc.new_state.erase(p);
1581 } else {
1582 if (p->second & CEPH_OSD_UP) {
1583 pending_inc.new_last_up_change = pending_inc.modified;
1584 }
1585 ++p;
1586 }
1587 }
1588 if (!pending_inc.new_up_client.empty()) {
1589 pending_inc.new_last_up_change = pending_inc.modified;
1590 }
1591 for (auto& i : pending_inc.new_weight) {
1592 if (i.first >= osdmap.max_osd) {
1593 if (i.second) {
1594 // new osd is already marked in
1595 pending_inc.new_last_in_change = pending_inc.modified;
1596 break;
1597 }
1598 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1599 // existing osd marked in or out
1600 pending_inc.new_last_in_change = pending_inc.modified;
1601 break;
1602 }
1603 }
1604
1605 {
1606 OSDMap tmp;
1607 tmp.deepish_copy_from(osdmap);
1608 tmp.apply_incremental(pending_inc);
1609
1610 // clean pg_temp mappings
1611 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1612
1613 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1614 {
1615 // check every upmapped pg for now
1616 // until we could reliably identify certain cases to ignore,
1617 // which is obviously the hard part TBD..
1618 vector<pg_t> pgs_to_check;
1619 tmp.get_upmap_pgs(&pgs_to_check);
1620 if (pgs_to_check.size() <
1621 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1622 // not enough pgs, do it inline
1623 tmp.clean_pg_upmaps(cct, &pending_inc);
1624 } else {
1625 CleanUpmapJob job(cct, tmp, pending_inc);
1626 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1627 job.wait();
1628 }
1629 }
1630
1631 // update creating pgs first so that we can remove the created pgid and
1632 // process the pool flag removal below in the same osdmap epoch.
1633 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1634 bufferlist creatings_bl;
1635 uint64_t features = CEPH_FEATURES_ALL;
1636 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1637 dout(20) << __func__ << " encoding pending pgs without octopus features"
1638 << dendl;
1639 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1640 }
1641 encode(pending_creatings, creatings_bl, features);
1642 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1643
1644 // remove any old (or incompat) POOL_CREATING flags
1645 for (auto& i : tmp.get_pools()) {
1646 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1647 // pre-nautilus OSDMaps shouldn't get this flag.
1648 if (pending_inc.new_pools.count(i.first)) {
1649 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1650 }
1651 }
1652 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1653 !pending_creatings.still_creating_pool(i.first)) {
1654 dout(10) << __func__ << " done creating pool " << i.first
1655 << ", clearing CREATING flag" << dendl;
1656 if (pending_inc.new_pools.count(i.first) == 0) {
1657 pending_inc.new_pools[i.first] = i.second;
1658 }
1659 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1660 }
1661 }
1662
1663 // collect which pools are currently affected by
1664 // the near/backfill/full osd(s),
1665 // and set per-pool near/backfill/full flag instead
1666 set<int64_t> full_pool_ids;
1667 set<int64_t> backfillfull_pool_ids;
1668 set<int64_t> nearfull_pool_ids;
1669 tmp.get_full_pools(cct,
1670 &full_pool_ids,
1671 &backfillfull_pool_ids,
1672 &nearfull_pool_ids);
1673 if (full_pool_ids.empty() ||
1674 backfillfull_pool_ids.empty() ||
1675 nearfull_pool_ids.empty()) {
1676 // normal case - no nearfull, backfillfull or full osds
1677 // try cancel any improper nearfull/backfillfull/full pool
1678 // flags first
1679 for (auto &pool: tmp.get_pools()) {
1680 auto p = pool.first;
1681 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1682 nearfull_pool_ids.empty()) {
1683 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1684 << "'s nearfull flag" << dendl;
1685 if (pending_inc.new_pools.count(p) == 0) {
1686 // load original pool info first!
1687 pending_inc.new_pools[p] = pool.second;
1688 }
1689 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1690 }
1691 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1692 backfillfull_pool_ids.empty()) {
1693 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1694 << "'s backfillfull flag" << dendl;
1695 if (pending_inc.new_pools.count(p) == 0) {
1696 pending_inc.new_pools[p] = pool.second;
1697 }
1698 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1699 }
1700 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1701 full_pool_ids.empty()) {
1702 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1703 // set by EQUOTA, skipping
1704 continue;
1705 }
1706 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1707 << "'s full flag" << dendl;
1708 if (pending_inc.new_pools.count(p) == 0) {
1709 pending_inc.new_pools[p] = pool.second;
1710 }
1711 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1712 }
1713 }
1714 }
1715 if (!full_pool_ids.empty()) {
1716 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1717 << " as full" << dendl;
1718 for (auto &p: full_pool_ids) {
1719 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1720 continue;
1721 }
1722 if (pending_inc.new_pools.count(p) == 0) {
1723 pending_inc.new_pools[p] = tmp.pools[p];
1724 }
1725 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1726 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1727 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1728 }
1729 // cancel FLAG_FULL for pools which are no longer full too
1730 for (auto &pool: tmp.get_pools()) {
1731 auto p = pool.first;
1732 if (full_pool_ids.count(p)) {
1733 // skip pools we have just marked as full above
1734 continue;
1735 }
1736 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1737 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1738 // don't touch if currently is not full
1739 // or is running out of quota (and hence considered as full)
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s full flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1748 }
1749 }
1750 if (!backfillfull_pool_ids.empty()) {
1751 for (auto &p: backfillfull_pool_ids) {
1752 if (full_pool_ids.count(p)) {
1753 // skip pools we have already considered as full above
1754 continue;
1755 }
1756 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1757 // make sure FLAG_FULL is truly set, so we are safe not
1758 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1759 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1760 continue;
1761 }
1762 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1763 // don't bother if pool is already marked as backfillfull
1764 continue;
1765 }
1766 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1767 << "'s as backfillfull" << dendl;
1768 if (pending_inc.new_pools.count(p) == 0) {
1769 pending_inc.new_pools[p] = tmp.pools[p];
1770 }
1771 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1772 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1773 }
1774 // cancel FLAG_BACKFILLFULL for pools
1775 // which are no longer backfillfull too
1776 for (auto &pool: tmp.get_pools()) {
1777 auto p = pool.first;
1778 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as backfillfull/full above
1780 continue;
1781 }
1782 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1783 // and don't touch if currently is not backfillfull
1784 continue;
1785 }
1786 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1787 << "'s backfillfull flag" << dendl;
1788 if (pending_inc.new_pools.count(p) == 0) {
1789 pending_inc.new_pools[p] = pool.second;
1790 }
1791 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1792 }
1793 }
1794 if (!nearfull_pool_ids.empty()) {
1795 for (auto &p: nearfull_pool_ids) {
1796 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1797 continue;
1798 }
1799 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1800 // make sure FLAG_FULL is truly set, so we are safe not
1801 // to set a extra (redundant) FLAG_NEARFULL flag
1802 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1803 continue;
1804 }
1805 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1806 // don't bother if pool is already marked as nearfull
1807 continue;
1808 }
1809 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1810 << "'s as nearfull" << dendl;
1811 if (pending_inc.new_pools.count(p) == 0) {
1812 pending_inc.new_pools[p] = tmp.pools[p];
1813 }
1814 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1815 }
1816 // cancel FLAG_NEARFULL for pools
1817 // which are no longer nearfull too
1818 for (auto &pool: tmp.get_pools()) {
1819 auto p = pool.first;
1820 if (full_pool_ids.count(p) ||
1821 backfillfull_pool_ids.count(p) ||
1822 nearfull_pool_ids.count(p)) {
1823 // skip pools we have just marked as
1824 // nearfull/backfillfull/full above
1825 continue;
1826 }
1827 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1828 // and don't touch if currently is not nearfull
1829 continue;
1830 }
1831 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1832 << "'s nearfull flag" << dendl;
1833 if (pending_inc.new_pools.count(p) == 0) {
1834 pending_inc.new_pools[p] = pool.second;
1835 }
1836 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1837 }
1838 }
1839
1840 // min_compat_client?
1841 if (!tmp.require_min_compat_client) {
1842 auto mv = tmp.get_min_compat_client();
1843 dout(1) << __func__ << " setting require_min_compat_client to currently "
1844 << "required " << mv << dendl;
1845 mon.clog->info() << "setting require_min_compat_client to currently "
1846 << "required " << mv;
1847 pending_inc.new_require_min_compat_client = mv;
1848 }
1849
1850 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1851 tmp.require_osd_release >= ceph_release_t::nautilus) {
1852 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1853 // add creating flags?
1854 for (auto& i : tmp.get_pools()) {
1855 if (pending_creatings.still_creating_pool(i.first)) {
1856 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1857 << dendl;
1858 if (pending_inc.new_pools.count(i.first) == 0) {
1859 pending_inc.new_pools[i.first] = i.second;
1860 }
1861 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1862 }
1863 }
1864 // adjust blocklist items to all be TYPE_ANY
1865 for (auto& i : tmp.blocklist) {
1866 auto a = i.first;
1867 a.set_type(entity_addr_t::TYPE_ANY);
1868 pending_inc.new_blocklist[a] = i.second;
1869 pending_inc.old_blocklist.push_back(i.first);
1870 }
1871 }
1872
1873 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1874 tmp.require_osd_release >= ceph_release_t::octopus) {
1875 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1876
1877 // adjust obsoleted cache modes
1878 for (auto& [poolid, pi] : tmp.pools) {
1879 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1880 if (pending_inc.new_pools.count(poolid) == 0) {
1881 pending_inc.new_pools[poolid] = pi;
1882 }
1883 dout(10) << __func__ << " switching pool " << poolid
1884 << " cachemode from forward -> proxy" << dendl;
1885 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1886 }
1887 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1888 if (pending_inc.new_pools.count(poolid) == 0) {
1889 pending_inc.new_pools[poolid] = pi;
1890 }
1891 dout(10) << __func__ << " switching pool " << poolid
1892 << " cachemode from readforward -> readproxy" << dendl;
1893 pending_inc.new_pools[poolid].cache_mode =
1894 pg_pool_t::CACHEMODE_READPROXY;
1895 }
1896 }
1897
1898 // clear removed_snaps for every pool
1899 for (auto& [poolid, pi] : tmp.pools) {
1900 if (pi.removed_snaps.empty()) {
1901 continue;
1902 }
1903 if (pending_inc.new_pools.count(poolid) == 0) {
1904 pending_inc.new_pools[poolid] = pi;
1905 }
1906 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1907 << dendl;
1908 pending_inc.new_pools[poolid].removed_snaps.clear();
1909 }
1910
1911 // create a combined purged snap epoch key for all purged snaps
1912 // prior to this epoch, and store it in the current epoch (i.e.,
1913 // the last pre-octopus epoch, just prior to the one we're
1914 // encoding now).
1915 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1916 it->lower_bound("purged_snap_");
1917 map<int64_t,snap_interval_set_t> combined;
1918 while (it->valid()) {
1919 if (it->key().find("purged_snap_") != 0) {
1920 break;
1921 }
1922 string k = it->key();
1923 long long unsigned pool;
1924 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1925 if (n != 1) {
1926 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1927 } else {
1928 bufferlist v = it->value();
1929 auto p = v.cbegin();
1930 snapid_t begin, end;
1931 ceph::decode(begin, p);
1932 ceph::decode(end, p);
1933 combined[pool].insert(begin, end - begin);
1934 }
1935 it->next();
1936 }
1937 if (!combined.empty()) {
1938 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1939 bufferlist v;
1940 ceph::encode(combined, v);
1941 t->put(OSD_SNAP_PREFIX, k, v);
1942 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1943 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1944 << dendl;
1945 } else {
1946 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1947 << dendl;
1948 }
1949
1950 // clean out the old removed_snap_ and removed_epoch keys
1951 // ('`' is ASCII '_' + 1)
1952 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1953 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1954 }
1955 }
1956
1957 // tell me about it
1958 for (auto i = pending_inc.new_state.begin();
1959 i != pending_inc.new_state.end();
1960 ++i) {
1961 int s = i->second ? i->second : CEPH_OSD_UP;
1962 if (s & CEPH_OSD_UP) {
1963 dout(2) << " osd." << i->first << " DOWN" << dendl;
1964 // Reset laggy parameters if failure interval exceeds a threshold.
1965 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1966 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1967 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1968 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1969 set_default_laggy_params(i->first);
1970 }
1971 }
1972 }
1973 if (s & CEPH_OSD_EXISTS)
1974 dout(2) << " osd." << i->first << " DNE" << dendl;
1975 }
1976 for (auto i = pending_inc.new_up_client.begin();
1977 i != pending_inc.new_up_client.end();
1978 ++i) {
1979 //FIXME: insert cluster addresses too
1980 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1981 }
1982 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1983 i != pending_inc.new_weight.end();
1984 ++i) {
1985 if (i->second == CEPH_OSD_OUT) {
1986 dout(2) << " osd." << i->first << " OUT" << dendl;
1987 } else if (i->second == CEPH_OSD_IN) {
1988 dout(2) << " osd." << i->first << " IN" << dendl;
1989 } else {
1990 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1991 }
1992 }
1993
1994 // features for osdmap and its incremental
1995 uint64_t features;
1996
1997 // encode full map and determine its crc
1998 OSDMap tmp;
1999 {
2000 tmp.deepish_copy_from(osdmap);
2001 tmp.apply_incremental(pending_inc);
2002
2003 // determine appropriate features
2004 features = tmp.get_encoding_features();
2005 dout(10) << __func__ << " encoding full map with "
2006 << tmp.require_osd_release
2007 << " features " << features << dendl;
2008
2009 // the features should be a subset of the mon quorum's features!
2010 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2011
2012 bufferlist fullbl;
2013 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2014 pending_inc.full_crc = tmp.get_crc();
2015
2016 // include full map in the txn. note that old monitors will
2017 // overwrite this. new ones will now skip the local full map
2018 // encode and reload from this.
2019 put_version_full(t, pending_inc.epoch, fullbl);
2020 }
2021
2022 // encode
2023 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2024 bufferlist bl;
2025 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2026
2027 dout(20) << " full_crc " << tmp.get_crc()
2028 << " inc_crc " << pending_inc.inc_crc << dendl;
2029
2030 /* put everything in the transaction */
2031 put_version(t, pending_inc.epoch, bl);
2032 put_last_committed(t, pending_inc.epoch);
2033
2034 // metadata, too!
2035 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2036 p != pending_metadata.end();
2037 ++p) {
2038 Metadata m;
2039 auto mp = p->second.cbegin();
2040 decode(m, mp);
2041 auto it = m.find("osd_objectstore");
2042 if (it != m.end()) {
2043 if (it->second == "filestore") {
2044 filestore_osds.insert(p->first);
2045 } else {
2046 filestore_osds.erase(p->first);
2047 }
2048 }
2049 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2050 }
2051 for (set<int>::iterator p = pending_metadata_rm.begin();
2052 p != pending_metadata_rm.end();
2053 ++p) {
2054 filestore_osds.erase(*p);
2055 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2056 }
2057 pending_metadata.clear();
2058 pending_metadata_rm.clear();
2059
2060 // purged_snaps
2061 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2062 !pending_inc.new_purged_snaps.empty()) {
2063 // all snaps purged this epoch (across all pools)
2064 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2065 bufferlist v;
2066 encode(pending_inc.new_purged_snaps, v);
2067 t->put(OSD_SNAP_PREFIX, k, v);
2068 }
2069 for (auto& i : pending_inc.new_purged_snaps) {
2070 for (auto q = i.second.begin();
2071 q != i.second.end();
2072 ++q) {
2073 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2074 pending_inc.epoch,
2075 t);
2076 }
2077 }
2078 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2079 for (auto snap : snaps) {
2080 insert_purged_snap_update(pool, snap, snap + 1,
2081 pending_inc.epoch,
2082 t);
2083 }
2084 }
2085
2086 // health
2087 health_check_map_t next;
2088 tmp.check_health(cct, &next);
2089 // OSD_FILESTORE
2090 check_for_filestore_osds(&next);
2091 encode_health(next, t);
2092 }
2093
2094 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2095 {
2096 bufferlist bl;
2097 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2098 if (r < 0)
2099 return r;
2100 try {
2101 auto p = bl.cbegin();
2102 decode(m, p);
2103 }
2104 catch (ceph::buffer::error& e) {
2105 if (err)
2106 *err << "osd." << osd << " metadata is corrupt";
2107 return -EIO;
2108 }
2109 return 0;
2110 }
2111
2112 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2113 {
2114 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2115 if (osdmap.is_up(osd)) {
2116 map<string,string> meta;
2117 load_metadata(osd, meta, nullptr);
2118 auto p = meta.find(field);
2119 if (p == meta.end()) {
2120 (*out)["unknown"]++;
2121 } else {
2122 (*out)[p->second]++;
2123 }
2124 }
2125 }
2126 }
2127
2128 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2129 {
2130 map<string,int> by_val;
2131 count_metadata(field, &by_val);
2132 f->open_object_section(field.c_str());
2133 for (auto& p : by_val) {
2134 f->dump_int(p.first.c_str(), p.second);
2135 }
2136 f->close_section();
2137 }
2138
2139 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2140 {
2141 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2142 if (osdmap.is_up(osd)) {
2143 map<string,string> meta;
2144 load_metadata(osd, meta, nullptr);
2145 auto p = meta.find("ceph_version_short");
2146 if (p == meta.end()) continue;
2147 versions[p->second].push_back(string("osd.") + stringify(osd));
2148 }
2149 }
2150 }
2151
2152 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2153 {
2154 map<string, string> metadata;
2155 int r = load_metadata(osd, metadata, nullptr);
2156 if (r < 0)
2157 return r;
2158
2159 auto it = metadata.find("osd_objectstore");
2160 if (it == metadata.end())
2161 return -ENOENT;
2162 *type = it->second;
2163 return 0;
2164 }
2165
2166 void OSDMonitor::get_filestore_osd_list()
2167 {
2168 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2169 string objectstore_type;
2170 int r = get_osd_objectstore_type(osd, &objectstore_type);
2171 if (r == 0 && objectstore_type == "filestore") {
2172 filestore_osds.insert(osd);
2173 }
2174 }
2175 }
2176
2177 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2178 {
2179 if (g_conf()->mon_warn_on_filestore_osds &&
2180 filestore_osds.size() > 0) {
2181 ostringstream ss, deprecated_tip;
2182 list<string> detail;
2183 ss << filestore_osds.size()
2184 << " osd(s) "
2185 << (filestore_osds.size() == 1 ? "is" : "are")
2186 << " running Filestore";
2187 deprecated_tip << ss.str();
2188 ss << " [Deprecated]";
2189 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2190 filestore_osds.size());
2191 deprecated_tip << ", which has been deprecated and"
2192 << " not been optimized for QoS"
2193 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2194 detail.push_back(deprecated_tip.str());
2195 d.detail.swap(detail);
2196 }
2197 }
2198
2199 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2200 const pg_pool_t &pool,
2201 ostream *err)
2202 {
2203 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2204 // since filestore osds could always join the pool later
2205 set<int> checked_osds;
2206 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2207 vector<int> up, acting;
2208 pg_t pgid(ps, pool_id);
2209 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2210 for (int osd : up) {
2211 if (checked_osds.find(osd) != checked_osds.end())
2212 continue;
2213 string objectstore_type;
2214 int r = get_osd_objectstore_type(osd, &objectstore_type);
2215 // allow with missing metadata, e.g. due to an osd never booting yet
2216 if (r < 0 || objectstore_type == "bluestore") {
2217 checked_osds.insert(osd);
2218 continue;
2219 }
2220 *err << "osd." << osd << " uses " << objectstore_type;
2221 return false;
2222 }
2223 }
2224 return true;
2225 }
2226
2227 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2228 {
2229 map<string,string> m;
2230 if (int r = load_metadata(osd, m, err))
2231 return r;
2232 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2233 f->dump_string(p->first.c_str(), p->second);
2234 return 0;
2235 }
2236
2237 void OSDMonitor::print_nodes(Formatter *f)
2238 {
2239 // group OSDs by their hosts
2240 map<string, list<int> > osds; // hostname => osd
2241 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2242 map<string, string> m;
2243 if (load_metadata(osd, m, NULL)) {
2244 continue;
2245 }
2246 map<string, string>::iterator hostname = m.find("hostname");
2247 if (hostname == m.end()) {
2248 // not likely though
2249 continue;
2250 }
2251 osds[hostname->second].push_back(osd);
2252 }
2253
2254 dump_services(f, osds, "osd");
2255 }
2256
void OSDMonitor::share_map_with_random_osd()
{
  // Push the latest committed map change to one randomly-chosen up OSD
  // so it can propagate from there; a no-op if no OSD is up or none has
  // a session with us.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon.get_quorum_con_features();
  // whatev, they'll request more if they need it
  // (send only the single newest incremental: epoch-1 -> epoch)
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2283
version_t OSDMonitor::get_trim_to() const
{
  // Return the highest osdmap version that may be trimmed, or 0 when
  // trimming must be deferred: no quorum, PGs still being created, or
  // trimming blocked for debugging.
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      // pending pg creations may still need older maps
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
            << " ('mon_debug_block_osdmap_trim' set to 'true')"
            << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // start from the oldest epoch any osd/pg may still reference
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // operator override of the trim floor (debug/repair aid)
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
	       << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs committed maps
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim target if it actually advances first_committed
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2331
2332 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2333 {
2334 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2335 // also scan osd epochs
2336 // don't trim past the oldest reported osd epoch
2337 for (auto [osd, epoch] : osd_epochs) {
2338 if (epoch < floor) {
2339 floor = epoch;
2340 }
2341 }
2342 return floor;
2343 }
2344
2345 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2346 version_t first)
2347 {
2348 dout(10) << __func__ << " including full map for e " << first << dendl;
2349 bufferlist bl;
2350 get_version_full(first, bl);
2351 put_version_full(tx, first, bl);
2352
2353 if (has_osdmap_manifest &&
2354 first > osdmap_manifest.get_first_pinned()) {
2355 _prune_update_trimmed(tx, first);
2356 }
2357 }
2358
2359
2360 /* full osdmap prune
2361 *
2362 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2363 */
2364
2365 void OSDMonitor::load_osdmap_manifest()
2366 {
2367 bool store_has_manifest =
2368 mon.store->exists(get_service_name(), "osdmap_manifest");
2369
2370 if (!store_has_manifest) {
2371 if (!has_osdmap_manifest) {
2372 return;
2373 }
2374
2375 dout(20) << __func__
2376 << " dropping osdmap manifest from memory." << dendl;
2377 osdmap_manifest = osdmap_manifest_t();
2378 has_osdmap_manifest = false;
2379 return;
2380 }
2381
2382 dout(20) << __func__
2383 << " osdmap manifest detected in store; reload." << dendl;
2384
2385 bufferlist manifest_bl;
2386 int r = get_value("osdmap_manifest", manifest_bl);
2387 if (r < 0) {
2388 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2389 ceph_abort_msg("error reading manifest");
2390 }
2391 osdmap_manifest.decode(manifest_bl);
2392 has_osdmap_manifest = true;
2393
2394 dout(10) << __func__ << " store osdmap manifest pinned ("
2395 << osdmap_manifest.get_first_pinned()
2396 << " .. "
2397 << osdmap_manifest.get_last_pinned()
2398 << ")"
2399 << dendl;
2400 }
2401
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass is warranted, based on the
  // committed epoch span, the prune tunables, and how far a previous
  // prune (if any) already got.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // newest epoch we would ever pin; the newest min_osdmap_epochs maps
  // are always kept intact
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous prune already covered everything up to last_to_pin
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2461
void OSDMonitor::_prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first)
{
  // A trim moved first_committed up to `first`; drop all pinned
  // versions below it so the manifest's first pinned map stays equal
  // to the store's first committed full map.  Only the on-disk copy is
  // updated here (via tx); the in-memory manifest is reloaded later.
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  osdmap_manifest_t manifest = osdmap_manifest;

  // the new floor must itself be pinned (encode_trim_extra re-stored
  // the full map for it)
  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase every pinned version strictly below `first`
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2495
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed `manifest` with the first version this prune pass should pin:
  // the first committed epoch if we have never pruned (or prior prune
  // state was fully trimmed away), otherwise the last version pinned
  // by a previous pass.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2530
2531 bool OSDMonitor::_prune_sanitize_options() const
2532 {
2533 uint64_t prune_interval =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2535 uint64_t prune_min =
2536 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2537 uint64_t txsize =
2538 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2539
2540 bool r = true;
2541
2542 if (prune_interval == 0) {
2543 derr << __func__
2544 << " prune is enabled BUT prune interval is zero; abort."
2545 << dendl;
2546 r = false;
2547 } else if (prune_interval == 1) {
2548 derr << __func__
2549 << " prune interval is equal to one, which essentially means"
2550 " no pruning; abort."
2551 << dendl;
2552 r = false;
2553 }
2554 if (prune_min == 0) {
2555 derr << __func__
2556 << " prune is enabled BUT prune min is zero; abort."
2557 << dendl;
2558 r = false;
2559 }
2560 if (prune_interval > prune_min) {
2561 derr << __func__
2562 << " impossible to ascertain proper prune interval because"
2563 << " it is greater than the minimum prune epochs"
2564 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2565 << dendl;
2566 r = false;
2567 }
2568
2569 if (txsize < prune_interval - 1) {
2570 derr << __func__
2571 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2572 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2573 << "); abort." << dendl;
2574 r = false;
2575 }
2576 return r;
2577 }
2578
// Runtime on/off switch for the full-osdmap prune feature.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2582
// Pruning can only be used when the monitors require the
// OSDMAP_PRUNE feature (i.e., all quorum members understand the
// osdmap manifest).
bool OSDMonitor::is_prune_supported() const {
  return mon.get_required_mon_features().contains_any(
      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2587
2588 /** do_prune
2589 *
2590 * @returns true if has side-effects; false otherwise.
2591 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  // Perform one bounded pass of full-osdmap pruning: pin every
  // `prune_interval`-th full map and erase the full maps in between,
  // removing at most `txsize` versions, then persist the updated
  // manifest in `tx`.  Returns true iff the transaction was modified.
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    // stop once another full interval would pass the pinning ceiling
    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // both interval endpoints must still have their full maps on disk
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase the full maps strictly between the two pinned endpoints
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval fits
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2716
2717
2718 // -------------
2719
2720 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2721 {
2722 op->mark_osdmon_event(__func__);
2723 Message *m = op->get_req();
2724 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2725
2726 switch (m->get_type()) {
2727 // READs
2728 case MSG_MON_COMMAND:
2729 try {
2730 return preprocess_command(op);
2731 } catch (const bad_cmd_get& e) {
2732 bufferlist bl;
2733 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2734 return true;
2735 }
2736 case CEPH_MSG_MON_GET_OSDMAP:
2737 return preprocess_get_osdmap(op);
2738
2739 // damp updates
2740 case MSG_OSD_MARK_ME_DOWN:
2741 return preprocess_mark_me_down(op);
2742 case MSG_OSD_MARK_ME_DEAD:
2743 return preprocess_mark_me_dead(op);
2744 case MSG_OSD_FULL:
2745 return preprocess_full(op);
2746 case MSG_OSD_FAILURE:
2747 return preprocess_failure(op);
2748 case MSG_OSD_BOOT:
2749 return preprocess_boot(op);
2750 case MSG_OSD_ALIVE:
2751 return preprocess_alive(op);
2752 case MSG_OSD_PG_CREATED:
2753 return preprocess_pg_created(op);
2754 case MSG_OSD_PG_READY_TO_MERGE:
2755 return preprocess_pg_ready_to_merge(op);
2756 case MSG_OSD_PGTEMP:
2757 return preprocess_pgtemp(op);
2758 case MSG_OSD_BEACON:
2759 return preprocess_beacon(op);
2760
2761 case CEPH_MSG_POOLOP:
2762 return preprocess_pool_op(op);
2763
2764 case MSG_REMOVE_SNAPS:
2765 return preprocess_remove_snaps(op);
2766
2767 case MSG_MON_GET_PURGED_SNAPS:
2768 return preprocess_get_purged_snaps(op);
2769
2770 default:
2771 ceph_abort();
2772 return false;
2773 }
2774 }
2775
2776 bool OSDMonitor::prepare_update(MonOpRequestRef op)
2777 {
2778 op->mark_osdmon_event(__func__);
2779 Message *m = op->get_req();
2780 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
2781
2782 switch (m->get_type()) {
2783 // damp updates
2784 case MSG_OSD_MARK_ME_DOWN:
2785 return prepare_mark_me_down(op);
2786 case MSG_OSD_MARK_ME_DEAD:
2787 return prepare_mark_me_dead(op);
2788 case MSG_OSD_FULL:
2789 return prepare_full(op);
2790 case MSG_OSD_FAILURE:
2791 return prepare_failure(op);
2792 case MSG_OSD_BOOT:
2793 return prepare_boot(op);
2794 case MSG_OSD_ALIVE:
2795 return prepare_alive(op);
2796 case MSG_OSD_PG_CREATED:
2797 return prepare_pg_created(op);
2798 case MSG_OSD_PGTEMP:
2799 return prepare_pgtemp(op);
2800 case MSG_OSD_PG_READY_TO_MERGE:
2801 return prepare_pg_ready_to_merge(op);
2802 case MSG_OSD_BEACON:
2803 return prepare_beacon(op);
2804
2805 case MSG_MON_COMMAND:
2806 try {
2807 return prepare_command(op);
2808 } catch (const bad_cmd_get& e) {
2809 bufferlist bl;
2810 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2811 return false; /* nothing to propose */
2812 }
2813
2814 case CEPH_MSG_POOLOP:
2815 return prepare_pool_op(op);
2816
2817 case MSG_REMOVE_SNAPS:
2818 return prepare_remove_snaps(op);
2819
2820
2821 default:
2822 ceph_abort();
2823 }
2824
2825 return false;
2826 }
2827
2828 bool OSDMonitor::should_propose(double& delay)
2829 {
2830 dout(10) << "should_propose" << dendl;
2831
2832 // if full map, propose immediately! any subsequent changes will be clobbered.
2833 if (pending_inc.fullmap.length())
2834 return true;
2835
2836 // adjust osd weights?
2837 if (!osd_weight.empty() &&
2838 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2839 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2840 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2841 delay = 0.0;
2842 osd_weight.clear();
2843 return true;
2844 }
2845
2846 return PaxosService::should_propose(delay);
2847 }
2848
2849
2850
2851 // ---------------------------
2852 // READs
2853
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an explicit MMonGetOSDMap request with a batch of full
  // and/or incremental maps, bounded by both a map-count budget and a
  // byte budget.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the peer's connection features if known, else fall
  // back to the quorum's
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps: requested range clamped to what we still hold, stopping
  // when either budget runs out
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // incremental maps: the same (shared) budgets continue to apply
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the peer what range we can serve so it can re-request the rest
  reply->cluster_osdmap_trim_lower_bound = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2890
2891
2892 // ---------------------------
2893 // UPDATEs
2894
2895 // failure --
2896
2897 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2898 // check permissions
2899 MonSession *session = op->get_session();
2900 if (!session)
2901 return true;
2902 if (!session->is_capable("osd", MON_CAP_X)) {
2903 dout(0) << "got MOSDFailure from entity with insufficient caps "
2904 << session->caps << dendl;
2905 return true;
2906 }
2907 if (fsid != mon.monmap->fsid) {
2908 dout(0) << "check_source: on fsid " << fsid
2909 << " != " << mon.monmap->fsid << dendl;
2910 return true;
2911 }
2912 return false;
2913 }
2914
2915
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Filter an incoming MOSDFailure report.  Returns true when the
  // message is fully handled here (dropped, possibly after sending the
  // reporter newer maps); returns false to let prepare_failure()
  // actually record the failure report.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is gone or stale itself; teach it the newer map
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to a different (older) incarnation of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down(badboy) half of this condition looks
  // redundant — a down target already took the "weird?" branch above;
  // confirm before simplifying.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // marking this osd down is currently inhibited (see can_mark_down)
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // a genuinely new, actionable report: fall through to prepare_failure
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2987
// Completion context for MOSDMarkMeDown: once the op resolves, echo the
// request back to the requester as the acknowledgment it is waiting for.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;  // owning monitor, used to send the reply / redispatch
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: ack by echoing the original request's identity back
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon.send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // -EAGAIN: redispatch the op so it can be retried
      osdmon->dispatch(op);
    } else {
      // any other result is a programming error
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
3016
/**
 * Read-only screening of an MOSDMarkMeDown request, in which an osd
 * asks to be marked down (e.g. on clean shutdown).
 *
 * Returns true if the request was fully handled here (invalid source,
 * requester not alive/matching in the current map, or nodown policy in
 * effect); returns false to let prepare_mark_me_down() queue the state
 * change.  When we bail out and the requester asked for an ack, we ack
 * immediately so it is not left waiting.
 */
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the requester must exist, be up, and match the map's addrs;
  // otherwise it is a stale instance — just help it catch up.
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  // complete the ack context immediately (r=0) so the requester
  // does not block forever waiting on a proposal that won't happen
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
3055
/**
 * Queue the down (and optionally dead) state change an osd requested
 * for itself via MOSDMarkMeDown, and ack after the proposal commits.
 */
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_down() already vetted these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
  // new_state entries are xor deltas; setting the UP bit flips the osd down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->down_and_dead) {
    // record the dead epoch in pending xinfo, seeding from the current map
    if (!pending_inc.new_xinfo.count(target_osd)) {
      pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
    }
    pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  }
  // only ack once the proposal is committed
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
3077
3078 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
3079 {
3080 op->mark_osdmon_event(__func__);
3081 auto m = op->get_req<MOSDMarkMeDead>();
3082 int from = m->target_osd;
3083
3084 // check permissions
3085 if (check_source(op, m->fsid)) {
3086 mon.no_reply(op);
3087 return true;
3088 }
3089
3090 // first, verify the reporting host is valid
3091 if (!m->get_orig_source().is_osd()) {
3092 mon.no_reply(op);
3093 return true;
3094 }
3095
3096 if (!osdmap.exists(from) ||
3097 !osdmap.is_down(from)) {
3098 dout(5) << __func__ << " from nonexistent or up osd." << from
3099 << ", ignoring" << dendl;
3100 send_incremental(op, m->get_epoch()+1);
3101 mon.no_reply(op);
3102 return true;
3103 }
3104
3105 return false;
3106 }
3107
/**
 * Record the dead_epoch an osd declared for itself via MOSDMarkMeDead.
 * On a committed proposal there is nothing to send back, so the op is
 * simply dropped.
 */
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees this
  ceph_assert(osdmap.is_down(target_osd));

  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
		   << m->get_epoch();
  // record the dead epoch in pending xinfo, seeding from the current map
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  // defer until the proposal commits, then drop the op (no reply needed)
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon.no_reply(op);	  // ignore on success
	}
      }
      ));
  return true;
}
3133
3134 bool OSDMonitor::can_mark_down(int i)
3135 {
3136 if (osdmap.is_nodown(i)) {
3137 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3138 << "will not mark it down" << dendl;
3139 return false;
3140 }
3141
3142 int num_osds = osdmap.get_num_osds();
3143 if (num_osds == 0) {
3144 dout(5) << __func__ << " no osds" << dendl;
3145 return false;
3146 }
3147 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3148 float up_ratio = (float)up / (float)num_osds;
3149 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3150 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3151 << g_conf()->mon_osd_min_up_ratio
3152 << ", will not mark osd." << i << " down" << dendl;
3153 return false;
3154 }
3155 return true;
3156 }
3157
3158 bool OSDMonitor::can_mark_up(int i)
3159 {
3160 if (osdmap.is_noup(i)) {
3161 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3162 << "will not mark it up" << dendl;
3163 return false;
3164 }
3165
3166 return true;
3167 }
3168
3169 /**
3170 * @note the parameter @p i apparently only exists here so we can output the
3171 * osd's id on messages.
3172 */
3173 bool OSDMonitor::can_mark_out(int i)
3174 {
3175 if (osdmap.is_noout(i)) {
3176 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3177 << "will not mark it out" << dendl;
3178 return false;
3179 }
3180
3181 int num_osds = osdmap.get_num_osds();
3182 if (num_osds == 0) {
3183 dout(5) << __func__ << " no osds" << dendl;
3184 return false;
3185 }
3186 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3187 float in_ratio = (float)in / (float)num_osds;
3188 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3189 if (i >= 0)
3190 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osd." << i << " out" << dendl;
3193 else
3194 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3195 << g_conf()->mon_osd_min_in_ratio
3196 << ", will not mark osds out" << dendl;
3197 return false;
3198 }
3199
3200 return true;
3201 }
3202
3203 bool OSDMonitor::can_mark_in(int i)
3204 {
3205 if (osdmap.is_noin(i)) {
3206 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3207 << "will not mark it in" << dendl;
3208 return false;
3209 }
3210
3211 return true;
3212 }
3213
/**
 * Scan all tracked failure reports and decide which osds can now be
 * marked down.
 *
 * For each tracked osd, try check_failure() (which may queue a down
 * state change in pending_inc); entries that saw no action and have
 * gone stale are dropped.  Note the ordering: a successful
 * check_failure() keeps its entry even if the entry is also stale.
 *
 * @param now current time, used for grace/staleness computations
 * @return true if at least one failure was acted upon (so the caller
 *         should propose a map change)
 */
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      // erase returns the next valid iterator
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3235
/**
 * Compute the heartbeat grace period to apply before marking
 * target_osd down.
 *
 * Starts from osd_heartbeat_grace; when mon_osd_adjust_heartbeat_grace
 * is set, stretches it by exponentially-decayed estimates of how
 * 'laggy' both the target and its reporters have historically been, so
 * that a historically slow (but alive) subcluster is given more slack.
 *
 * @pre fi.reporters should be non-empty — we divide by its size below
 *      (callers reach this via check_failure(), which asserts that).
 */
utime_t OSDMonitor::get_grace_time(utime_t now,
				   int target_osd,
				   failure_info_t& fi) const
{
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
    return orig_grace;
  }
  utime_t grace = orig_grace;
  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
  // decay_k is negative, so exp(t * decay_k) halves every `halflife` secs
  double decay_k = ::log(.5) / halflife;

  // scale grace period based on historical probability of 'lagginess'
  // (false positive failures due to slowness).
  const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
  const utime_t failed_for = now - fi.get_failed_since();
  double decay = exp((double)failed_for * decay_k);
  dout(20) << " halflife " << halflife << " decay_k " << decay_k
	   << " failed_for " << failed_for << " decay " << decay << dendl;
  double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
  grace += my_grace;

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  double peer_grace = 0;
  for (auto& [reporter, report] : fi.reporters) {
    if (osdmap.exists(reporter)) {
      const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }
  // average the peer contribution across all reporters
  peer_grace /= (double)fi.reporters.size();
  grace += peer_grace;
  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
	   << dendl;

  return grace;
}
3282
3283 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3284 {
3285 // already pending failure?
3286 if (pending_inc.new_state.count(target_osd) &&
3287 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3288 dout(10) << " already pending failure" << dendl;
3289 return true;
3290 }
3291
3292 set<string> reporters_by_subtree;
3293 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3294 ceph_assert(fi.reporters.size());
3295 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3296 // get the parent bucket whose type matches with "reporter_subtree_level".
3297 // fall back to OSD if the level doesn't exist.
3298 if (osdmap.exists(p->first)) {
3299 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3300 if (auto iter = reporter_loc.find(reporter_subtree_level);
3301 iter == reporter_loc.end()) {
3302 reporters_by_subtree.insert("osd." + to_string(p->first));
3303 } else {
3304 reporters_by_subtree.insert(iter->second);
3305 }
3306 ++p;
3307 } else {
3308 fi.cancel_report(p->first);;
3309 p = fi.reporters.erase(p);
3310 }
3311 }
3312 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3313 return false;
3314 }
3315 const utime_t failed_for = now - fi.get_failed_since();
3316 const utime_t grace = get_grace_time(now, target_osd, fi);
3317 if (failed_for >= grace) {
3318 dout(1) << " we have enough reporters to mark osd." << target_osd
3319 << " down" << dendl;
3320 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3321
3322 mon.clog->info() << "osd." << target_osd << " failed ("
3323 << osdmap.crush->get_full_location_ordered_string(
3324 target_osd)
3325 << ") ("
3326 << (int)reporters_by_subtree.size()
3327 << " reporters from different "
3328 << reporter_subtree_level << " after "
3329 << failed_for << " >= grace " << grace << ")";
3330 return true;
3331 }
3332 return false;
3333 }
3334
// Has this failure record outlived its usefulness?
bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
{
  // if it takes too long to either cancel the report or to mark the osd
  // down, some reporters must have failed to cancel their reports. let's
  // just forget these reports.
  const utime_t failed_for = now - fi.get_failed_since();
  auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
  auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
  return failed_for >= (heartbeat_grace + heartbeat_stale);
}
3345
3346 void OSDMonitor::force_failure(int target_osd, int by)
3347 {
3348 // already pending failure?
3349 if (pending_inc.new_state.count(target_osd) &&
3350 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3351 dout(10) << " already pending failure" << dendl;
3352 return;
3353 }
3354
3355 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3356 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3357 if (!pending_inc.new_xinfo.count(target_osd)) {
3358 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3359 }
3360 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3361
3362 mon.clog->info() << "osd." << target_osd << " failed ("
3363 << osdmap.crush->get_full_location_ordered_string(target_osd)
3364 << ") (connection refused reported by osd." << by << ")";
3365 return;
3366 }
3367
/**
 * Apply an MOSDFailure report to the pending state.
 *
 * A "failed" report either forces an immediate failure (when the
 * reporter saw a connection refused) or records the report and
 * re-checks whether enough distinct reporters have accumulated to mark
 * the target down.  A "not failed" report cancels that reporter's
 * earlier report.
 *
 * @return true if pending_inc was changed and a proposal is needed
 */
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already vetted the target
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon.clog->debug() << "osd." << m->get_target_osd()
			<< " reported immediately failed by "
			<< m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    // track the report; propose only if this pushed us over the threshold
    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
		      << " failure report canceled by "
		      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3427
3428 void OSDMonitor::process_failures()
3429 {
3430 map<int,failure_info_t>::iterator p = failure_info.begin();
3431 while (p != failure_info.end()) {
3432 if (osdmap.is_up(p->first)) {
3433 ++p;
3434 } else {
3435 dout(10) << "process_failures osd." << p->first << dendl;
3436 list<MonOpRequestRef> ls;
3437 p->second.take_report_messages(ls);
3438 failure_info.erase(p++);
3439
3440 while (!ls.empty()) {
3441 MonOpRequestRef o = ls.front();
3442 if (o) {
3443 o->mark_event(__func__);
3444 MOSDFailure *m = o->get_req<MOSDFailure>();
3445 send_latest(o, m->get_epoch());
3446 mon.no_reply(o);
3447 }
3448 ls.pop_front();
3449 }
3450 }
3451 }
3452 }
3453
3454 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3455 {
3456 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3457
3458 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3459 p != failure_info.end();
3460 ++p) {
3461 p->second.take_report_messages(ls);
3462 }
3463 failure_info.clear();
3464 }
3465
3466 int OSDMonitor::get_grace_interval_threshold()
3467 {
3468 int halflife = g_conf()->mon_osd_laggy_halflife;
3469 // Scale the halflife period (default: 1_hr) by
3470 // a factor (48) to calculate the threshold.
3471 int grace_threshold_factor = 48;
3472 return halflife * grace_threshold_factor;
3473 }
3474
3475 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3476 {
3477 int grace_interval_threshold_secs = get_grace_interval_threshold();
3478 if (last_failed_interval > grace_interval_threshold_secs) {
3479 dout(1) << " last_failed_interval " << last_failed_interval
3480 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3481 << dendl;
3482 return true;
3483 }
3484 return false;
3485 }
3486
3487 void OSDMonitor::set_default_laggy_params(int target_osd)
3488 {
3489 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3490 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3491 }
3492 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3493 xi.down_stamp = pending_inc.modified;
3494 xi.laggy_probability = 0.0;
3495 xi.laggy_interval = 0;
3496 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3497 }
3498
3499
3500 // boot --
3501
/**
 * Read-only screening of an MOSDBoot message.
 *
 * Drops boots from unauthorized senders, wrong clusters, blank
 * addresses, osds whose feature bits are too old (or too new for the
 * map's require_osd_release), and crimson osds when the allow_crimson
 * flag is not set.  A duplicate boot from the instance that is already
 * up is acked directly; stale instances are just sent maps.  Returns
 * false to let prepare_boot() queue the actual map change.
 */
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_PACIFIC)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because the osd lacks CEPH_FEATURE_SERVER_PACIFIC";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < octopus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_REEF) &&
      osdmap.require_osd_release < ceph_release_t::pacific) {
    mon.clog->info() << "disallowing boot of reef+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < pacific";
    goto ignore;
  }

  // See crimson/osd/osd.cc: OSD::_send_boot
  if (auto type_iter = m->metadata.find("osd_type");
      type_iter != m->metadata.end()) {
    const auto &otype = type_iter->second;
    // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
    if (otype == "crimson") {
      if (!osdmap.get_allow_crimson()) {
	mon.clog->info()
	  << "Disallowing boot of crimson-osd without allow_crimson "
	  << "OSDMap flag. Run ceph osd set_allow_crimson to set "
	  << "allow_crimson flag. Note that crimson-osd is "
	  << "considered unstable and may result in crashes or "
	  << "data loss. Its usage should be restricted to "
	  << "testing and development.";
	goto ignore;
      }
    } else {
      derr << __func__ << ": osd " << m->get_orig_source_inst()
	   << " sent non-crimson osd_type field in MOSDBoot: "
	   << otype
	   << " -- booting anyway"
	   << dendl;
    }
  }

  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a different uuid for the same id means a different physical osd
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message from before the osd's last up_from, from the same
  // instance, is stale; just send it the maps it is missing
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3632
/**
 * Queue the map changes that bring a booting osd up.
 *
 * If the osd is still up in the map (a reboot raced with its old
 * instance), first queue a down flip and retry the boot after the
 * proposal commits.  Otherwise record the new instance's addresses,
 * weight, uuid, metadata, clean-interval and laggy statistics in
 * pending_inc, and ack the boot (via C_Booted) once committed.
 *
 * @return true when the op was consumed (reply deferred to a committed
 *         proposal); false only if the osd id exceeds max_osd.
 */
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // pending new_state values are xor deltas; apply them to compute the
  // effective (pending) state of this osd
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata: stage the osd's key/value metadata for storage
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: a boot_epoch of 0 means a clean (non-laggy)
    // boot, which decays the estimates; otherwise fold the observed down
    // interval into the exponentially-weighted laggy estimates
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval =  g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3787
3788 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3789 {
3790 op->mark_osdmon_event(__func__);
3791 auto m = op->get_req<MOSDBoot>();
3792 dout(7) << "_booted " << m->get_orig_source_inst()
3793 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3794
3795 if (logit) {
3796 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3797 << " boot";
3798 }
3799
3800 send_latest(op, m->sb.current_epoch+1);
3801 }
3802
3803
3804 // -------------
3805 // full
3806
/**
 * Read-only screening of an MOSDFull message, in which an osd reports
 * its fullness state (NEARFULL/BACKFILLFULL/FULL bits).
 *
 * Drops messages from unauthorized or stale osd instances, and replies
 * directly (with a map) when the requested bits already match the
 * committed map.  Returns false to let prepare_full() queue the change.
 */
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // drop if the sender is the instance that went down (down, and its
  // addrs match the most recent ones) or a stale instance (up, but its
  // addrs do not match the map's current ones)
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change if the fullness bits already match
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3857
/**
 * Queue the fullness-state bits an osd reported via MOSDFull, replying
 * with a map once the proposal commits.
 */
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  // only the fullness-related bits may be changed by this message
  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state xor any pending flips
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    // new_state entries hold xor deltas: first clear any pending
    // fullness flips, then set the delta between the committed state
    // and the wanted state
    if (p != pending_inc.new_state.end()) {
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3895
3896 // -------------
3897 // alive
3898
/**
 * Read-only screening of an MOSDAlive message, in which an osd asks for
 * its up_thru to be advanced.
 *
 * Drops unauthorized or stale senders; when up_thru is already at or
 * past the wanted epoch, just replies with the map.  Returns false to
 * let prepare_alive() queue the up_thru update.
 */
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // the sender must be up and match the map's addrs
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3937
3938 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3939 {
3940 op->mark_osdmon_event(__func__);
3941 auto m = op->get_req<MOSDAlive>();
3942 int from = m->get_orig_source().num();
3943
3944 if (0) { // we probably don't care much about these
3945 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3946 }
3947
3948 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3949 << " from " << m->get_orig_source_inst() << dendl;
3950
3951 update_up_thru(from, m->version); // set to the latest map the OSD has
3952 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3953 return true;
3954 }
3955
// Reply to an osd request by sending it every map from epoch e onward
// (send_latest() sends the full map when e == 0).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3964
3965 // pg_created
3966 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3967 {
3968 op->mark_osdmon_event(__func__);
3969 auto m = op->get_req<MOSDPGCreated>();
3970 dout(10) << __func__ << " " << *m << dendl;
3971 auto session = op->get_session();
3972 mon.no_reply(op);
3973 if (!session) {
3974 dout(10) << __func__ << ": no monitor session!" << dendl;
3975 return true;
3976 }
3977 if (!session->is_capable("osd", MON_CAP_X)) {
3978 derr << __func__ << " received from entity "
3979 << "with insufficient privileges " << session->caps << dendl;
3980 return true;
3981 }
3982 // always forward the "created!" to the leader
3983 return false;
3984 }
3985
3986 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3987 {
3988 op->mark_osdmon_event(__func__);
3989 auto m = op->get_req<MOSDPGCreated>();
3990 dout(10) << __func__ << " " << *m << dendl;
3991 auto src = m->get_orig_source();
3992 auto from = src.num();
3993 if (!src.is_osd() ||
3994 !mon.osdmon()->osdmap.is_up(from) ||
3995 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3996 m->get_orig_source_addrs())) {
3997 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3998 return false;
3999 }
4000 pending_created_pgs.push_back(m->pgid);
4001 return true;
4002 }
4003
// Pre-screen an osd's "pg ready to merge" notification. Returns true when
// the message can be dropped without a proposal; false forwards it to
// prepare_pg_ready_to_merge() on the leader.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge source must be the pool's last pg: pg_num == ps + 1.
  // pg_num already at or below ps means the merge has happened.
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // pg_num_pending must already point at (or below) the source pg
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon.no_reply(op);
  return true;
}
4043
// Apply (or back off) a pending pg merge for the pool named in the message.
// On success the pool's pg_num is decremented in the pending incremental;
// if the osd reported not-ready, the merge attempt is cancelled by resetting
// pg_num_pending back to pg_num.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // start from the pending pool definition if one is queued, else committed
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // re-validate against the (possibly updated) pending pool; a concurrent
  // pg_num change invalidates this message, so retry it after the commit
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return false; /* nothing to propose, yet */
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand that pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // optional fault injection: occasionally bounce pg_num back up to make
  // merge races exercisable in testing
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon.monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
4101
4102
4103 // -------------
4104 // pg_temp changes
4105
// Pre-screen an MOSDPGTemp message. Returns true when every requested
// pg_temp mapping is either already in effect or must be ignored (so no
// proposal is needed); false forwards the message to prepare_pgtemp().
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;   // pgs we decided to drop (stale pool/primary)

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // sender must still be the up osd we know at this address
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced request always goes through to prepare_pgtemp()
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    // an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // everything requested is already in effect; just send the map back
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon.no_reply(op);
  return true;
}
4199
4200 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4201 {
4202 epoch_t old_up_thru = osdmap.get_up_thru(from);
4203 auto ut = pending_inc.new_up_thru.find(from);
4204 if (ut != pending_inc.new_up_thru.end()) {
4205 old_up_thru = ut->second;
4206 }
4207 if (up_thru > old_up_thru) {
4208 // set up_thru too, so the osd doesn't have to ask again
4209 pending_inc.new_up_thru[from] = up_thru;
4210 }
4211 }
4212
4213 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4214 {
4215 op->mark_osdmon_event(__func__);
4216 auto m = op->get_req<MOSDPGTemp>();
4217 int from = m->get_orig_source().num();
4218 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4219 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4220 uint64_t pool = p->first.pool();
4221 if (pending_inc.old_pools.count(pool)) {
4222 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4223 << ": pool pending removal" << dendl;
4224 continue;
4225 }
4226 if (!osdmap.have_pg_pool(pool)) {
4227 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4228 << ": pool has been removed" << dendl;
4229 continue;
4230 }
4231 pending_inc.new_pg_temp[p->first] =
4232 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4233
4234 // unconditionally clear pg_primary (until this message can encode
4235 // a change for that, too.. at which point we need to also fix
4236 // preprocess_pg_temp)
4237 if (osdmap.primary_temp->count(p->first) ||
4238 pending_inc.new_primary_temp.count(p->first))
4239 pending_inc.new_primary_temp[p->first] = -1;
4240 }
4241
4242 // set up_thru too, so the osd doesn't have to ask again
4243 update_up_thru(from, m->map_epoch);
4244
4245 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4246 return true;
4247 }
4248
4249
4250 // ---
4251
// Pre-screen an MRemoveSnaps request (typically from the MDS). Returns true
// when every listed snap is already removed (or the message must be dropped);
// false forwards to prepare_remove_snaps() for a proposal.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  // NOTE(review): no_reply() is called up front; the explicit send_reply()
  // below appears to override it for octopus-capable peers — confirm against
  // Monitor::no_reply semantics before relying on this elsewhere.
  mon.no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // if any snap is newer than the pool's snap_seq, or not yet recorded as
  // removed, a proposal is needed
  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
	       << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      if (*p > pi->get_snap_seq() ||
	  !_is_removed_snap(q->first, *p)) {
	return false;
      }
    }
  }

  // everything already removed: octopus-capable senders expect an explicit
  // ack echoing the snap list
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon.send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4301
// Queue snap removals into the pending incremental. For each snap not yet
// removed (committed or pending), the pool's pending definition gets the
// snap recorded, its snap_seq advanced if needed, and snap_epoch set.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
	       << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map, in the pending
      // pool definition, or already queued in new_removed_snaps
      if (!_is_removed_snap(pool, s) &&
	  (!pending_inc.new_pools.count(pool) ||
	   !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
	  (!pending_inc.new_removed_snaps.count(pool) ||
	   !pending_inc.new_removed_snaps[pool].contains(s))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
	// pre-octopus maps track removed snaps in the pool itself
	if (osdmap.require_osd_release < ceph_release_t::octopus) {
	  newpi->removed_snaps.insert(s);
	  dout(10) << " pool " << pool << " removed_snaps added " << s
		   << " (now " << newpi->removed_snaps << ")" << dendl;
	}
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	// snap_seq must cover every removed snap id
	if (s > newpi->get_snap_seq()) {
	  dout(10) << " pool " << pool << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << s << dendl;
	  newpi->set_snap_seq(s);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	dout(10) << " added pool " << pool << " snap " << s
		 << " to removed_snaps queue" << dendl;
	pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus-capable senders get an explicit ack after the commit
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4350
// Answer an MMonGetPurgedSnaps request by scanning the purged_epoch_* keys
// in the OSD_SNAP_PREFIX store between m->start and m->last and replying
// with the per-epoch purged-snap intervals.
bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetPurgedSnaps>();
  dout(7) << __func__ << " " << *m << dendl;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;

  string k = make_purged_snap_epoch_key(m->start);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->upper_bound(k);
  unsigned long epoch = m->last;
  while (it->valid()) {
    // stop at the end of the purged_epoch_* key range
    if (it->key().find("purged_epoch_") != 0) {
      break;
    }
    // NOTE: this inner `k` shadows the outer seek key
    string k = it->key();
    int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
    if (n != 1) {
      derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
    } else if (epoch > m->last) {
      break;
    } else {
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      auto &v = r[epoch];
      try {
	ceph::decode(v, p);
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " unable to parse value for key '" << it->key()
	     << "': \n";
	bl.hexdump(*_dout);
	*_dout << dendl;
      }
      // rough size estimate of this record's contribution to the reply
      n += 4 + v.size() * 16;
    }
    // NOTE(review): `n` is re-initialized each iteration, so this ~1MB guard
    // bounds a single record, not the cumulative reply size — confirm intent.
    if (n > 1048576) {
      // impose a semi-arbitrary limit to message size
      break;
    }
    it->next();
  }

  // `epoch` holds the last epoch actually included (or m->last if none)
  auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
  reply->purged_snaps.swap(r);
  mon.send_reply(op, reply.detach());

  return true;
}
4400
4401 // osd beacon
4402 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4403 {
4404 op->mark_osdmon_event(__func__);
4405 // check caps
4406 auto session = op->get_session();
4407 mon.no_reply(op);
4408 if (!session) {
4409 dout(10) << __func__ << " no monitor session!" << dendl;
4410 return true;
4411 }
4412 if (!session->is_capable("osd", MON_CAP_X)) {
4413 derr << __func__ << " received from entity "
4414 << "with insufficient privileges " << session->caps << dendl;
4415 return true;
4416 }
4417 // Always forward the beacon to the leader, even if they are the same as
4418 // the old one. The leader will mark as down osds that haven't sent
4419 // beacon for a few minutes.
4420 return false;
4421 }
4422
// Record an osd beacon on the leader: refresh the osd's liveness bookkeeping
// and last-epoch-clean reports, and propose an xinfo update only when the
// reported last_purged_snaps_scrub moved forward.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // sender must be an osd that is still up at the same addresses
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false; /* nothing to propose */
  }

  // liveness bookkeeping: when the beacon arrived and how often to expect it
  last_osd_report[from].first = ceph_clock_now();
  last_osd_report[from].second = beacon->osd_beacon_report_interval;
  osd_epochs[from] = beacon->version;

  // fold the osd's reported min_last_epoch_clean into per-pg tracking
  for (const auto& pg : beacon->pgs) {
    if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
      unsigned pg_num = pool->get_pg_num();
      last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
    }
  }

  // only an advance of last_purged_snaps_scrub warrants a map change
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false; /* nothing to propose */
  }
}
4467
4468 // ---------------
4469 // map helpers
4470
4471 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4472 {
4473 op->mark_osdmon_event(__func__);
4474 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4475 << " start " << start << dendl;
4476 if (start == 0)
4477 send_full(op);
4478 else
4479 send_incremental(op, start);
4480 }
4481
4482
4483 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4484 {
4485 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4486 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4487 r->cluster_osdmap_trim_lower_bound = get_first_committed();
4488 r->newest_map = osdmap.get_epoch();
4489 return r;
4490 }
4491
// Build an MOSDMap with incrementals for epochs [from..to], encoded for
// `features`. When an incremental is missing from the store, the full map
// for that epoch is included instead. Caller owns the returned message.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
  m->cluster_osdmap_trim_lower_bound = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; `e > 0` guards epoch_t underflow when from == 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental for this epoch; fall back to the full map
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4525
4526 void OSDMonitor::send_full(MonOpRequestRef op)
4527 {
4528 op->mark_osdmon_event(__func__);
4529 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4530 mon.send_reply(op, build_latest_full(op->get_session()->con_features));
4531 }
4532
4533 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4534 {
4535 op->mark_osdmon_event(__func__);
4536
4537 MonSession *s = op->get_session();
4538 ceph_assert(s);
4539
4540 if (s->proxy_con) {
4541 // oh, we can tell the other mon to do it
4542 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4543 << first << dendl;
4544 MRoute *r = new MRoute(s->proxy_tid, NULL);
4545 r->send_osdmap_first = first;
4546 s->proxy_con->send_message(r);
4547 op->mark_event("reply: send routed send_osdmap_first reply");
4548 } else {
4549 // do it ourselves
4550 send_incremental(first, s, false, op);
4551 }
4552 }
4553
// Send maps [first .. current] to `session`. When `req` is set, exactly one
// message is sent as a routed reply (the peer re-requests the rest); when
// unset, messages are streamed on the session's connection in chunks of
// osd_map_message_max, unless `onetime` limits it to one chunk.
// session->osd_epoch tracks the newest epoch already delivered.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  // skip epochs the session has already been sent
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested epochs have been trimmed; start over from a full map at
    // the oldest committed epoch
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->cluster_osdmap_trim_lower_bound = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // routed reply: one message only, the peer will ask for the rest
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // req implies a single chunk, same as onetime (loop exits below)
    if (onetime || req)
      break;
  }
}
4616
// Fetch incremental map `ver`, encoded for the current quorum's features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon.get_quorum_con_features(), bl);
}
4621
// Re-encode the incremental map in `bl` in place, restricted to `features`
// intersected with the incremental's own canonical encode features. Any
// embedded full map or crush map is re-encoded the same way.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4649
4650 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4651 {
4652 OSDMap m;
4653 auto q = bl.cbegin();
4654 m.decode(q);
4655 // always encode with subset of osdmap's canonical features
4656 uint64_t f = features & m.get_encoding_features();
4657 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4658 << dendl;
4659 bl.clear();
4660 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4661 }
4662
4663 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4664 {
4665 uint64_t significant_features = OSDMap::get_significant_features(features);
4666 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4667 return 0;
4668 }
4669 int ret = PaxosService::get_version(ver, bl);
4670 if (ret < 0) {
4671 return ret;
4672 }
4673 // NOTE: this check is imprecise; the OSDMap encoding features may
4674 // be a subset of the latest mon quorum features, but worst case we
4675 // reencode once and then cache the (identical) result under both
4676 // feature masks.
4677 if (significant_features !=
4678 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4679 reencode_incremental_map(bl, features);
4680 }
4681 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4682 return 0;
4683 }
4684
4685 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4686 {
4687 bufferlist inc_bl;
4688 int err = get_version(ver, inc_bl);
4689 ceph_assert(err == 0);
4690 ceph_assert(inc_bl.length());
4691
4692 auto p = inc_bl.cbegin();
4693 inc.decode(p);
4694 dout(10) << __func__ << " "
4695 << " epoch " << inc.epoch
4696 << " inc_crc " << inc.inc_crc
4697 << " full_crc " << inc.full_crc
4698 << " encode_features " << inc.encode_features << dendl;
4699 return 0;
4700 }
4701
// Reconstruct the full osdmap for epoch `ver` by starting from the closest
// pinned full map at or below it (or a cached full map in between) and
// replaying incrementals up to `ver`. The result is encoded into `bl`.
// Returns -ENOENT when no pinned map covers `ver`; asserts on store errors.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // prefer a cached full map between the pin and ver-1 as a closer base
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
				&osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  // remember the last incremental's encode features for the final encode
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
	inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
	f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
	derr << __func__
	     << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
	     << ", expected " << inc.full_crc << ")" << dendl;
	ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4801
// Fetch full map `ver`, encoded for the current quorum's features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon.get_quorum_con_features(), bl);
}
4806
4807 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4808 bufferlist& bl)
4809 {
4810 uint64_t significant_features = OSDMap::get_significant_features(features);
4811 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4812 return 0;
4813 }
4814 int ret = PaxosService::get_version_full(ver, bl);
4815 if (ret == -ENOENT) {
4816 // build map?
4817 ret = get_full_from_pinned_map(ver, bl);
4818 }
4819 if (ret < 0) {
4820 return ret;
4821 }
4822 // NOTE: this check is imprecise; the OSDMap encoding features may
4823 // be a subset of the latest mon quorum features, but worst case we
4824 // reencode once and then cache the (identical) result under both
4825 // feature masks.
4826 if (significant_features !=
4827 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4828 reencode_full_map(bl, features);
4829 }
4830 full_osd_cache.add_bytes({ver, significant_features}, bl);
4831 return 0;
4832 }
4833
4834 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4835 {
4836 dout(10) << "blocklist " << av << " until " << until << dendl;
4837 for (auto a : av.v) {
4838 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4839 a.set_type(entity_addr_t::TYPE_ANY);
4840 } else {
4841 a.set_type(entity_addr_t::TYPE_LEGACY);
4842 }
4843 pending_inc.new_blocklist[a] = until;
4844 }
4845 return pending_inc.epoch;
4846 }
4847
4848 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4849 {
4850 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4851 a.set_type(entity_addr_t::TYPE_ANY);
4852 } else {
4853 a.set_type(entity_addr_t::TYPE_LEGACY);
4854 }
4855 dout(10) << "blocklist " << a << " until " << until << dendl;
4856 pending_inc.new_blocklist[a] = until;
4857 return pending_inc.epoch;
4858 }
4859
4860
4861 void OSDMonitor::check_osdmap_subs()
4862 {
4863 dout(10) << __func__ << dendl;
4864 if (!osdmap.get_epoch()) {
4865 return;
4866 }
4867 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4868 if (osdmap_subs == mon.session_map.subs.end()) {
4869 return;
4870 }
4871 auto p = osdmap_subs->second->begin();
4872 while (!p.end()) {
4873 auto sub = *p;
4874 ++p;
4875 check_osdmap_sub(sub);
4876 }
4877 }
4878
4879 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4880 {
4881 dout(10) << __func__ << " " << sub << " next " << sub->next
4882 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4883 if (sub->next <= osdmap.get_epoch()) {
4884 if (sub->next >= 1)
4885 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4886 else
4887 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4888 if (sub->onetime)
4889 mon.session_map.remove_sub(sub);
4890 else
4891 sub->next = osdmap.get_epoch() + 1;
4892 }
4893 }
4894
4895 void OSDMonitor::check_pg_creates_subs()
4896 {
4897 if (!osdmap.get_num_up_osds()) {
4898 return;
4899 }
4900 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4901 mon.with_session_map([this](const MonSessionMap& session_map) {
4902 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4903 if (pg_creates_subs == session_map.subs.end()) {
4904 return;
4905 }
4906 for (auto sub : *pg_creates_subs->second) {
4907 check_pg_creates_sub(sub);
4908 }
4909 });
4910 }
4911
4912 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4913 {
4914 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4915 ceph_assert(sub->type == "osd_pg_creates");
4916 // only send these if the OSD is up. we will check_subs() when they do
4917 // come up so they will get the creates then.
4918 if (sub->session->name.is_osd() &&
4919 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4920 sub->next = send_pg_creates(sub->session->name.num(),
4921 sub->session->con.get(),
4922 sub->next);
4923 }
4924 }
4925
// Enable an application (e.g. "rbd", "cephfs", "rgw") on a pool, and
// optionally set one application key/value pair, by staging an updated
// pg_pool_t in the pending incremental.
//
// @param pool_id    id of an existing pool (must exist; asserted)
// @param app_name   application to enable on the pool
// @param app_key    metadata key to set; empty means "just enable the app"
// @param app_value  value for app_key
// @param force      overwrite an existing key/value; without force an
//                   already-enabled application is left untouched
//
// Caller must have plugged paxos and hold a writeable map (asserted).
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
                                       const std::string &app_key,
                                       const std::string &app_value,
                                       bool force)
{
  ceph_assert(paxos.is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  // application metadata only exists on luminous+ maps
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the committed pool, but prefer a copy already staged in
  // this round's pending incremental so we don't clobber earlier edits
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // enable the application with no metadata; insert() is a no-op if
    // the application is already present
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // overwrite (or create) the key unconditionally
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // only takes effect if the application was not already enabled;
      // an existing application's metadata is deliberately left alone
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4959
4960 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4961 pool_opts_t::key_t opt,
4962 pool_opts_t::value_t val)
4963 {
4964 dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
4965 << " val: " << val << dendl;
4966 auto p = pending_inc.new_pools.try_emplace(
4967 pool_id, *osdmap.get_pg_pool(pool_id));
4968 p.first->second.opts.set(opt, val);
4969 }
4970
4971 unsigned OSDMonitor::scan_for_creating_pgs(
4972 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4973 const mempool::osdmap::set<int64_t>& removed_pools,
4974 utime_t modified,
4975 creating_pgs_t* creating_pgs) const
4976 {
4977 unsigned queued = 0;
4978 for (auto& p : pools) {
4979 int64_t poolid = p.first;
4980 if (creating_pgs->created_pools.count(poolid)) {
4981 dout(10) << __func__ << " already created " << poolid << dendl;
4982 continue;
4983 }
4984 const pg_pool_t& pool = p.second;
4985 int ruleno = pool.get_crush_rule();
4986 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4987 continue;
4988
4989 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4990 const auto created = pool.get_last_change();
4991 if (last_scan_epoch && created <= last_scan_epoch) {
4992 dout(10) << __func__ << " no change in pool " << poolid
4993 << " " << pool << dendl;
4994 continue;
4995 }
4996 if (removed_pools.count(poolid)) {
4997 dout(10) << __func__ << " pool is being removed: " << poolid
4998 << " " << pool << dendl;
4999 continue;
5000 }
5001 dout(10) << __func__ << " queueing pool create for " << poolid
5002 << " " << pool << dendl;
5003 creating_pgs->create_pool(poolid, pool.get_pg_num(),
5004 created, modified);
5005 queued++;
5006 }
5007 return queued;
5008 }
5009
// Rebuild creating_pgs_by_osd_epoch from creating_pgs using the current
// pg-to-osd mapping: for each pg still being created, decide which osd
// (acting primary) should receive the create message and at which map
// epoch, then swap the rebuilt index into place.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
           << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  // creating_pgs / creating_pgs_by_osd_epoch are shared with
  // send_pg_creates(); hold the lock for the whole rebuild
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    // the pool (or pg) may have been deleted since it was queued
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
               << dendl;
      continue;
    }
    // default epoch: the epoch the pg create was queued at
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
        if (pgs.second.count(spgid)) {
          if (last_acting_primary == acting_primary) {
            // same target as before: keep the epoch we already used so
            // the subscriber's cursor stays valid
            mapped = pgs.first;
          } else {
            dout(20) << __func__ << " " << pgid << " "
                     << " acting_primary:" << last_acting_primary
                     << " -> " << acting_primary << dendl;
            // note epoch if the target of the create message changed.
            mapped = mapping.get_epoch();
          }
          break;
        } else {
          // newly creating
          // NOTE(review): this runs for every non-matching entry, so
          // 'mapped' is reset to the mapping epoch unless/until the pg
          // is found above — the final value depends on loop order;
          // preserved as-is.
          mapped = mapping.get_epoch();
        }
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
             << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5057
// Send pending pg-create messages for osd 'osd' over 'con', starting at
// subscription cursor 'next' (an epoch).
//
// @return the epoch the subscriber is current through after this call:
//         'next' unchanged if nothing was (or could be) sent, otherwise
//         one past the newest epoch whose creates were included.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  // NOTE(review): this dout reads creating_pgs_by_osd_epoch before the
  // lock below is taken — racy in principle; confirm callers' locking
  // assumptions before relying on the logged value.
  dout(30) << __func__ << " osd." << osd << " next=" << next
           << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  // if the index has not been rebuilt since the last scan it may be
  // stale; skip sending — the subscriber will be poked again later
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
             << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  auto m = make_message<MOSDPGCreate2>(creating_pgs_epoch);

  epoch_t last = 0;
  // only epochs >= next are news to this subscriber
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      m->pgs.emplace(pg, make_pair(create->second.create_epoch,
                                   create->second.create_stamp));
      // include history/past_intervals when present (e.g. pgs created
      // by a split rather than a brand-new pool)
      if (create->second.history.epoch_created) {
        dout(20) << __func__ << " " << pg << " " << create->second.history
                 << " " << create->second.past_intervals << dendl;
        m->pg_extra.emplace(pg, make_pair(create->second.history,
                                          create->second.past_intervals));
      }
      dout(20) << __func__ << " will create " << pg
               << " at " << create->second.create_epoch << dendl;
    }
  }
  if (!m->pgs.empty()) {
    con->send_message2(std::move(m));
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5112
5113 // TICK
5114
5115
// Periodic housekeeping. Runs on every monitor (manifest reload, cache
// tuning); the rest — marking unresponsive osds down, auto-marking
// long-down osds out, expiring blocklist entries, pruning — runs only
// on the leader and triggers a paxos proposal when anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
               << " inc cache_bytes: " << inc_cache->get_cache_bytes()
               << " inc comtd_bytes: " << inc_cache->get_committed_size()
               << " inc used_bytes: " << inc_cache->_get_used_bytes()
               << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
               << dendl;
      dout(10) << "tick balancer "
               << " full cache_bytes: " << full_cache->get_cache_bytes()
               << " full comtd_bytes: " << full_cache->get_committed_size()
               << " full used_bytes: " << full_cache->_get_used_bytes()
               << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
               << dendl;
    }
  }

  // everything below mutates pending state; leader only
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down that have stopped sending beacons
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long osd 'o' has been down
      // advance before any possible erase of 'o' below
      ++i;

      if (osdmap.is_down(o) &&
          osdmap.is_in(o) &&
          can_mark_out(o)) {
        utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
        utime_t grace = orig_grace;
        double my_grace = 0.0;

        if (g_conf()->mon_osd_adjust_down_out_interval) {
          // scale grace period the same way we do the heartbeat grace.
          const osd_xinfo_t& xi = osdmap.get_xinfo(o);
          double halflife = (double)g_conf()->mon_osd_laggy_halflife;
          double decay_k = ::log(.5) / halflife;
          double decay = exp((double)down * decay_k);
          dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
                   << " down for " << down << " decay " << decay << dendl;
          my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
          grace += my_grace;
        }

        // is this an entire large subtree down?
        if (down_out_subtree_limit.length()) {
          int type = osdmap.crush->get_type_id(down_out_subtree_limit);
          if (type > 0) {
            if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
              dout(10) << "tick entire containing " << down_out_subtree_limit
                       << " subtree for osd." << o
                       << " is down; resetting timer" << dendl;
              // reset timer, too.
              down_pending_out[o] = now;
              continue;
            }
          }
        }

        // destroyed osds get their own (usually shorter) out interval
        bool down_out = !osdmap.is_destroyed(o) &&
          g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
        bool destroyed_out = osdmap.is_destroyed(o) &&
          g_conf()->mon_osd_destroyed_out_interval > 0 &&
          // this is not precise enough as we did not make a note when this osd
          // was marked as destroyed, but let's not bother with that
          // complexity for now.
          down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
        if (down_out || destroyed_out) {
          dout(10) << "tick marking osd." << o << " OUT after " << down
                   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
          pending_inc.new_weight[o] = CEPH_OSD_OUT;

          // set the AUTOOUT bit.
          if (pending_inc.new_state.count(o) == 0)
            pending_inc.new_state[o] = 0;
          pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

          // remember previous weight
          if (pending_inc.new_xinfo.count(o) == 0)
            pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
          pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

          do_propose = true;

          mon.clog->info() << "Marking osd." << o << " out (has been down for "
                           << int(down.sec()) << " seconds)";
        } else
          continue;  // keep the timer running; not out yet
      }

      // either the osd came back / went out, or we just marked it out
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }
  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring range_blocklist item " << p->first
               << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_range_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5290
5291 void OSDMonitor::_set_new_cache_sizes()
5292 {
5293 uint64_t cache_size = 0;
5294 int64_t inc_alloc = 0;
5295 int64_t full_alloc = 0;
5296 int64_t kv_alloc = 0;
5297
5298 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5299 cache_size = pcm->get_tuned_mem();
5300 inc_alloc = inc_cache->get_committed_size();
5301 full_alloc = full_cache->get_committed_size();
5302 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5303 }
5304
5305 inc_osd_cache.set_bytes(inc_alloc);
5306 full_osd_cache.set_bytes(full_alloc);
5307
5308 dout(1) << __func__ << " cache_size:" << cache_size
5309 << " inc_alloc: " << inc_alloc
5310 << " full_alloc: " << full_alloc
5311 << " kv_alloc: " << kv_alloc
5312 << dendl;
5313 }
5314
// Mark down any up osd that has not sent a beacon recently.
//
// @param now              current time
// @param last_osd_report  per-osd (time of last report, reported beacon
//                         interval) bookkeeping; entries are created,
//                         consulted, and pruned here
// @return true if at least one osd was queued to be marked down (the
//         caller should propose)
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
                                     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;  // already down; nothing to time out
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
        mon.clog->info() << "osd." << i << " marked down after no beacon for "
                         << diff << " seconds";
        derr << "no beacon from osd." << i << " since " << t->second.first
             << ", " << diff << " seconds ago. marking down" << dendl;
        // queue the mark-down; NOTE(review): incremental state bits are
        // applied as an XOR, so setting CEPH_OSD_UP here flips the osd's
        // up bit off — confirm against OSDMap::Incremental semantics.
        pending_inc.new_state[i] = CEPH_OSD_UP;
        new_down = true;
      }
    }
  }
  return new_down;
}
5358
5359 static void dump_cpu_list(Formatter *f, const char *name,
5360 const string& strlist)
5361 {
5362 cpu_set_t cpu_set;
5363 size_t cpu_set_size;
5364 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5365 return;
5366 }
5367 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5368 f->open_array_section(name);
5369 for (auto cpu : cpus) {
5370 f->dump_int("cpu", cpu);
5371 }
5372 f->close_section();
5373 }
5374
// Dump monitor-side osdmap state for debugging/introspection: the map
// itself, per-osd metadata, clean-epoch bookkeeping, committed version
// range, crush map, and (if present) the osdmap manifest. Section
// order is part of the output format; do not reorder.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f, cct);
  f->close_section();

  // per-osd metadata for every existing osd id
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // only present when the monitor maintains a pinned-map manifest
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5423
namespace {
  // every pool property that "osd pool get" can report; order and
  // values are relied upon by the command handlers below.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // set difference: everything in 'first' that is not in 'second'.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
                               const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> result;
      // both inputs are ordered, so appending at end() keeps this linear
      for (const auto& choice : first) {
        if (!second.count(choice)) {
          result.insert(result.end(), choice);
        }
      }
      return result;
    }
}
5458
5459
5460 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5461 {
5462 op->mark_osdmon_event(__func__);
5463 auto m = op->get_req<MMonCommand>();
5464 int r = 0;
5465 bufferlist rdata;
5466 stringstream ss, ds;
5467
5468 cmdmap_t cmdmap;
5469 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5470 string rs = ss.str();
5471 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5472 return true;
5473 }
5474
5475 MonSession *session = op->get_session();
5476 if (!session) {
5477 derr << __func__ << " no session" << dendl;
5478 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5479 return true;
5480 }
5481
5482 string prefix;
5483 cmd_getval(cmdmap, "prefix", prefix);
5484
5485 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5486 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5487
5488 if (prefix == "osd stat") {
5489 if (f) {
5490 f->open_object_section("osdmap");
5491 osdmap.print_summary(f.get(), ds, "", true);
5492 f->close_section();
5493 f->flush(rdata);
5494 } else {
5495 osdmap.print_summary(nullptr, ds, "", true);
5496 rdata.append(ds);
5497 }
5498 }
5499 else if (prefix == "osd dump" ||
5500 prefix == "osd tree" ||
5501 prefix == "osd tree-from" ||
5502 prefix == "osd ls" ||
5503 prefix == "osd getmap" ||
5504 prefix == "osd getcrushmap" ||
5505 prefix == "osd ls-tree" ||
5506 prefix == "osd info") {
5507
5508 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5509 bufferlist osdmap_bl;
5510 int err = get_version_full(epoch, osdmap_bl);
5511 if (err == -ENOENT) {
5512 r = -ENOENT;
5513 ss << "there is no map for epoch " << epoch;
5514 goto reply;
5515 }
5516 ceph_assert(err == 0);
5517 ceph_assert(osdmap_bl.length());
5518
5519 OSDMap *p;
5520 if (epoch == osdmap.get_epoch()) {
5521 p = &osdmap;
5522 } else {
5523 p = new OSDMap;
5524 p->decode(osdmap_bl);
5525 }
5526
5527 auto sg = make_scope_guard([&] {
5528 if (p != &osdmap) {
5529 delete p;
5530 }
5531 });
5532
5533 if (prefix == "osd dump") {
5534 stringstream ds;
5535 if (f) {
5536 f->open_object_section("osdmap");
5537 p->dump(f.get(), cct);
5538 f->close_section();
5539 f->flush(ds);
5540 } else {
5541 p->print(cct, ds);
5542 }
5543 rdata.append(ds);
5544 if (!f)
5545 ds << " ";
5546 } else if (prefix == "osd ls") {
5547 if (f) {
5548 f->open_array_section("osds");
5549 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5550 if (osdmap.exists(i)) {
5551 f->dump_int("osd", i);
5552 }
5553 }
5554 f->close_section();
5555 f->flush(ds);
5556 } else {
5557 bool first = true;
5558 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5559 if (osdmap.exists(i)) {
5560 if (!first)
5561 ds << "\n";
5562 first = false;
5563 ds << i;
5564 }
5565 }
5566 }
5567 rdata.append(ds);
5568 } else if (prefix == "osd info") {
5569 int64_t osd_id;
5570 bool do_single_osd = true;
5571 if (!cmd_getval(cmdmap, "id", osd_id)) {
5572 do_single_osd = false;
5573 }
5574
5575 if (do_single_osd && !osdmap.exists(osd_id)) {
5576 ss << "osd." << osd_id << " does not exist";
5577 r = -EINVAL;
5578 goto reply;
5579 }
5580
5581 if (f) {
5582 if (do_single_osd) {
5583 osdmap.dump_osd(osd_id, f.get());
5584 } else {
5585 osdmap.dump_osds(f.get());
5586 }
5587 f->flush(ds);
5588 } else {
5589 if (do_single_osd) {
5590 osdmap.print_osd(osd_id, ds);
5591 } else {
5592 osdmap.print_osds(ds);
5593 }
5594 }
5595 rdata.append(ds);
5596 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5597 string bucket;
5598 if (prefix == "osd tree-from") {
5599 cmd_getval(cmdmap, "bucket", bucket);
5600 if (!osdmap.crush->name_exists(bucket)) {
5601 ss << "bucket '" << bucket << "' does not exist";
5602 r = -ENOENT;
5603 goto reply;
5604 }
5605 int id = osdmap.crush->get_item_id(bucket);
5606 if (id >= 0) {
5607 ss << "\"" << bucket << "\" is not a bucket";
5608 r = -EINVAL;
5609 goto reply;
5610 }
5611 }
5612
5613 vector<string> states;
5614 cmd_getval(cmdmap, "states", states);
5615 unsigned filter = 0;
5616 for (auto& s : states) {
5617 if (s == "up") {
5618 filter |= OSDMap::DUMP_UP;
5619 } else if (s == "down") {
5620 filter |= OSDMap::DUMP_DOWN;
5621 } else if (s == "in") {
5622 filter |= OSDMap::DUMP_IN;
5623 } else if (s == "out") {
5624 filter |= OSDMap::DUMP_OUT;
5625 } else if (s == "destroyed") {
5626 filter |= OSDMap::DUMP_DESTROYED;
5627 } else {
5628 ss << "unrecognized state '" << s << "'";
5629 r = -EINVAL;
5630 goto reply;
5631 }
5632 }
5633 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5634 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5635 ss << "cannot specify both 'in' and 'out'";
5636 r = -EINVAL;
5637 goto reply;
5638 }
5639 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5640 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5641 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5642 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5643 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5644 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5645 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5646 r = -EINVAL;
5647 goto reply;
5648 }
5649 if (f) {
5650 f->open_object_section("tree");
5651 p->print_tree(f.get(), NULL, filter, bucket);
5652 f->close_section();
5653 f->flush(ds);
5654 } else {
5655 p->print_tree(NULL, &ds, filter, bucket);
5656 }
5657 rdata.append(ds);
5658 } else if (prefix == "osd getmap") {
5659 rdata.append(osdmap_bl);
5660 ss << "got osdmap epoch " << p->get_epoch();
5661 } else if (prefix == "osd getcrushmap") {
5662 p->crush->encode(rdata, mon.get_quorum_con_features());
5663 ss << p->get_crush_version();
5664 } else if (prefix == "osd ls-tree") {
5665 string bucket_name;
5666 cmd_getval(cmdmap, "name", bucket_name);
5667 set<int> osds;
5668 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5669 if (r == -ENOENT) {
5670 ss << "\"" << bucket_name << "\" does not exist";
5671 goto reply;
5672 } else if (r < 0) {
5673 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5674 goto reply;
5675 }
5676
5677 if (f) {
5678 f->open_array_section("osds");
5679 for (auto &i : osds) {
5680 if (osdmap.exists(i)) {
5681 f->dump_int("osd", i);
5682 }
5683 }
5684 f->close_section();
5685 f->flush(ds);
5686 } else {
5687 bool first = true;
5688 for (auto &i : osds) {
5689 if (osdmap.exists(i)) {
5690 if (!first)
5691 ds << "\n";
5692 first = false;
5693 ds << i;
5694 }
5695 }
5696 }
5697
5698 rdata.append(ds);
5699 }
5700 } else if (prefix == "osd getmaxosd") {
5701 if (f) {
5702 f->open_object_section("getmaxosd");
5703 f->dump_unsigned("epoch", osdmap.get_epoch());
5704 f->dump_int("max_osd", osdmap.get_max_osd());
5705 f->close_section();
5706 f->flush(rdata);
5707 } else {
5708 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5709 rdata.append(ds);
5710 }
5711 } else if (prefix == "osd utilization") {
5712 string out;
5713 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5714 if (f)
5715 f->flush(rdata);
5716 else
5717 rdata.append(out);
5718 r = 0;
5719 goto reply;
5720 } else if (prefix == "osd find") {
5721 int64_t osd;
5722 if (!cmd_getval(cmdmap, "id", osd)) {
5723 ss << "unable to parse osd id value '"
5724 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5725 r = -EINVAL;
5726 goto reply;
5727 }
5728 if (!osdmap.exists(osd)) {
5729 ss << "osd." << osd << " does not exist";
5730 r = -ENOENT;
5731 goto reply;
5732 }
5733 string format;
5734 cmd_getval(cmdmap, "format", format);
5735 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5736 f->open_object_section("osd_location");
5737 f->dump_int("osd", osd);
5738 f->dump_object("addrs", osdmap.get_addrs(osd));
5739 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5740
5741 // try to identify host, pod/container name, etc.
5742 map<string,string> m;
5743 load_metadata(osd, m, nullptr);
5744 if (auto p = m.find("hostname"); p != m.end()) {
5745 f->dump_string("host", p->second);
5746 }
5747 for (auto& k : {
5748 "pod_name", "pod_namespace", // set by rook
5749 "container_name" // set by cephadm, ceph-ansible
5750 }) {
5751 if (auto p = m.find(k); p != m.end()) {
5752 f->dump_string(k, p->second);
5753 }
5754 }
5755
5756 // crush is helpful too
5757 f->open_object_section("crush_location");
5758 map<string,string> loc = osdmap.crush->get_full_location(osd);
5759 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5760 f->dump_string(p->first.c_str(), p->second);
5761 f->close_section();
5762 f->close_section();
5763 f->flush(rdata);
5764 } else if (prefix == "osd metadata") {
5765 int64_t osd = -1;
5766 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5767 !cmd_getval(cmdmap, "id", osd)) {
5768 ss << "unable to parse osd id value '"
5769 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5770 r = -EINVAL;
5771 goto reply;
5772 }
5773 if (osd >= 0 && !osdmap.exists(osd)) {
5774 ss << "osd." << osd << " does not exist";
5775 r = -ENOENT;
5776 goto reply;
5777 }
5778 string format;
5779 cmd_getval(cmdmap, "format", format);
5780 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5781 if (osd >= 0) {
5782 f->open_object_section("osd_metadata");
5783 f->dump_unsigned("id", osd);
5784 r = dump_osd_metadata(osd, f.get(), &ss);
5785 if (r < 0)
5786 goto reply;
5787 f->close_section();
5788 } else {
5789 r = 0;
5790 f->open_array_section("osd_metadata");
5791 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5792 if (osdmap.exists(i)) {
5793 f->open_object_section("osd");
5794 f->dump_unsigned("id", i);
5795 r = dump_osd_metadata(i, f.get(), NULL);
5796 if (r == -EINVAL || r == -ENOENT) {
5797 // Drop error, continue to get other daemons' metadata
5798 dout(4) << "No metadata for osd." << i << dendl;
5799 r = 0;
5800 } else if (r < 0) {
5801 // Unexpected error
5802 goto reply;
5803 }
5804 f->close_section();
5805 }
5806 }
5807 f->close_section();
5808 }
5809 f->flush(rdata);
5810 } else if (prefix == "osd versions") {
5811 if (!f)
5812 f.reset(Formatter::create("json-pretty"));
5813 count_metadata("ceph_version", f.get());
5814 f->flush(rdata);
5815 r = 0;
5816 } else if (prefix == "osd count-metadata") {
5817 if (!f)
5818 f.reset(Formatter::create("json-pretty"));
5819 string field;
5820 cmd_getval(cmdmap, "property", field);
5821 count_metadata(field, f.get());
5822 f->flush(rdata);
5823 r = 0;
} else if (prefix == "osd numa-status") {
  // Report NUMA placement per OSD (network / objectstore node, affinity,
  // CPU list) either as structured formatter output or as a text table.
  TextTable tbl;
  if (f) {
    f->open_array_section("osds");
  } else {
    tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
  }
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      map<string,string> m;
      ostringstream err;
      // OSDs whose metadata cannot be loaded are silently skipped.
      if (load_metadata(i, m, &err) < 0) {
        continue;
      }
      string host;
      auto p = m.find("hostname");
      if (p != m.end()) {
        host = p->second;
      }
      if (f) {
        f->open_object_section("osd");
        f->dump_int("osd", i);
        f->dump_string("host", host);
        // Single-node properties: emitted as plain ints when present.
        for (auto n : { "network_numa_node", "objectstore_numa_node",
              "numa_node" }) {
          p = m.find(n);
          if (p != m.end()) {
            f->dump_int(n, atoi(p->second.c_str()));
          }
        }
        // Multi-node properties: comma-separated lists become int arrays.
        for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
          p = m.find(n);
          if (p != m.end()) {
            list<string> ls = get_str_list(p->second, ",");
            f->open_array_section(n);
            for (auto node : ls) {
              f->dump_int("node", atoi(node.c_str()));
            }
            f->close_section();
          }
        }
        for (auto n : { "numa_node_cpus" }) {
          p = m.find(n);
          if (p != m.end()) {
            dump_cpu_list(f.get(), n, p->second);
          }
        }
        f->close_section();
      } else {
        // Text table row; "-" marks unknown values.
        tbl << i;
        tbl << host;
        p = m.find("network_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        p = m.find("objectstore_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        // AFFINITY and CPUS are only shown when both are known.
        p = m.find("numa_node");
        auto q = m.find("numa_node_cpus");
        if (p != m.end() && q != m.end()) {
          tbl << p->second;
          tbl << q->second;
        } else {
          tbl << "-";
          tbl << "-";
        }
        tbl << TextTable::endrow;
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(stringify(tbl));
  }
} else if (prefix == "osd map") {
  // Map a (pool, object[, namespace]) triple to its PG and the current
  // up/acting OSD sets.
  string poolstr, objstr, namespacestr;
  cmd_getval(cmdmap, "pool", poolstr);
  cmd_getval(cmdmap, "object", objstr);
  cmd_getval(cmdmap, "nspace", namespacestr);

  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "pool " << poolstr << " does not exist";
    r = -ENOENT;
    goto reply;
  }
  object_locator_t oloc(pool, namespacestr);
  object_t oid(objstr);
  pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
  // Normalize the raw pgid onto the pool's actual PG set.
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  vector<int> up, acting;
  int up_p, acting_p;
  osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);

  // Present the object as "namespace/name" when a namespace was given.
  string fullobjname;
  if (!namespacestr.empty())
    fullobjname = namespacestr + string("/") + oid.name;
  else
    fullobjname = oid.name;
  if (f) {
    f->open_object_section("osd_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_string("pool", poolstr);
    f->dump_int("pool_id", pool);
    f->dump_stream("objname") << fullobjname;
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("up_primary", up_p);
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("acting_primary", acting_p);
    f->close_section(); // osd_map
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pool '" << poolstr << "' (" << pool << ")"
       << " object '" << fullobjname << "' ->"
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
       << pg_vector_string(acting) << ", p" << acting_p << ")";
    rdata.append(ds);
  }

} else if (prefix == "pg map") {
  // Show the up/acting OSD sets for an explicitly named pgid.
  pg_t pgid;
  vector<int> up, acting;
  r = parse_pgid(cmdmap, ss, pgid);
  if (r < 0)
    goto reply;
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  osdmap.pg_to_up_acting_osds(pgid, up, acting);
  if (f) {
    f->open_object_section("pg_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (auto osd : up) {
      f->dump_int("up_osd", osd);
    }
    f->close_section();
    f->open_array_section("acting");
    for (auto osd : acting) {
      f->dump_int("acting_osd", osd);
    }
    f->close_section();
    f->close_section();
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up " << up << " acting " << acting;
    rdata.append(ds);
  }
  goto reply;

} else if (prefix == "osd lspools") {
  // List all pools as "id name" pairs.
  if (f)
    f->open_array_section("pools");
  for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
       p != osdmap.pools.end();
       ++p) {
    if (f) {
      f->open_object_section("pool");
      f->dump_int("poolnum", p->first);
      f->dump_string("poolname", osdmap.pool_name[p->first]);
      f->close_section();
    } else {
      ds << p->first << ' ' << osdmap.pool_name[p->first];
      // newline between entries but not after the final one
      if (next(p) != osdmap.pools.end()) {
        ds << '\n';
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(ds);
  }
  rdata.append(ds);
} else if (prefix == "osd blocklist ls" ||
           prefix == "osd blacklist ls") {
  // Dump the per-address blocklist followed by the range blocklist.
  // ("blacklist" is the legacy spelling of the same command.)
  if (f)
    f->open_array_section("blocklist");

  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (f) {
      f->open_object_section("entry");
      f->dump_string("addr", p->first.get_legacy_str());
      f->dump_stream("until") << p->second;
      f->close_section();
    } else {
      // NOTE: this local ss intentionally shadows the outer ss; it is only
      // used to render "addr until" for one entry.
      stringstream ss;
      string s;
      ss << p->first << " " << p->second;
      getline(ss, s);
      s += "\n";
      rdata.append(s);
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  }
  if (f)
    f->open_array_section("range_blocklist");

  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (f) {
      f->open_object_section("entry");
      f->dump_string("range", p->first.get_legacy_str());
      f->dump_stream("until") << p->second;
      f->close_section();
    } else {
      // local ss shadows the outer ss, as above
      stringstream ss;
      string s;
      ss << p->first << " " << p->second;
      getline(ss, s);
      s += "\n";
      rdata.append(s);
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  }
  // Human-readable summary goes to the outer ss (the command status string).
  ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";

} else if (prefix == "osd pool ls") {
  // List pool names; with "detail", include the full pool dump (and the
  // read-balance score when a formatter is in use).
  string detail;
  cmd_getval(cmdmap, "detail", detail);
  if (!f && detail == "detail") {
    // plain-text detail listing has its own printer
    ostringstream ss;
    osdmap.print_pools(cct, ss);
    rdata.append(ss.str());
  } else {
    if (f)
      f->open_array_section("pools");
    for (auto &[pid, pdata] : osdmap.get_pools()) {
      if (f) {
        if (detail == "detail") {
          f->open_object_section("pool");
          f->dump_int("pool_id", pid);
          f->dump_string("pool_name", osdmap.get_pool_name(pid));
          pdata.dump(f.get());
          osdmap.dump_read_balance_score(cct, pid, pdata, f.get());
          f->close_section();
        } else {
          f->dump_string("pool_name", osdmap.get_pool_name(pid));
        }
      } else {
        rdata.append(osdmap.get_pool_name(pid) + "\n");
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    }
  }

} else if (prefix == "osd crush get-tunable") {
  // Read a single CRUSH tunable; only "straw_calc_version" is supported.
  string tunable;
  cmd_getval(cmdmap, "tunable", tunable);
  ostringstream rss;
  if (f)
    f->open_object_section("tunable");
  if (tunable == "straw_calc_version") {
    if (f)
      f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
    else
      rss << osdmap.crush->get_straw_calc_version() << "\n";
  } else {
    // unknown tunable name
    r = -EINVAL;
    goto reply;
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(rss.str());
  }
  r = 0;

} else if (prefix == "osd pool get") {
  // "osd pool get <pool> <var>": read one pool property, or all applicable
  // properties when var == "all".
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    r = -ENOENT;
    goto reply;
  }

  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  string var;
  cmd_getval(cmdmap, "var", var);

  // Map from user-visible property name to the osd_pool_get_choices enum.
  typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
  const choices_map_t ALL_CHOICES = {
    {"size", SIZE},
    {"min_size", MIN_SIZE},
    {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
    {"crush_rule", CRUSH_RULE},
    {"hashpspool", HASHPSPOOL},
    {"eio", POOL_EIO},
    {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
    {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
    {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
    {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
    {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
    {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
    {"use_gmt_hitset", USE_GMT_HITSET},
    {"target_max_objects", TARGET_MAX_OBJECTS},
    {"target_max_bytes", TARGET_MAX_BYTES},
    {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
    {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
    {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
    {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
    {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
    {"erasure_code_profile", ERASURE_CODE_PROFILE},
    {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
    {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
    {"fast_read", FAST_READ},
    {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
    {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
    {"scrub_min_interval", SCRUB_MIN_INTERVAL},
    {"scrub_max_interval", SCRUB_MAX_INTERVAL},
    {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
    {"recovery_priority", RECOVERY_PRIORITY},
    {"recovery_op_priority", RECOVERY_OP_PRIORITY},
    {"scrub_priority", SCRUB_PRIORITY},
    {"compression_mode", COMPRESSION_MODE},
    {"compression_algorithm", COMPRESSION_ALGORITHM},
    {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
    {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
    {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
    {"csum_type", CSUM_TYPE},
    {"csum_max_block", CSUM_MAX_BLOCK},
    {"csum_min_block", CSUM_MIN_BLOCK},
    {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
    {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
    {"pg_num_min", PG_NUM_MIN},
    {"pg_num_max", PG_NUM_MAX},
    {"target_size_bytes", TARGET_SIZE_BYTES},
    {"target_size_ratio", TARGET_SIZE_RATIO},
    {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
    {"dedup_tier", DEDUP_TIER},
    {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
    {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
    {"bulk", BULK}
  };

  // Decide which choices to print, validating that tier-only / erasure-only
  // properties are not requested on pools of the wrong type.
  typedef std::set<osd_pool_get_choices> choices_set_t;

  // Properties that only make sense on cache-tier pools.
  const choices_set_t ONLY_TIER_CHOICES = {
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
  };
  // Properties that only make sense on erasure-coded pools.
  const choices_set_t ONLY_ERASURE_CHOICES = {
    EC_OVERWRITES, ERASURE_CODE_PROFILE
  };

  choices_set_t selected_choices;
  if (var == "all") {
    // Start from every known property ...
    for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
        it != ALL_CHOICES.end(); ++it) {
      selected_choices.insert(it->second);
    }

    // ... then drop the ones that do not apply to this pool's type.
    if(!p->is_tier()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_TIER_CHOICES);
    }

    if(!p->is_erasure()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_ERASURE_CHOICES);
    }
  } else /* var != "all" */ {
    choices_map_t::const_iterator found = ALL_CHOICES.find(var);
    if (found == ALL_CHOICES.end()) {
      ss << "pool '" << poolstr
         << "': invalid variable: '" << var << "'";
      r = -EINVAL;
      goto reply;
    }

    osd_pool_get_choices selected = found->second;

    if (!p->is_tier() &&
        ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
      ss << "pool '" << poolstr
         << "' is not a tier pool: variable not applicable";
      r = -EACCES;
      goto reply;
    }

    if (!p->is_erasure() &&
        ONLY_ERASURE_CHOICES.find(selected)
        != ONLY_ERASURE_CHOICES.end()) {
      ss << "pool '" << poolstr
         << "' is not a erasure pool: variable not applicable";
      r = -EACCES;
      goto reply;
    }

    // pool_opts-backed properties must actually be set to be readable
    if (pool_opts_t::is_opt_name(var) &&
        !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
      ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
      r = -ENOENT;
      goto reply;
    }

    selected_choices.insert(selected);
  }

  // Structured (formatter) output of the selected properties.
  if (f) {
    f->open_object_section("pool");
    f->dump_string("pool", poolstr);
    f->dump_int("pool_id", pool);
    for(choices_set_t::const_iterator it = selected_choices.begin();
        it != selected_choices.end(); ++it) {
      // Reverse-lookup the user-visible name for this choice.
      choices_map_t::const_iterator i;
      for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
        if (i->second == *it) {
          break;
        }
      }
      ceph_assert(i != ALL_CHOICES.end());
      switch(*it) {
        case PG_NUM:
          f->dump_int("pg_num", p->get_pg_num());
          break;
        case PGP_NUM:
          f->dump_int("pgp_num", p->get_pgp_num());
          break;
        case SIZE:
          f->dump_int("size", p->get_size());
          break;
        case MIN_SIZE:
          f->dump_int("min_size", p->get_min_size());
          break;
        case CRUSH_RULE:
          // prefer the rule name; fall back to the numeric id
          if (osdmap.crush->rule_exists(p->get_crush_rule())) {
            f->dump_string("crush_rule", osdmap.crush->get_rule_name(
                             p->get_crush_rule()));
          } else {
            f->dump_string("crush_rule", stringify(p->get_crush_rule()));
          }
          break;
        case EC_OVERWRITES:
          f->dump_bool("allow_ec_overwrites",
                       p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
          break;
        case PG_AUTOSCALE_MODE:
          f->dump_string("pg_autoscale_mode",
                         pg_pool_t::get_pg_autoscale_mode_name(
                           p->pg_autoscale_mode));
          break;
        // boolean pool flags, all dumped via the flag name
        case HASHPSPOOL:
        case POOL_EIO:
        case NODELETE:
        case BULK:
        case NOPGCHANGE:
        case NOSIZECHANGE:
        case WRITE_FADVISE_DONTNEED:
        case NOSCRUB:
        case NODEEP_SCRUB:
          f->dump_bool(i->first.c_str(),
                       p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
          break;
        case HIT_SET_PERIOD:
          f->dump_int("hit_set_period", p->hit_set_period);
          break;
        case HIT_SET_COUNT:
          f->dump_int("hit_set_count", p->hit_set_count);
          break;
        case HIT_SET_TYPE:
          f->dump_string("hit_set_type",
                         HitSet::get_type_name(p->hit_set_params.get_type()));
          break;
        case HIT_SET_FPP:
          {
            // false-positive probability only exists for bloom hit sets;
            // when explicitly requested on another type it is an error,
            // but with var == "all" it is just omitted.
            if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
              BloomHitSet::Params *bloomp =
                static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
              f->dump_float("hit_set_fpp", bloomp->get_fpp());
            } else if(var != "all") {
              f->close_section();
              ss << "hit set is not of type Bloom; " <<
                "invalid to get a false positive rate!";
              r = -EINVAL;
              goto reply;
            }
          }
          break;
        case USE_GMT_HITSET:
          f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
          break;
        case TARGET_MAX_OBJECTS:
          f->dump_unsigned("target_max_objects", p->target_max_objects);
          break;
        case TARGET_MAX_BYTES:
          f->dump_unsigned("target_max_bytes", p->target_max_bytes);
          break;
        // cache ratios are stored in micro units; dump both forms
        case CACHE_TARGET_DIRTY_RATIO:
          f->dump_unsigned("cache_target_dirty_ratio_micro",
                           p->cache_target_dirty_ratio_micro);
          f->dump_float("cache_target_dirty_ratio",
                        ((float)p->cache_target_dirty_ratio_micro/1000000));
          break;
        case CACHE_TARGET_DIRTY_HIGH_RATIO:
          f->dump_unsigned("cache_target_dirty_high_ratio_micro",
                           p->cache_target_dirty_high_ratio_micro);
          f->dump_float("cache_target_dirty_high_ratio",
                        ((float)p->cache_target_dirty_high_ratio_micro/1000000));
          break;
        case CACHE_TARGET_FULL_RATIO:
          f->dump_unsigned("cache_target_full_ratio_micro",
                           p->cache_target_full_ratio_micro);
          f->dump_float("cache_target_full_ratio",
                        ((float)p->cache_target_full_ratio_micro/1000000));
          break;
        case CACHE_MIN_FLUSH_AGE:
          f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
          break;
        case CACHE_MIN_EVICT_AGE:
          f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
          break;
        case ERASURE_CODE_PROFILE:
          f->dump_string("erasure_code_profile", p->erasure_code_profile);
          break;
        case MIN_READ_RECENCY_FOR_PROMOTE:
          f->dump_int("min_read_recency_for_promote",
                      p->min_read_recency_for_promote);
          break;
        case MIN_WRITE_RECENCY_FOR_PROMOTE:
          f->dump_int("min_write_recency_for_promote",
                      p->min_write_recency_for_promote);
          break;
        case FAST_READ:
          f->dump_int("fast_read", p->fast_read);
          break;
        case HIT_SET_GRADE_DECAY_RATE:
          f->dump_int("hit_set_grade_decay_rate",
                      p->hit_set_grade_decay_rate);
          break;
        case HIT_SET_SEARCH_LAST_N:
          f->dump_int("hit_set_search_last_n",
                      p->hit_set_search_last_n);
          break;
        // everything below is stored in pool_opts_t and only dumped if set
        case SCRUB_MIN_INTERVAL:
        case SCRUB_MAX_INTERVAL:
        case DEEP_SCRUB_INTERVAL:
        case RECOVERY_PRIORITY:
        case RECOVERY_OP_PRIORITY:
        case SCRUB_PRIORITY:
        case COMPRESSION_MODE:
        case COMPRESSION_ALGORITHM:
        case COMPRESSION_REQUIRED_RATIO:
        case COMPRESSION_MAX_BLOB_SIZE:
        case COMPRESSION_MIN_BLOB_SIZE:
        case CSUM_TYPE:
        case CSUM_MAX_BLOCK:
        case CSUM_MIN_BLOCK:
        case FINGERPRINT_ALGORITHM:
        case PG_NUM_MIN:
        case PG_NUM_MAX:
        case TARGET_SIZE_BYTES:
        case TARGET_SIZE_RATIO:
        case PG_AUTOSCALE_BIAS:
        case DEDUP_TIER:
        case DEDUP_CHUNK_ALGORITHM:
        case DEDUP_CDC_CHUNK_SIZE:
          pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
          if (p->opts.is_set(key)) {
            // csum_type is stored numerically but shown symbolically
            if(*it == CSUM_TYPE) {
              int64_t val;
              p->opts.get(pool_opts_t::CSUM_TYPE, &val);
              f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
            } else {
              p->opts.dump(i->first, f.get());
            }
          }
          break;
      }
    }
    f->close_section();
    f->flush(rdata);
  // Plain-text output of the selected properties: one "name: value" line
  // per choice, appended to rdata as it is produced.
  } else /* !f */ {
    for(choices_set_t::const_iterator it = selected_choices.begin();
        it != selected_choices.end(); ++it) {
      choices_map_t::const_iterator i;
      switch(*it) {
        case PG_NUM:
          ss << "pg_num: " << p->get_pg_num() << "\n";
          break;
        case PGP_NUM:
          ss << "pgp_num: " << p->get_pgp_num() << "\n";
          break;
        case SIZE:
          ss << "size: " << p->get_size() << "\n";
          break;
        case MIN_SIZE:
          ss << "min_size: " << p->get_min_size() << "\n";
          break;
        case CRUSH_RULE:
          // prefer the rule name; fall back to the numeric id
          if (osdmap.crush->rule_exists(p->get_crush_rule())) {
            ss << "crush_rule: " << osdmap.crush->get_rule_name(
                 p->get_crush_rule()) << "\n";
          } else {
            ss << "crush_rule: " << p->get_crush_rule() << "\n";
          }
          break;
        case PG_AUTOSCALE_MODE:
          ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
            p->pg_autoscale_mode) <<"\n";
          break;
        case HIT_SET_PERIOD:
          ss << "hit_set_period: " << p->hit_set_period << "\n";
          break;
        case HIT_SET_COUNT:
          ss << "hit_set_count: " << p->hit_set_count << "\n";
          break;
        case HIT_SET_TYPE:
          ss << "hit_set_type: " <<
            HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
          break;
        case HIT_SET_FPP:
          {
            // only bloom hit sets have an fpp; explicit requests on other
            // types are errors, var == "all" just skips it
            if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
              BloomHitSet::Params *bloomp =
                static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
              ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
            } else if(var != "all") {
              ss << "hit set is not of type Bloom; " <<
                "invalid to get a false positive rate!";
              r = -EINVAL;
              goto reply;
            }
          }
          break;
        case USE_GMT_HITSET:
          ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
          break;
        case TARGET_MAX_OBJECTS:
          ss << "target_max_objects: " << p->target_max_objects << "\n";
          break;
        case TARGET_MAX_BYTES:
          ss << "target_max_bytes: " << p->target_max_bytes << "\n";
          break;
        // ratios are stored in micro units; print the fractional form
        case CACHE_TARGET_DIRTY_RATIO:
          ss << "cache_target_dirty_ratio: "
             << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
          break;
        case CACHE_TARGET_DIRTY_HIGH_RATIO:
          ss << "cache_target_dirty_high_ratio: "
             << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
          break;
        case CACHE_TARGET_FULL_RATIO:
          ss << "cache_target_full_ratio: "
             << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
          break;
        case CACHE_MIN_FLUSH_AGE:
          ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
          break;
        case CACHE_MIN_EVICT_AGE:
          ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
          break;
        case ERASURE_CODE_PROFILE:
          ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
          break;
        case MIN_READ_RECENCY_FOR_PROMOTE:
          ss << "min_read_recency_for_promote: " <<
            p->min_read_recency_for_promote << "\n";
          break;
        case HIT_SET_GRADE_DECAY_RATE:
          ss << "hit_set_grade_decay_rate: " <<
            p->hit_set_grade_decay_rate << "\n";
          break;
        case HIT_SET_SEARCH_LAST_N:
          ss << "hit_set_search_last_n: " <<
            p->hit_set_search_last_n << "\n";
          break;
        case EC_OVERWRITES:
          ss << "allow_ec_overwrites: " <<
            (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
            "\n";
          break;
        // boolean pool flags: reverse-lookup the name, print true/false
        case HASHPSPOOL:
        case POOL_EIO:
        case NODELETE:
        case BULK:
        case NOPGCHANGE:
        case NOSIZECHANGE:
        case WRITE_FADVISE_DONTNEED:
        case NOSCRUB:
        case NODEEP_SCRUB:
          for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
            if (i->second == *it)
              break;
          }
          ceph_assert(i != ALL_CHOICES.end());
          ss << i->first << ": " <<
            (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
             "true" : "false") << "\n";
          break;
        case MIN_WRITE_RECENCY_FOR_PROMOTE:
          ss << "min_write_recency_for_promote: " <<
            p->min_write_recency_for_promote << "\n";
          break;
        case FAST_READ:
          ss << "fast_read: " << p->fast_read << "\n";
          break;
        // pool_opts-backed values: printed only when actually set
        case SCRUB_MIN_INTERVAL:
        case SCRUB_MAX_INTERVAL:
        case DEEP_SCRUB_INTERVAL:
        case RECOVERY_PRIORITY:
        case RECOVERY_OP_PRIORITY:
        case SCRUB_PRIORITY:
        case COMPRESSION_MODE:
        case COMPRESSION_ALGORITHM:
        case COMPRESSION_REQUIRED_RATIO:
        case COMPRESSION_MAX_BLOB_SIZE:
        case COMPRESSION_MIN_BLOB_SIZE:
        case CSUM_TYPE:
        case CSUM_MAX_BLOCK:
        case CSUM_MIN_BLOCK:
        case FINGERPRINT_ALGORITHM:
        case PG_NUM_MIN:
        case PG_NUM_MAX:
        case TARGET_SIZE_BYTES:
        case TARGET_SIZE_RATIO:
        case PG_AUTOSCALE_BIAS:
        case DEDUP_TIER:
        case DEDUP_CHUNK_ALGORITHM:
        case DEDUP_CDC_CHUNK_SIZE:
          for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
            if (i->second == *it)
              break;
          }
          ceph_assert(i != ALL_CHOICES.end());
          {
            pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
            if (p->opts.is_set(key)) {
              // csum_type is stored numerically but shown symbolically
              if(key == pool_opts_t::CSUM_TYPE) {
                int64_t val;
                p->opts.get(key, &val);
                ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
              } else {
                ss << i->first << ": " << p->opts.get(key) << "\n";
              }
            }
          }
          break;
      }
      // flush each line to rdata and reuse ss for the next property
      rdata.append(ss.str());
      ss.str("");
    }
  }
  r = 0;
} else if (prefix == "osd pool get-quota") {
  // Show max-objects / max-bytes quotas for a pool together with current
  // usage pulled from the mgr pool stats.
  string pool_name;
  cmd_getval(cmdmap, "pool", pool_name);

  int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
  if (poolid < 0) {
    ceph_assert(poolid == -ENOENT);
    ss << "unrecognized pool '" << pool_name << "'";
    r = -ENOENT;
    goto reply;
  }
  const pg_pool_t *p = osdmap.get_pg_pool(poolid);
  const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
  if (!pstat) {
    ss << "no stats for pool '" << pool_name << "'";
    r = -ENOENT;
    goto reply;
  }
  const object_stat_sum_t& sum = pstat->stats.sum;
  if (f) {
    f->open_object_section("pool_quotas");
    f->dump_string("pool_name", pool_name);
    f->dump_unsigned("pool_id", poolid);
    f->dump_unsigned("quota_max_objects", p->quota_max_objects);
    f->dump_int("current_num_objects", sum.num_objects);
    f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
    f->dump_int("current_num_bytes", sum.num_bytes);
    f->close_section();
    f->flush(rdata);
  } else {
    stringstream rs;
    rs << "quotas for pool '" << pool_name << "':\n"
       << "  max objects: ";
    // a quota of 0 means "no quota"
    if (p->quota_max_objects == 0)
      rs << "N/A";
    else {
      rs << si_u_t(p->quota_max_objects) << " objects";
      rs << "  (current num objects: " << sum.num_objects << " objects)";
    }
    rs << "\n"
       << "  max bytes  : ";
    if (p->quota_max_bytes == 0)
      rs << "N/A";
    else {
      rs << byte_u_t(p->quota_max_bytes);
      rs << "  (current num bytes: " << sum.num_bytes << " bytes)";
    }
    rdata.append(rs.str());
  }
  rdata.append("\n");
  r = 0;
} else if (prefix == "osd crush rule list" ||
           prefix == "osd crush rule ls") {
  // List the names of all CRUSH rules.
  if (f) {
    f->open_array_section("rules");
    osdmap.crush->list_rules(f.get());
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    osdmap.crush->list_rules(&ss);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush rule ls-by-class") {
  // List the CRUSH rules that reference a given device class.
  string class_name;
  cmd_getval(cmdmap, "class", class_name);
  if (class_name.empty()) {
    ss << "no class specified";
    r = -EINVAL;
    goto reply;
  }
  set<int> rules;
  r = osdmap.crush->get_rules_by_class(class_name, &rules);
  if (r < 0) {
    ss << "failed to get rules by class '" << class_name << "'";
    goto reply;
  }
  if (f) {
    f->open_array_section("rules");
    for (auto &rule: rules) {
      f->dump_string("name", osdmap.crush->get_rule_name(rule));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream rs;
    for (auto &rule: rules) {
      rs << osdmap.crush->get_rule_name(rule) << "\n";
    }
    rdata.append(rs.str());
  }
} else if (prefix == "osd crush rule dump") {
  // Dump one named CRUSH rule, or all rules when no name is given.
  // This command always formats (json-pretty by default), so it builds its
  // own local formatter rather than relying on the outer one.
  string name;
  cmd_getval(cmdmap, "name", name);
  string format;
  cmd_getval(cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  if (name == "") {
    f->open_array_section("rules");
    osdmap.crush->dump_rules(f.get());
    f->close_section();
  } else {
    int ruleno = osdmap.crush->get_rule_id(name);
    if (ruleno < 0) {
      ss << "unknown crush rule '" << name << "'";
      r = ruleno;
      goto reply;
    }
    osdmap.crush->dump_rule(ruleno, f.get());
  }
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush dump") {
  // Dump the full CRUSH map (always formatted; json-pretty by default).
  string format;
  cmd_getval(cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map");
  osdmap.crush->dump(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush show-tunables") {
  // Dump the current CRUSH tunables (always formatted).
  string format;
  cmd_getval(cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map_tunables");
  osdmap.crush->dump_tunables(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush tree") {
  // Print the CRUSH hierarchy; "--show-shadow" also includes the
  // per-device-class shadow trees.  Accepts both the modern bool flag and
  // the legacy literal "--show-shadow" argument.
  bool show_shadow = false;
  if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
    std::string shadow;
    if (cmd_getval(cmdmap, "shadow", shadow) &&
        shadow == "--show-shadow") {
      show_shadow = true;
    }
  }
  boost::scoped_ptr<Formatter> f(Formatter::create(format));
  if (f) {
    f->open_object_section("crush_tree");
    osdmap.crush->dump_tree(nullptr,
                            f.get(),
                            osdmap.get_pool_names(),
                            show_shadow);
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    osdmap.crush->dump_tree(&ss,
                            nullptr,
                            osdmap.get_pool_names(),
                            show_shadow);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush ls") {
  // List the direct children of a CRUSH node (or the node itself for an
  // OSD, which has a non-negative id).
  string name;
  if (!cmd_getval(cmdmap, "node", name)) {
    ss << "no node specified";
    r = -EINVAL;
    goto reply;
  }
  if (!osdmap.crush->name_exists(name)) {
    ss << "node '" << name << "' does not exist";
    r = -ENOENT;
    goto reply;
  }
  int id = osdmap.crush->get_item_id(name);
  list<int> result;
  if (id >= 0) {
    // a device (osd): just itself
    result.push_back(id);
  } else {
    // a bucket: its immediate items
    int num = osdmap.crush->get_bucket_size(id);
    for (int i = 0; i < num; ++i) {
      result.push_back(osdmap.crush->get_bucket_item(id, i));
    }
  }
  if (f) {
    f->open_array_section("items");
    for (auto i : result) {
      f->dump_string("item", osdmap.crush->get_item_name(i));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    for (auto i : result) {
      ss << osdmap.crush->get_item_name(i) << "\n";
    }
    rdata.append(ss.str());
  }
  r = 0;
} else if (prefix == "osd crush class ls") {
  // List all known CRUSH device class names (always formatted).
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_array_section("crush_classes");
  for (auto i : osdmap.crush->class_name)
    f->dump_string("class", i.second);
  f->close_section();
  f->flush(rdata);
} else if (prefix == "osd crush class ls-osd") {
  // List the OSD ids belonging to a given device class.
  string name;
  cmd_getval(cmdmap, "class", name);
  set<int> osds;
  osdmap.crush->get_devices_by_class(name, &osds);
  if (f) {
    f->open_array_section("osds");
    for (auto &osd: osds)
      f->dump_int("osd", osd);
    f->close_section();
    f->flush(rdata);
  } else {
    // one id per line, no trailing newline
    bool first = true;
    for (auto &osd : osds) {
      if (!first)
        ds << "\n";
      first = false;
      ds << osd;
    }
    rdata.append(ds);
  }
} else if (prefix == "osd crush get-device-class") {
  // Report the device class of each listed OSD id ("" when unset).
  vector<string> idvec;
  cmd_getval(cmdmap, "ids", idvec);
  map<int, string> class_by_osd;
  for (auto& id : idvec) {
    ostringstream ts;
    long osd = parse_osd_id(id.c_str(), &ts);
    if (osd < 0) {
      ss << "unable to parse osd id:'" << id << "'";
      r = -EINVAL;
      goto reply;
    }
    auto device_class = osdmap.crush->get_item_class(osd);
    if (device_class)
      class_by_osd[osd] = device_class;
    else
      class_by_osd[osd] = ""; // no class
  }
  if (f) {
    f->open_array_section("osd_device_classes");
    for (auto& i : class_by_osd) {
      f->open_object_section("osd_device_class");
      f->dump_int("osd", i.first);
      f->dump_string("device_class", i.second);
      f->close_section();
    }
    f->close_section();
    f->flush(rdata);
  } else {
    if (class_by_osd.size() == 1) {
      // for single input, make a clean output
      ds << class_by_osd.begin()->second;
    } else {
      // note that we do not group osds by class here
      for (auto it = class_by_osd.begin();
           it != class_by_osd.end();
           it++) {
        ds << "osd." << it->first << ' ' << it->second;
        if (next(it) != class_by_osd.end())
          ds << '\n';
      }
    }
    rdata.append(ds);
  }
} else if (prefix == "osd erasure-code-profile ls") {
  // List the names of all erasure-code profiles in the osdmap.
  const auto &profiles = osdmap.get_erasure_code_profiles();
  if (f)
    f->open_array_section("erasure-code-profiles");
  for (auto i = profiles.begin(); i != profiles.end(); ++i) {
    if (f)
      f->dump_string("profile", i->first.c_str());
    else
      rdata.append(i->first + "\n");
  }
  if (f) {
    f->close_section();
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  }
6900 } else if (prefix == "osd crush weight-set ls") {
6901 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6902 if (f) {
6903 f->open_array_section("weight_sets");
6904 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6905 f->dump_string("pool", "(compat)");
6906 }
6907 for (auto& i : osdmap.crush->choose_args) {
6908 if (i.first >= 0) {
6909 f->dump_string("pool", osdmap.get_pool_name(i.first));
6910 }
6911 }
6912 f->close_section();
6913 f->flush(rdata);
6914 } else {
6915 ostringstream rs;
6916 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6917 rs << "(compat)\n";
6918 }
6919 for (auto& i : osdmap.crush->choose_args) {
6920 if (i.first >= 0) {
6921 rs << osdmap.get_pool_name(i.first) << "\n";
6922 }
6923 }
6924 rdata.append(rs.str());
6925 }
6926 } else if (prefix == "osd crush weight-set dump") {
6927 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6928 "json-pretty"));
6929 osdmap.crush->dump_choose_args(f.get());
6930 f->flush(rdata);
6931 } else if (prefix == "osd erasure-code-profile get") {
6932 string name;
6933 cmd_getval(cmdmap, "name", name);
6934 if (!osdmap.has_erasure_code_profile(name)) {
6935 ss << "unknown erasure code profile '" << name << "'";
6936 r = -ENOENT;
6937 goto reply;
6938 }
6939 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6940 if (f)
6941 f->open_object_section("profile");
6942 for (map<string,string>::const_iterator i = profile.begin();
6943 i != profile.end();
6944 ++i) {
6945 if (f)
6946 f->dump_string(i->first.c_str(), i->second.c_str());
6947 else
6948 rdata.append(i->first + "=" + i->second + "\n");
6949 }
6950 if (f) {
6951 f->close_section();
6952 ostringstream rs;
6953 f->flush(rs);
6954 rs << "\n";
6955 rdata.append(rs.str());
6956 }
6957 } else if (prefix == "osd pool application get") {
6958 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6959 "json-pretty"));
6960 string pool_name;
6961 cmd_getval(cmdmap, "pool", pool_name);
6962 string app;
6963 cmd_getval(cmdmap, "app", app);
6964 string key;
6965 cmd_getval(cmdmap, "key", key);
6966
6967 if (pool_name.empty()) {
6968 // all
6969 f->open_object_section("pools");
6970 for (const auto &pool : osdmap.pools) {
6971 std::string name("<unknown>");
6972 const auto &pni = osdmap.pool_name.find(pool.first);
6973 if (pni != osdmap.pool_name.end())
6974 name = pni->second;
6975 f->open_object_section(name.c_str());
6976 for (auto &app_pair : pool.second.application_metadata) {
6977 f->open_object_section(app_pair.first.c_str());
6978 for (auto &kv_pair : app_pair.second) {
6979 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6980 }
6981 f->close_section();
6982 }
6983 f->close_section(); // name
6984 }
6985 f->close_section(); // pools
6986 f->flush(rdata);
6987 } else {
6988 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6989 if (pool < 0) {
6990 ss << "unrecognized pool '" << pool_name << "'";
6991 r = -ENOENT;
6992 goto reply;
6993 }
6994 auto p = osdmap.get_pg_pool(pool);
6995 // filter by pool
6996 if (app.empty()) {
6997 f->open_object_section(pool_name.c_str());
6998 for (auto &app_pair : p->application_metadata) {
6999 f->open_object_section(app_pair.first.c_str());
7000 for (auto &kv_pair : app_pair.second) {
7001 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7002 }
7003 f->close_section(); // application
7004 }
7005 f->close_section(); // pool_name
7006 f->flush(rdata);
7007 goto reply;
7008 }
7009
7010 auto app_it = p->application_metadata.find(app);
7011 if (app_it == p->application_metadata.end()) {
7012 ss << "pool '" << pool_name << "' has no application '" << app << "'";
7013 r = -ENOENT;
7014 goto reply;
7015 }
7016 // filter by pool + app
7017 if (key.empty()) {
7018 f->open_object_section(app_it->first.c_str());
7019 for (auto &kv_pair : app_it->second) {
7020 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7021 }
7022 f->close_section(); // application
7023 f->flush(rdata);
7024 goto reply;
7025 }
7026 // filter by pool + app + key
7027 auto key_it = app_it->second.find(key);
7028 if (key_it == app_it->second.end()) {
7029 ss << "application '" << app << "' on pool '" << pool_name
7030 << "' does not have key '" << key << "'";
7031 r = -ENOENT;
7032 goto reply;
7033 }
7034 ss << key_it->second << "\n";
7035 rdata.append(ss.str());
7036 ss.str("");
7037 }
7038 } else if (prefix == "osd get-require-min-compat-client") {
7039 ss << osdmap.require_min_compat_client << std::endl;
7040 rdata.append(ss.str());
7041 ss.str("");
7042 goto reply;
7043 } else if (prefix == "osd pool application enable" ||
7044 prefix == "osd pool application disable" ||
7045 prefix == "osd pool application set" ||
7046 prefix == "osd pool application rm") {
7047 bool changed = false;
7048 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7049 if (r != 0) {
7050 // Error, reply.
7051 goto reply;
7052 } else if (changed) {
7053 // Valid mutation, proceed to prepare phase
7054 return false;
7055 } else {
7056 // Idempotent case, reply
7057 goto reply;
7058 }
7059 } else {
7060 // try prepare update
7061 return false;
7062 }
7063
7064 reply:
7065 string rs;
7066 getline(ss, rs);
7067 mon.reply_command(op, r, rs, rdata, get_last_committed());
7068 return true;
7069 }
7070
7071 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7072 {
7073 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7074 osdmap.get_pg_pool(pool_id));
7075 ceph_assert(pool);
7076 pool->set_flag(flags);
7077 }
7078
7079 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7080 {
7081 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7082 osdmap.get_pg_pool(pool_id));
7083 ceph_assert(pool);
7084 pool->unset_flag(flags);
7085 }
7086
7087 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7088 {
7089 char k[80];
7090 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7091 return k;
7092 }
7093
7094 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7095 {
7096 char k[80];
7097 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7098 (unsigned long long)pool, (unsigned long long)snap);
7099 return k;
7100 }
7101
// Build the value recording that snaps [snap, snap+num) of 'pool' were
// purged as of 'epoch', and return the corresponding store key.
//
// The key encodes the *last* snap of the interval so that a forward
// lower_bound() from any snap inside the interval lands on this record
// (see lookup_purged_snap()).
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  encode(snap, *v);        // interval begin (inclusive)
  encode(snap + num, *v);  // interval end (exclusive)
  encode(epoch, *v);       // epoch the purge was recorded at
  return make_purged_snap_key(pool, snap + num - 1);
}
7113
7114
// Look up whether 'snap' in 'pool' is recorded as purged in the mon store.
//
// On success returns 0 and fills [*begin, *end) with the purged interval
// containing 'snap'; returns -ENOENT when no recorded interval covers it.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  // Keys encode the *last* snap of each interval (see
  // make_purged_snap_key_value), so lower_bound from key(pool, snap)
  // lands on the first record whose interval could contain 'snap'.
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  // We may have landed on an unrelated record type in the same prefix.
  if (it->key().find("purged_snap_") != 0) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // ... or on a purged_snap record belonging to a different pool.
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // Decode [begin, end) from the value and verify 'snap' is inside it.
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7164
// Record that snaps [start, end) of 'pool' were purged, coalescing the
// new interval with any already-recorded interval that touches it on
// either side so the store keeps maximal disjoint intervals.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // b == 0 iff an existing interval covers start-1 (adjacent on the left);
  // a == 0 iff an existing interval covers 'end' (adjacent on the right).
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // Bridges two existing intervals: merge all three into one record.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // Extends the earlier interval rightwards; its key changes (keys
    // encode the interval's last snap), so erase the old record first.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // Extends the later interval leftwards; the last snap (and thus the
    // key) is unchanged, so a put simply overwrites the old record.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // No adjacent intervals: store a brand-new record.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7220
// Prune snaps that the OSDs (via the mgr digest) report as purged: move
// them out of the osdmap's removed_snaps_queue by staging them in
// pending_inc.new_purged_snaps, capped at mon_max_snap_prune_per_epoch.
//
// Returns true if anything was staged for pruning this epoch.
bool OSDMonitor::try_prune_purged_snaps()
{
  // Need a readable mgr stat digest to know what the OSDs have purged.
  if (!mon.mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    // Collect the intervals we would prune for this pool.
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7299
7300 bool OSDMonitor::update_pools_status()
7301 {
7302 if (!mon.mgrstatmon()->is_readable())
7303 return false;
7304
7305 bool ret = false;
7306
7307 auto& pools = osdmap.get_pools();
7308 for (auto it = pools.begin(); it != pools.end(); ++it) {
7309 const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
7310 if (!pstat)
7311 continue;
7312 const object_stat_sum_t& sum = pstat->stats.sum;
7313 const pg_pool_t &pool = it->second;
7314 const string& pool_name = osdmap.get_pool_name(it->first);
7315
7316 bool pool_is_full =
7317 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
7318 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
7319
7320 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7321 if (pool_is_full)
7322 continue;
7323
7324 mon.clog->info() << "pool '" << pool_name
7325 << "' no longer out of quota; removing NO_QUOTA flag";
7326 // below we cancel FLAG_FULL too, we'll set it again in
7327 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7328 clear_pool_flags(it->first,
7329 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7330 ret = true;
7331 } else {
7332 if (!pool_is_full)
7333 continue;
7334
7335 if (pool.quota_max_bytes > 0 &&
7336 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
7337 mon.clog->warn() << "pool '" << pool_name << "' is full"
7338 << " (reached quota's max_bytes: "
7339 << byte_u_t(pool.quota_max_bytes) << ")";
7340 }
7341 if (pool.quota_max_objects > 0 &&
7342 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
7343 mon.clog->warn() << "pool '" << pool_name << "' is full"
7344 << " (reached quota's max_objects: "
7345 << pool.quota_max_objects << ")";
7346 }
7347 // set both FLAG_FULL_QUOTA and FLAG_FULL
7348 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7349 // since FLAG_FULL should always take precedence
7350 set_pool_flags(it->first,
7351 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7352 clear_pool_flags(it->first,
7353 pg_pool_t::FLAG_NEARFULL |
7354 pg_pool_t::FLAG_BACKFILLFULL);
7355 ret = true;
7356 }
7357 }
7358 return ret;
7359 }
7360
7361 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7362 {
7363 op->mark_osdmon_event(__func__);
7364 auto m = op->get_req<MPoolOp>();
7365 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7366 MonSession *session = op->get_session();
7367 if (!session)
7368 return -EPERM;
7369 string erasure_code_profile;
7370 stringstream ss;
7371 string rule_name;
7372 bool bulk = false;
7373 int ret = 0;
7374 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7375 0, 0, 0, 0, 0, 0, 0.0,
7376 erasure_code_profile,
7377 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7378 cct->_conf.get_val<bool>("osd_pool_default_crimson"),
7379 &ss);
7380
7381 if (ret < 0) {
7382 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7383 }
7384 return ret;
7385 }
7386
7387 int OSDMonitor::crush_rename_bucket(const string& srcname,
7388 const string& dstname,
7389 ostream *ss)
7390 {
7391 int ret;
7392 //
7393 // Avoid creating a pending crush if it does not already exists and
7394 // the rename would fail.
7395 //
7396 if (!_have_pending_crush()) {
7397 ret = _get_stable_crush().can_rename_bucket(srcname,
7398 dstname,
7399 ss);
7400 if (ret)
7401 return ret;
7402 }
7403
7404 CrushWrapper newcrush = _get_pending_crush();
7405
7406 ret = newcrush.rename_bucket(srcname,
7407 dstname,
7408 ss);
7409 if (ret)
7410 return ret;
7411
7412 pending_inc.crush.clear();
7413 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7414 *ss << "renamed bucket " << srcname << " into " << dstname;
7415 return 0;
7416 }
7417
7418 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7419 {
7420 string replacement = "";
7421
7422 if (plugin == "jerasure_generic" ||
7423 plugin == "jerasure_sse3" ||
7424 plugin == "jerasure_sse4" ||
7425 plugin == "jerasure_neon") {
7426 replacement = "jerasure";
7427 } else if (plugin == "shec_generic" ||
7428 plugin == "shec_sse3" ||
7429 plugin == "shec_sse4" ||
7430 plugin == "shec_neon") {
7431 replacement = "shec";
7432 }
7433
7434 if (replacement != "") {
7435 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7436 << plugin << " that has been deprecated. Please use "
7437 << replacement << " instead." << dendl;
7438 }
7439 }
7440
7441 int OSDMonitor::normalize_profile(const string& profilename,
7442 ErasureCodeProfile &profile,
7443 bool force,
7444 ostream *ss)
7445 {
7446 ErasureCodeInterfaceRef erasure_code;
7447 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7448 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7449 check_legacy_ec_plugin(plugin->second, profilename);
7450 int err = instance.factory(plugin->second,
7451 g_conf().get_val<std::string>("erasure_code_dir"),
7452 profile, &erasure_code, ss);
7453 if (err) {
7454 return err;
7455 }
7456
7457 err = erasure_code->init(profile, ss);
7458 if (err) {
7459 return err;
7460 }
7461
7462 auto it = profile.find("stripe_unit");
7463 if (it != profile.end()) {
7464 string err_str;
7465 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7466 if (!err_str.empty()) {
7467 *ss << "could not parse stripe_unit '" << it->second
7468 << "': " << err_str << std::endl;
7469 return -EINVAL;
7470 }
7471 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7472 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7473 if (chunk_size != stripe_unit) {
7474 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7475 << "alignment. Would be padded to " << chunk_size
7476 << std::endl;
7477 return -EINVAL;
7478 }
7479 if ((stripe_unit % 4096) != 0 && !force) {
7480 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7481 << "use --force to override this check" << std::endl;
7482 return -EINVAL;
7483 }
7484 }
7485 return 0;
7486 }
7487
7488 int OSDMonitor::crush_rule_create_erasure(const string &name,
7489 const string &profile,
7490 int *rule,
7491 ostream *ss)
7492 {
7493 int ruleid = osdmap.crush->get_rule_id(name);
7494 if (ruleid != -ENOENT) {
7495 *rule = ruleid;
7496 return -EEXIST;
7497 }
7498
7499 CrushWrapper newcrush = _get_pending_crush();
7500
7501 ruleid = newcrush.get_rule_id(name);
7502 if (ruleid != -ENOENT) {
7503 *rule = ruleid;
7504 return -EALREADY;
7505 } else {
7506 ErasureCodeInterfaceRef erasure_code;
7507 int err = get_erasure_code(profile, &erasure_code, ss);
7508 if (err) {
7509 *ss << "failed to load plugin using profile " << profile << std::endl;
7510 return err;
7511 }
7512
7513 err = erasure_code->create_rule(name, newcrush, ss);
7514 erasure_code.reset();
7515 if (err < 0)
7516 return err;
7517 *rule = err;
7518 pending_inc.crush.clear();
7519 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7520 return 0;
7521 }
7522 }
7523
7524 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7525 ErasureCodeInterfaceRef *erasure_code,
7526 ostream *ss) const
7527 {
7528 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7529 return -EAGAIN;
7530 ErasureCodeProfile profile =
7531 osdmap.get_erasure_code_profile(erasure_code_profile);
7532 ErasureCodeProfile::const_iterator plugin =
7533 profile.find("plugin");
7534 if (plugin == profile.end()) {
7535 *ss << "cannot determine the erasure code plugin"
7536 << " because there is no 'plugin' entry in the erasure_code_profile "
7537 << profile << std::endl;
7538 return -EINVAL;
7539 }
7540 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7541 auto& instance = ErasureCodePluginRegistry::instance();
7542 return instance.factory(plugin->second,
7543 g_conf().get_val<std::string>("erasure_code_dir"),
7544 profile, erasure_code, ss);
7545 }
7546
7547 int OSDMonitor::check_cluster_features(uint64_t features,
7548 stringstream &ss)
7549 {
7550 stringstream unsupported_ss;
7551 int unsupported_count = 0;
7552 if ((mon.get_quorum_con_features() & features) != features) {
7553 unsupported_ss << "the monitor cluster";
7554 ++unsupported_count;
7555 }
7556
7557 set<int32_t> up_osds;
7558 osdmap.get_up_osds(up_osds);
7559 for (set<int32_t>::iterator it = up_osds.begin();
7560 it != up_osds.end(); ++it) {
7561 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7562 if ((xi.features & features) != features) {
7563 if (unsupported_count > 0)
7564 unsupported_ss << ", ";
7565 unsupported_ss << "osd." << *it;
7566 unsupported_count ++;
7567 }
7568 }
7569
7570 if (unsupported_count > 0) {
7571 ss << "features " << features << " unsupported by: "
7572 << unsupported_ss.str();
7573 return -ENOTSUP;
7574 }
7575
7576 // check pending osd state, too!
7577 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7578 pending_inc.new_xinfo.begin();
7579 p != pending_inc.new_xinfo.end(); ++p) {
7580 const osd_xinfo_t &xi = p->second;
7581 if ((xi.features & features) != features) {
7582 dout(10) << __func__ << " pending osd." << p->first
7583 << " features are insufficient; retry" << dendl;
7584 return -EAGAIN;
7585 }
7586 }
7587
7588 return 0;
7589 }
7590
// Check whether installing 'newcrush' would require features beyond what
// current clients and cluster daemons support.
//
// Builds a scratch OSDMap (deep copy of osdmap + pending_inc with the new
// crush encoded in) and validates it; returns true if acceptable, false
// with an explanation in 'ss'.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // Apply the candidate crush on top of a throwaway copy of the pending
  // incremental so the real pending state is untouched.
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat: the new map must not demand a newer client than
  // require_min_compat_client promises to support.
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat: mons and up OSDs must support the features the new map
  // requires (see check_cluster_features).
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7624
7625 bool OSDMonitor::erasure_code_profile_in_use(
7626 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7627 const string &profile,
7628 ostream *ss)
7629 {
7630 bool found = false;
7631 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7632 p != pools.end();
7633 ++p) {
7634 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7635 *ss << osdmap.pool_name[p->first] << " ";
7636 found = true;
7637 }
7638 }
7639 if (found) {
7640 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7641 }
7642 return found;
7643 }
7644
7645 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7646 map<string,string> *erasure_code_profile_map,
7647 ostream *ss)
7648 {
7649 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7650 get_json_str_map,
7651 *ss,
7652 erasure_code_profile_map,
7653 true);
7654 if (r)
7655 return r;
7656 ceph_assert((*erasure_code_profile_map).count("plugin"));
7657 string default_plugin = (*erasure_code_profile_map)["plugin"];
7658 map<string,string> user_map;
7659 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7660 i != erasure_code_profile.end();
7661 ++i) {
7662 size_t equal = i->find('=');
7663 if (equal == string::npos) {
7664 user_map[*i] = string();
7665 (*erasure_code_profile_map)[*i] = string();
7666 } else {
7667 const string key = i->substr(0, equal);
7668 equal++;
7669 const string value = i->substr(equal);
7670 if (key.find("ruleset-") == 0) {
7671 *ss << "property '" << key << "' is no longer supported; try "
7672 << "'crush-" << key.substr(8) << "' instead";
7673 return -EINVAL;
7674 }
7675 user_map[key] = value;
7676 (*erasure_code_profile_map)[key] = value;
7677 }
7678 }
7679
7680 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7681 (*erasure_code_profile_map) = user_map;
7682
7683 return 0;
7684 }
7685
// Compute 'size' (replica or chunk count) and 'min_size' for a new pool.
//
// @param pool_type pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile profile used to load the EC plugin (EC only)
// @param repl_size requested replica count; 0 means "use defaults"
// @param size [out] resulting pool size
// @param min_size [out] resulting min_size
// @param ss error output
// @return 0 on success, -EINVAL for bad type / stretch-mode violations,
//         or an erasure-code plugin load error
int OSDMonitor::prepare_pool_size(const unsigned pool_type,
				  const string &erasure_code_profile,
				  uint8_t repl_size,
				  unsigned *size, unsigned *min_size,
				  ostream *ss)
{
  int err = 0;
  bool set_min_size = false;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    if (osdmap.stretch_mode_enabled) {
      // In stretch mode the pool size is dictated by configuration; an
      // explicit size that disagrees is an error.
      if (repl_size == 0)
	repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
      if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
	*ss << "prepare_pool_size: we are in stretch mode but size "
	    << repl_size << " does not match!";
	return -EINVAL;
      }
      *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
      set_min_size = true;
    }
    if (repl_size == 0) {
      repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
    }
    *size = repl_size;
    // Stretch mode already fixed min_size above; otherwise derive it
    // from the replica count.
    if (!set_min_size)
      *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      if (osdmap.stretch_mode_enabled) {
	*ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
	return -EINVAL;
      }
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err == 0) {
	// size = k+m; min_size = k plus one coding chunk of slack when
	// the profile has more than one coding chunk.
	*size = erasure_code->get_chunk_count();
	*min_size =
	  erasure_code->get_data_chunk_count() +
	  std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
	assert(*min_size <= *size);
	assert(*min_size >= erasure_code->get_data_chunk_count());
      }
    }
    break;
  default:
    *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7739
// Compute the stripe width for a new pool.
//
// Replicated pools have no stripe width (left untouched).  For erasure
// pools it is data_chunks * chunk_size, where the stripe unit comes from
// the profile's "stripe_unit" (falling back to the
// osd_pool_erasure_code_stripe_unit config option).
//
// @return 0 on success, -EINVAL for an unknown pool type, or an
//         erasure-code plugin load error
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      // A profile-level stripe_unit overrides the config default; it was
      // already validated by normalize_profile(), hence the assert.
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7778
7779 int OSDMonitor::get_replicated_stretch_crush_rule()
7780 {
7781 /* we don't write down the stretch rule anywhere, so
7782 * we have to guess it. How? Look at all the pools
7783 * and count up how many times a given rule is used
7784 * on stretch pools and then return the one with
7785 * the most users!
7786 */
7787 map<int,int> rule_counts;
7788 for (const auto& pooli : osdmap.pools) {
7789 const pg_pool_t& p = pooli.second;
7790 if (p.is_replicated() && p.is_stretch_pool()) {
7791 if (!rule_counts.count(p.crush_rule)) {
7792 rule_counts[p.crush_rule] = 1;
7793 } else {
7794 ++rule_counts[p.crush_rule];
7795 }
7796 }
7797 }
7798
7799 if (rule_counts.empty()) {
7800 return -ENOENT;
7801 }
7802
7803 int most_used_count = 0;
7804 int most_used_rule = -1;
7805 for (auto i : rule_counts) {
7806 if (i.second > most_used_count) {
7807 most_used_rule = i.first;
7808 most_used_count = i.second;
7809 }
7810 }
7811 ceph_assert(most_used_count > 0);
7812 ceph_assert(most_used_rule >= 0);
7813 return most_used_rule;
7814 }
7815
// Resolve (or create) the CRUSH rule for a new pool.
//
// If *crush_rule >= 0 on entry it is only validated for existence.
// Otherwise: for replicated pools, look up rule_name (or pick the
// default / stretch-mode rule when rule_name is empty); for erasure
// pools, create a rule from the erasure code profile if one does not
// already exist.
//
// @return 0 on success, -EAGAIN when a newly created or pending rule
//         still needs to be proposed/committed (caller retries),
//         -ENOENT when no suitable rule exists, -EINVAL for an unknown
//         pool type.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // No stretch rule is recorded anywhere; infer the most-used
	    // one (see get_replicated_stretch_crush_rule).
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// Remap the create result: a freshly created (0) or pending
	// (-EALREADY) rule must be proposed first, so report -EAGAIN;
	// an existing committed rule (-EEXIST) is success.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // An explicit rule id was supplied; just verify it exists.
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7880
7881 int OSDMonitor::get_crush_rule(const string &rule_name,
7882 int *crush_rule,
7883 ostream *ss)
7884 {
7885 int ret;
7886 ret = osdmap.crush->get_rule_id(rule_name);
7887 if (ret != -ENOENT) {
7888 // found it, use it
7889 *crush_rule = ret;
7890 } else {
7891 CrushWrapper newcrush = _get_pending_crush();
7892
7893 ret = newcrush.get_rule_id(rule_name);
7894 if (ret != -ENOENT) {
7895 // found it, wait for it to be proposed
7896 dout(20) << __func__ << ": rule " << rule_name
7897 << " try again" << dendl;
7898 return -EAGAIN;
7899 } else {
7900 // Cannot find it , return error
7901 *ss << "specified rule " << rule_name << " doesn't exist";
7902 return ret;
7903 }
7904 }
7905 return 0;
7906 }
7907
7908 /*
7909 * Get the number of 'in' osds according to the crush_rule,
7910 */
7911 uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule)
7912 {
7913 set<int> out_osds;
7914 set<int> crush_in_osds;
7915 set<int> roots;
7916 CrushWrapper newcrush = _get_pending_crush();
7917 newcrush.find_takes_by_rule(crush_rule, &roots);
7918 for (auto root : roots) {
7919 const char *rootname = newcrush.get_item_name(root);
7920 set<int> crush_all_osds;
7921 newcrush.get_leaves(rootname, &crush_all_osds);
7922 std::set_difference(crush_all_osds.begin(), crush_all_osds.end(),
7923 out_osds.begin(), out_osds.end(),
7924 std::inserter(crush_in_osds, crush_in_osds.end()));
7925 }
7926 return crush_in_osds.size();
7927 }
7928
7929 int OSDMonitor::check_pg_num(int64_t pool,
7930 int pg_num,
7931 int size,
7932 int crush_rule,
7933 ostream *ss)
7934 {
7935 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7936 uint64_t projected = 0;
7937 uint32_t osd_num_by_crush = 0;
7938 set<int64_t> crush_pool_ids;
7939 if (pool < 0) {
7940 // a new pool
7941 projected += pg_num * size;
7942 }
7943
7944 osd_num_by_crush = get_osd_num_by_crush(crush_rule);
7945 osdmap.get_pool_ids_by_rule(crush_rule, &crush_pool_ids);
7946
7947 for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
7948 // Check only for pools affected by crush rule
7949 if (crush_pool_ids.contains(pool_id)) {
7950 if (pool_id == pool) {
7951 // Specified pool, use given pg_num and size values.
7952 projected += pg_num * size;
7953 } else {
7954 // Use pg_num_target for evaluating the projected pg num
7955 projected += pool_info.get_pg_num_target() * pool_info.get_size();
7956 }
7957 }
7958 }
7959 // assume min cluster size 3
7960 osd_num_by_crush = std::max(osd_num_by_crush, 3u);
7961 auto projected_pgs_per_osd = projected / osd_num_by_crush;
7962
7963 if (projected_pgs_per_osd > max_pgs_per_osd) {
7964 if (pool >= 0) {
7965 *ss << "pool id " << pool;
7966 }
7967 *ss << " pg_num " << pg_num
7968 << " size " << size
7969 << " for this pool would result in "
7970 << projected_pgs_per_osd
7971 << " cumulative PGs per OSD (" << projected
7972 << " total PG replicas on " << osd_num_by_crush
7973 << " 'in' root OSDs by crush rule) "
7974 << "which exceeds the mon_max_pg_per_osd "
7975 << "value of " << max_pgs_per_osd;
7976 return -ERANGE;
7977 }
7978 return 0;
7979 }
7980
/**
 * Stage creation of a new pool in pending_inc.
 *
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min min pg_num
 * @param pg_num_max max pg_num
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes expected pool size hint for the autoscaler (0 = unset)
 * @param target_size_ratio expected relative pool size hint (<=0 = unset)
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode autoscale mode, one of on, off, warn
 * @param bulk indicates whether pool should be a bulk pool
 * @param crimson indicates whether pool is a crimson pool
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 string pg_autoscale_mode,
				 bool bulk,
				 bool crimson,
				 ostream *ss)
{
  if (crimson && pg_autoscale_mode.empty()) {
    // default pg_autoscale_mode to off for crimson, we'll error out below if
    // the user tried to actually set pg_autoscale_mode to something other than
    // "off"
    pg_autoscale_mode = "off";
  }

  if (name.length() == 0)
    return -EINVAL;

  // --- fill in pg_num/pgp_num defaults and validate their ranges ---
  if (pg_num == 0) {
    // with the autoscaler "on" start at 1 PG and let it grow the pool;
    // otherwise fall back to the configured default pg count
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
      return mode == "on" ? 1 : pg_num;
    };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }

  if (crimson) {
    /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
     * be static. User must also have specified set-allow-crimson */
    const auto *suffix = " (--crimson specified or osd_pool_default_crimson set)";
    if (pool_type != pg_pool_t::TYPE_REPLICATED) {
      *ss << "crimson-osd only supports replicated pools" << suffix;
      return -EINVAL;
    } else if (pg_autoscale_mode != "off") {
      *ss << "crimson-osd does not support changing pg_num or pgp_num, "
	  << "pg_autoscale_mode must be set to 'off'" << suffix;
      return -EINVAL;
    } else if (!osdmap.get_allow_crimson()) {
      *ss << "set-allow-crimson must be set to create a pool with the "
	  << "crimson flag" << suffix;
      return -EINVAL;
    }
  }

  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve crush rule and size/min_size for the requested pool type ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Sanity-check the crush rule by running a quick mapping simulation in a
  // forked child before committing the pool to it.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(cct, g_conf()->mon_lease);
    dout(10) << __func__ << " crush test_with_fork tester created " << dendl;
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
	     << duration << dendl;
  }
  // Reject the pool if it would push the cluster over mon_max_pg_per_osd.
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the effective fast_read flag (only meaningful for EC pools).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // If a pool with this name is already queued in this proposal, treat the
  // request as already satisfied (idempotent create).
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate a new pool id and populate the pending pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;
  if (crimson) {
    pi->set_flag(pg_pool_t::FLAG_CRIMSON);
    // crimson pools must keep pg_num static (see checks above)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  }

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // In stretch mode, peer across the configured bucket barrier; while
  // degraded only one zone is available, so halve size/min_size.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Default autoscale mode from config; the per-call pg_autoscale_mode
  // argument overrides it further below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Start with at most mon_osd_max_initial_pgs actual PGs; pg_num_target
  // records the requested value and the mgr grows pg_num toward it.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (ratios are stored in micro units, i.e. *1e6).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8271
8272 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8273 {
8274 op->mark_osdmon_event(__func__);
8275 ostringstream ss;
8276 if (pending_inc.new_flags < 0)
8277 pending_inc.new_flags = osdmap.get_flags();
8278 pending_inc.new_flags |= flag;
8279 ss << OSDMap::get_flag_string(flag) << " is set";
8280 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8281 get_last_committed() + 1));
8282 return true;
8283 }
8284
8285 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8286 {
8287 op->mark_osdmon_event(__func__);
8288 ostringstream ss;
8289 if (pending_inc.new_flags < 0)
8290 pending_inc.new_flags = osdmap.get_flags();
8291 pending_inc.new_flags &= ~flag;
8292 ss << OSDMap::get_flag_string(flag) << " is unset";
8293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8294 get_last_committed() + 1));
8295 return true;
8296 }
8297
8298 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8299 stringstream& ss)
8300 {
8301 string poolstr;
8302 cmd_getval(cmdmap, "pool", poolstr);
8303 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8304 if (pool < 0) {
8305 ss << "unrecognized pool '" << poolstr << "'";
8306 return -ENOENT;
8307 }
8308 string var;
8309 cmd_getval(cmdmap, "var", var);
8310
8311 pg_pool_t p = *osdmap.get_pg_pool(pool);
8312 if (pending_inc.new_pools.count(pool))
8313 p = pending_inc.new_pools[pool];
8314
8315 // accept val as a json string in the normal case (current
8316 // generation monitor). parse out int or float values from the
8317 // string as needed. however, if it is not a string, try to pull
8318 // out an int, in case an older monitor with an older json schema is
8319 // forwarding a request.
8320 string val;
8321 string interr, floaterr;
8322 int64_t n = 0;
8323 double f = 0;
8324 int64_t uf = 0; // micro-f
8325 cmd_getval(cmdmap, "val", val);
8326
8327 auto si_options = {
8328 "target_max_objects"
8329 };
8330 auto iec_options = {
8331 "target_max_bytes",
8332 "target_size_bytes",
8333 "compression_max_blob_size",
8334 "compression_min_blob_size",
8335 "csum_max_block",
8336 "csum_min_block",
8337 };
8338 if (count(begin(si_options), end(si_options), var)) {
8339 n = strict_si_cast<int64_t>(val, &interr);
8340 } else if (count(begin(iec_options), end(iec_options), var)) {
8341 n = strict_iec_cast<int64_t>(val, &interr);
8342 } else {
8343 // parse string as both int and float; different fields use different types.
8344 n = strict_strtoll(val.c_str(), 10, &interr);
8345 f = strict_strtod(val.c_str(), &floaterr);
8346 uf = llrintl(f * (double)1000000.0);
8347 }
8348
8349 if (!p.is_tier() &&
8350 (var == "hit_set_type" || var == "hit_set_period" ||
8351 var == "hit_set_count" || var == "hit_set_fpp" ||
8352 var == "target_max_objects" || var == "target_max_bytes" ||
8353 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8354 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8355 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8356 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8357 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8358 return -EACCES;
8359 }
8360
8361 if (var == "size") {
8362 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8363 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8364 return -EPERM;
8365 }
8366 if (p.type == pg_pool_t::TYPE_ERASURE) {
8367 ss << "can not change the size of an erasure-coded pool";
8368 return -ENOTSUP;
8369 }
8370 if (interr.length()) {
8371 ss << "error parsing integer value '" << val << "': " << interr;
8372 return -EINVAL;
8373 }
8374 if (n <= 0 || n > 10) {
8375 ss << "pool size must be between 1 and 10";
8376 return -EINVAL;
8377 }
8378 if (n == 1) {
8379 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8380 ss << "configuring pool size as 1 is disabled by default.";
8381 return -EPERM;
8382 }
8383 bool sure = false;
8384 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8385 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8386 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8387 "pass the flag --yes-i-really-mean-it.";
8388 return -EPERM;
8389 }
8390 }
8391 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8392 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8393 return -EINVAL;
8394 }
8395 if (n > p.size) {
8396 // only when increasing pool size
8397 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8398 if (r < 0) {
8399 return r;
8400 }
8401 }
8402 p.size = n;
8403 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8404 } else if (var == "min_size") {
8405 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8406 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8407 return -EPERM;
8408 }
8409 if (interr.length()) {
8410 ss << "error parsing integer value '" << val << "': " << interr;
8411 return -EINVAL;
8412 }
8413
8414 if (p.type != pg_pool_t::TYPE_ERASURE) {
8415 if (n < 1 || n > p.size) {
8416 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8417 return -EINVAL;
8418 }
8419 } else {
8420 ErasureCodeInterfaceRef erasure_code;
8421 int k;
8422 stringstream tmp;
8423 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8424 if (err == 0) {
8425 k = erasure_code->get_data_chunk_count();
8426 } else {
8427 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8428 return err;
8429 }
8430
8431 if (n < k || n > p.size) {
8432 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8433 return -EINVAL;
8434 }
8435 }
8436 p.min_size = n;
8437 } else if (var == "pg_num_actual") {
8438 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8439 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8440 return -EPERM;
8441 }
8442 if (interr.length()) {
8443 ss << "error parsing integer value '" << val << "': " << interr;
8444 return -EINVAL;
8445 }
8446 if (n == (int)p.get_pg_num()) {
8447 return 0;
8448 }
8449 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8450 ss << "'pg_num' must be greater than 0 and less than or equal to "
8451 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8452 << " (you may adjust 'mon max pool pg num' for higher values)";
8453 return -ERANGE;
8454 }
8455 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8456 ss << "cannot adjust pg_num while initial PGs are being created";
8457 return -EBUSY;
8458 }
8459 if (n > (int)p.get_pg_num()) {
8460 if (p.get_pg_num() != p.get_pg_num_pending()) {
8461 // force pre-nautilus clients to resend their ops, since they
8462 // don't understand pg_num_pending changes form a new interval
8463 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8464 }
8465 p.set_pg_num(n);
8466 } else {
8467 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8468 ss << "nautilus OSDs are required to adjust pg_num_pending";
8469 return -EPERM;
8470 }
8471 if (n < (int)p.get_pgp_num()) {
8472 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8473 return -EINVAL;
8474 }
8475 if (n < (int)p.get_pg_num() - 1) {
8476 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8477 << ") - 1; only single pg decrease is currently supported";
8478 return -EINVAL;
8479 }
8480 p.set_pg_num_pending(n);
8481 // force pre-nautilus clients to resend their ops, since they
8482 // don't understand pg_num_pending changes form a new interval
8483 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8484 }
8485 // force pre-luminous clients to resend their ops, since they
8486 // don't understand that split PGs now form a new interval.
8487 p.last_force_op_resend_preluminous = pending_inc.epoch;
8488 } else if (var == "pg_num") {
8489 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8490 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8491 return -EPERM;
8492 }
8493 if (interr.length()) {
8494 ss << "error parsing integer value '" << val << "': " << interr;
8495 return -EINVAL;
8496 }
8497 if (n == (int)p.get_pg_num_target()) {
8498 return 0;
8499 }
8500 if (n <= 0 || static_cast<uint64_t>(n) >
8501 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8502 ss << "'pg_num' must be greater than 0 and less than or equal to "
8503 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8504 << " (you may adjust 'mon max pool pg num' for higher values)";
8505 return -ERANGE;
8506 }
8507 if (n > (int)p.get_pg_num_target()) {
8508 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8509 if (r) {
8510 return r;
8511 }
8512 bool force = false;
8513 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8514 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8515 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8516 return -EPERM;
8517 }
8518 } else {
8519 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8520 ss << "nautilus OSDs are required to decrease pg_num";
8521 return -EPERM;
8522 }
8523 }
8524 int64_t pg_min = 0, pg_max = 0;
8525 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8526 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8527 if (pg_min && n < pg_min) {
8528 ss << "specified pg_num " << n
8529 << " < pg_num_min " << pg_min;
8530 return -EINVAL;
8531 }
8532 if (pg_max && n > pg_max) {
8533 ss << "specified pg_num " << n
8534 << " < pg_num_max " << pg_max;
8535 return -EINVAL;
8536 }
8537 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8538 // pre-nautilus osdmap format; increase pg_num directly
8539 assert(n > (int)p.get_pg_num());
8540 // force pre-nautilus clients to resend their ops, since they
8541 // don't understand pg_num_target changes form a new interval
8542 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8543 // force pre-luminous clients to resend their ops, since they
8544 // don't understand that split PGs now form a new interval.
8545 p.last_force_op_resend_preluminous = pending_inc.epoch;
8546 p.set_pg_num(n);
8547 } else {
8548 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8549 // make pgp_num track pg_num if it already matches. if it is set
8550 // differently, leave it different and let the user control it
8551 // manually.
8552 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8553 p.set_pgp_num_target(n);
8554 }
8555 p.set_pg_num_target(n);
8556 }
8557 } else if (var == "pgp_num_actual") {
8558 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8559 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8560 return -EPERM;
8561 }
8562 if (interr.length()) {
8563 ss << "error parsing integer value '" << val << "': " << interr;
8564 return -EINVAL;
8565 }
8566 if (n <= 0) {
8567 ss << "specified pgp_num must > 0, but you set to " << n;
8568 return -EINVAL;
8569 }
8570 if (n > (int)p.get_pg_num()) {
8571 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8572 return -EINVAL;
8573 }
8574 if (n > (int)p.get_pg_num_pending()) {
8575 ss << "specified pgp_num " << n
8576 << " > pg_num_pending " << p.get_pg_num_pending();
8577 return -EINVAL;
8578 }
8579 p.set_pgp_num(n);
8580 } else if (var == "pgp_num") {
8581 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8582 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8583 return -EPERM;
8584 }
8585 if (interr.length()) {
8586 ss << "error parsing integer value '" << val << "': " << interr;
8587 return -EINVAL;
8588 }
8589 if (n <= 0) {
8590 ss << "specified pgp_num must > 0, but you set to " << n;
8591 return -EINVAL;
8592 }
8593 if (n > (int)p.get_pg_num_target()) {
8594 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8595 return -EINVAL;
8596 }
8597 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8598 // pre-nautilus osdmap format; increase pgp_num directly
8599 p.set_pgp_num(n);
8600 } else {
8601 p.set_pgp_num_target(n);
8602 }
8603 } else if (var == "pg_autoscale_mode") {
8604 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8605 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8606 ss << "specified invalid mode " << val;
8607 return -EINVAL;
8608 }
8609 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8610 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8611 return -EINVAL;
8612 }
8613 p.pg_autoscale_mode = m;
8614 } else if (var == "crush_rule") {
8615 int id = osdmap.crush->get_rule_id(val);
8616 if (id == -ENOENT) {
8617 ss << "crush rule " << val << " does not exist";
8618 return -ENOENT;
8619 }
8620 if (id < 0) {
8621 ss << cpp_strerror(id);
8622 return -ENOENT;
8623 }
8624 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8625 ss << "crush rule " << id << " type does not match pool";
8626 return -EINVAL;
8627 }
8628 p.crush_rule = id;
8629 } else if (var == "nodelete" || var == "nopgchange" ||
8630 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8631 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8632 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8633 // make sure we only compare against 'n' if we didn't receive a string
8634 if (val == "true" || (interr.empty() && n == 1)) {
8635 p.set_flag(flag);
8636 } else if (val == "false" || (interr.empty() && n == 0)) {
8637 if (flag == pg_pool_t::FLAG_NOPGCHANGE && p.is_crimson()) {
8638 ss << "cannot clear FLAG_NOPGCHANGE on a crimson pool";
8639 return -EINVAL;
8640 }
8641 p.unset_flag(flag);
8642 } else {
8643 ss << "expecting value 'true', 'false', '0', or '1'";
8644 return -EINVAL;
8645 }
8646 } else if (var == "eio") {
8647 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8648
8649 // make sure we only compare against 'n' if we didn't receive a string
8650 if (val == "true" || (interr.empty() && n == 1)) {
8651 p.set_flag(flag);
8652 } else if (val == "false" || (interr.empty() && n == 0)) {
8653 p.unset_flag(flag);
8654 } else {
8655 ss << "expecting value 'true', 'false', '0', or '1'";
8656 return -EINVAL;
8657 }
8658 } else if (var == "hashpspool") {
8659 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8660 bool force = false;
8661 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8662
8663 if (!force) {
8664 ss << "are you SURE? this will remap all placement groups in this pool,"
8665 " this triggers large data movement,"
8666 " pass --yes-i-really-mean-it if you really do.";
8667 return -EPERM;
8668 }
8669 // make sure we only compare against 'n' if we didn't receive a string
8670 if (val == "true" || (interr.empty() && n == 1)) {
8671 p.set_flag(flag);
8672 } else if (val == "false" || (interr.empty() && n == 0)) {
8673 p.unset_flag(flag);
8674 } else {
8675 ss << "expecting value 'true', 'false', '0', or '1'";
8676 return -EINVAL;
8677 }
8678 } else if (var == "hit_set_type") {
8679 if (val == "none")
8680 p.hit_set_params = HitSet::Params();
8681 else {
8682 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8683 if (err)
8684 return err;
8685 if (val == "bloom") {
8686 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8687 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8688 p.hit_set_params = HitSet::Params(bsp);
8689 } else if (val == "explicit_hash")
8690 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8691 else if (val == "explicit_object")
8692 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8693 else {
8694 ss << "unrecognized hit_set type '" << val << "'";
8695 return -EINVAL;
8696 }
8697 }
8698 } else if (var == "hit_set_period") {
8699 if (interr.length()) {
8700 ss << "error parsing integer value '" << val << "': " << interr;
8701 return -EINVAL;
8702 } else if (n < 0) {
8703 ss << "hit_set_period should be non-negative";
8704 return -EINVAL;
8705 }
8706 p.hit_set_period = n;
8707 } else if (var == "hit_set_count") {
8708 if (interr.length()) {
8709 ss << "error parsing integer value '" << val << "': " << interr;
8710 return -EINVAL;
8711 } else if (n < 0) {
8712 ss << "hit_set_count should be non-negative";
8713 return -EINVAL;
8714 }
8715 p.hit_set_count = n;
8716 } else if (var == "hit_set_fpp") {
8717 if (floaterr.length()) {
8718 ss << "error parsing floating point value '" << val << "': " << floaterr;
8719 return -EINVAL;
8720 } else if (f < 0 || f > 1.0) {
8721 ss << "hit_set_fpp should be in the range 0..1";
8722 return -EINVAL;
8723 }
8724 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8725 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8726 return -EINVAL;
8727 }
8728 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8729 bloomp->set_fpp(f);
8730 } else if (var == "use_gmt_hitset") {
8731 if (val == "true" || (interr.empty() && n == 1)) {
8732 p.use_gmt_hitset = true;
8733 } else {
8734 ss << "expecting value 'true' or '1'";
8735 return -EINVAL;
8736 }
8737 } else if (var == "allow_ec_overwrites") {
8738 if (!p.is_erasure()) {
8739 ss << "ec overwrites can only be enabled for an erasure coded pool";
8740 return -EINVAL;
8741 }
8742 stringstream err;
8743 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8744 !is_pool_currently_all_bluestore(pool, p, &err)) {
8745 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8746 return -EINVAL;
8747 }
8748 if (val == "true" || (interr.empty() && n == 1)) {
8749 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8750 } else if (val == "false" || (interr.empty() && n == 0)) {
8751 ss << "ec overwrites cannot be disabled once enabled";
8752 return -EINVAL;
8753 } else {
8754 ss << "expecting value 'true', 'false', '0', or '1'";
8755 return -EINVAL;
8756 }
8757 } else if (var == "target_max_objects") {
8758 if (interr.length()) {
8759 ss << "error parsing int '" << val << "': " << interr;
8760 return -EINVAL;
8761 }
8762 p.target_max_objects = n;
8763 } else if (var == "target_max_bytes") {
8764 if (interr.length()) {
8765 ss << "error parsing int '" << val << "': " << interr;
8766 return -EINVAL;
8767 }
8768 p.target_max_bytes = n;
8769 } else if (var == "cache_target_dirty_ratio") {
8770 if (floaterr.length()) {
8771 ss << "error parsing float '" << val << "': " << floaterr;
8772 return -EINVAL;
8773 }
8774 if (f < 0 || f > 1.0) {
8775 ss << "value must be in the range 0..1";
8776 return -ERANGE;
8777 }
8778 p.cache_target_dirty_ratio_micro = uf;
8779 } else if (var == "cache_target_dirty_high_ratio") {
8780 if (floaterr.length()) {
8781 ss << "error parsing float '" << val << "': " << floaterr;
8782 return -EINVAL;
8783 }
8784 if (f < 0 || f > 1.0) {
8785 ss << "value must be in the range 0..1";
8786 return -ERANGE;
8787 }
8788 p.cache_target_dirty_high_ratio_micro = uf;
8789 } else if (var == "cache_target_full_ratio") {
8790 if (floaterr.length()) {
8791 ss << "error parsing float '" << val << "': " << floaterr;
8792 return -EINVAL;
8793 }
8794 if (f < 0 || f > 1.0) {
8795 ss << "value must be in the range 0..1";
8796 return -ERANGE;
8797 }
8798 p.cache_target_full_ratio_micro = uf;
8799 } else if (var == "cache_min_flush_age") {
8800 if (interr.length()) {
8801 ss << "error parsing int '" << val << "': " << interr;
8802 return -EINVAL;
8803 }
8804 p.cache_min_flush_age = n;
8805 } else if (var == "cache_min_evict_age") {
8806 if (interr.length()) {
8807 ss << "error parsing int '" << val << "': " << interr;
8808 return -EINVAL;
8809 }
8810 p.cache_min_evict_age = n;
8811 } else if (var == "min_read_recency_for_promote") {
8812 if (interr.length()) {
8813 ss << "error parsing integer value '" << val << "': " << interr;
8814 return -EINVAL;
8815 }
8816 p.min_read_recency_for_promote = n;
8817 } else if (var == "hit_set_grade_decay_rate") {
8818 if (interr.length()) {
8819 ss << "error parsing integer value '" << val << "': " << interr;
8820 return -EINVAL;
8821 }
8822 if (n > 100 || n < 0) {
8823 ss << "value out of range,valid range is 0 - 100";
8824 return -EINVAL;
8825 }
8826 p.hit_set_grade_decay_rate = n;
8827 } else if (var == "hit_set_search_last_n") {
8828 if (interr.length()) {
8829 ss << "error parsing integer value '" << val << "': " << interr;
8830 return -EINVAL;
8831 }
8832 if (n > p.hit_set_count || n < 0) {
8833 ss << "value out of range,valid range is 0 - hit_set_count";
8834 return -EINVAL;
8835 }
8836 p.hit_set_search_last_n = n;
8837 } else if (var == "min_write_recency_for_promote") {
8838 if (interr.length()) {
8839 ss << "error parsing integer value '" << val << "': " << interr;
8840 return -EINVAL;
8841 }
8842 p.min_write_recency_for_promote = n;
8843 } else if (var == "fast_read") {
8844 if (p.is_replicated()) {
8845 ss << "fast read is not supported in replication pool";
8846 return -EINVAL;
8847 }
8848 if (val == "true" || (interr.empty() && n == 1)) {
8849 p.fast_read = true;
8850 } else if (val == "false" || (interr.empty() && n == 0)) {
8851 p.fast_read = false;
8852 } else {
8853 ss << "expecting value 'true', 'false', '0', or '1'";
8854 return -EINVAL;
8855 }
8856 } else if (pool_opts_t::is_opt_name(var)) {
8857 bool unset = val == "unset";
8858 if (var == "compression_mode") {
8859 if (!unset) {
8860 auto cmode = Compressor::get_comp_mode_type(val);
8861 if (!cmode) {
8862 ss << "unrecognized compression mode '" << val << "'";
8863 return -EINVAL;
8864 }
8865 }
8866 } else if (var == "compression_algorithm") {
8867 if (!unset) {
8868 auto alg = Compressor::get_comp_alg_type(val);
8869 if (!alg) {
8870 ss << "unrecognized compression_algorithm '" << val << "'";
8871 return -EINVAL;
8872 }
8873 }
8874 } else if (var == "compression_required_ratio") {
8875 if (floaterr.length()) {
8876 ss << "error parsing float value '" << val << "': " << floaterr;
8877 return -EINVAL;
8878 }
8879 if (f < 0 || f > 1) {
8880 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8881 return -EINVAL;
8882 }
8883 } else if (var == "csum_type") {
8884 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8885 if (t < 0 ) {
8886 ss << "unrecognized csum_type '" << val << "'";
8887 return -EINVAL;
8888 }
8889 //preserve csum_type numeric value
8890 n = t;
8891 interr.clear();
8892 } else if (var == "compression_max_blob_size" ||
8893 var == "compression_min_blob_size" ||
8894 var == "csum_max_block" ||
8895 var == "csum_min_block") {
8896 if (interr.length()) {
8897 ss << "error parsing int value '" << val << "': " << interr;
8898 return -EINVAL;
8899 }
8900 } else if (var == "fingerprint_algorithm") {
8901 if (!unset) {
8902 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8903 if (!alg) {
8904 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8905 return -EINVAL;
8906 }
8907 }
8908 } else if (var == "target_size_bytes") {
8909 if (interr.length()) {
8910 ss << "error parsing unit value '" << val << "': " << interr;
8911 return -EINVAL;
8912 }
8913 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8914 ss << "must set require_osd_release to nautilus or "
8915 << "later before setting target_size_bytes";
8916 return -EINVAL;
8917 }
8918 } else if (var == "target_size_ratio") {
8919 if (f < 0.0) {
8920 ss << "target_size_ratio cannot be negative";
8921 return -EINVAL;
8922 }
8923 } else if (var == "pg_num_min") {
8924 if (interr.length()) {
8925 ss << "error parsing int value '" << val << "': " << interr;
8926 return -EINVAL;
8927 }
8928 if (n > (int)p.get_pg_num_target()) {
8929 ss << "specified pg_num_min " << n
8930 << " > pg_num " << p.get_pg_num_target();
8931 return -EINVAL;
8932 }
8933 } else if (var == "pg_num_max") {
8934 if (interr.length()) {
8935 ss << "error parsing int value '" << val << "': " << interr;
8936 return -EINVAL;
8937 }
8938 if (n && n < (int)p.get_pg_num_target()) {
8939 ss << "specified pg_num_max " << n
8940 << " < pg_num " << p.get_pg_num_target();
8941 return -EINVAL;
8942 }
8943 } else if (var == "recovery_priority") {
8944 if (interr.length()) {
8945 ss << "error parsing int value '" << val << "': " << interr;
8946 return -EINVAL;
8947 }
8948 if (!g_conf()->debug_allow_any_pool_priority) {
8949 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8950 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8951 << " and " << OSD_POOL_PRIORITY_MAX;
8952 return -EINVAL;
8953 }
8954 }
8955 } else if (var == "pg_autoscale_bias") {
8956 if (f < 0.0 || f > 1000.0) {
8957 ss << "pg_autoscale_bias must be between 0 and 1000";
8958 return -EINVAL;
8959 }
8960 } else if (var == "dedup_tier") {
8961 if (interr.empty()) {
8962 ss << "expecting value 'pool name'";
8963 return -EINVAL;
8964 }
8965 // Current base tier in dedup does not support ec pool
8966 if (p.is_erasure()) {
8967 ss << "pool '" << poolstr
8968 << "' is an ec pool, which cannot be a base tier";
8969 return -ENOTSUP;
8970 }
8971 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8972 if (lowtierpool_id < 0) {
8973 ss << "unrecognized pool '" << val << "'";
8974 return -ENOENT;
8975 }
8976 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8977 ceph_assert(tp);
8978 n = lowtierpool_id;
8979 // The original input is string (pool name), but we convert it to int64_t.
8980 // So, clear interr
8981 interr.clear();
8982 } else if (var == "dedup_chunk_algorithm") {
8983 if (!unset) {
8984 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8985 if (!alg) {
8986 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8987 return -EINVAL;
8988 }
8989 }
8990 } else if (var == "dedup_cdc_chunk_size") {
8991 if (interr.length()) {
8992 ss << "error parsing int value '" << val << "': " << interr;
8993 return -EINVAL;
8994 }
8995 }
8996
8997 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8998 switch (desc.type) {
8999 case pool_opts_t::STR:
9000 if (unset) {
9001 p.opts.unset(desc.key);
9002 } else {
9003 p.opts.set(desc.key, static_cast<std::string>(val));
9004 }
9005 break;
9006 case pool_opts_t::INT:
9007 if (interr.length()) {
9008 ss << "error parsing integer value '" << val << "': " << interr;
9009 return -EINVAL;
9010 }
9011 if (n == 0) {
9012 p.opts.unset(desc.key);
9013 } else {
9014 p.opts.set(desc.key, static_cast<int64_t>(n));
9015 }
9016 break;
9017 case pool_opts_t::DOUBLE:
9018 if (floaterr.length()) {
9019 ss << "error parsing floating point value '" << val << "': " << floaterr;
9020 return -EINVAL;
9021 }
9022 if (f == 0) {
9023 p.opts.unset(desc.key);
9024 } else {
9025 p.opts.set(desc.key, static_cast<double>(f));
9026 }
9027 break;
9028 default:
9029 ceph_assert(!"unknown type");
9030 }
9031 } else {
9032 ss << "unrecognized variable '" << var << "'";
9033 return -EINVAL;
9034 }
9035 if (val != "unset") {
9036 ss << "set pool " << pool << " " << var << " to " << val;
9037 } else {
9038 ss << "unset pool " << pool << " " << var;
9039 }
9040 p.last_change = pending_inc.epoch;
9041 pending_inc.new_pools[pool] = p;
9042 return 0;
9043 }
9044
9045 int OSDMonitor::prepare_command_pool_application(const string &prefix,
9046 const cmdmap_t& cmdmap,
9047 stringstream& ss)
9048 {
9049 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
9050 }
9051
9052 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
9053 const cmdmap_t& cmdmap,
9054 stringstream& ss,
9055 bool *modified)
9056 {
9057 return _command_pool_application(prefix, cmdmap, ss, modified, false);
9058 }
9059
9060
9061 /**
9062 * Common logic for preprocess and prepare phases of pool application
9063 * tag commands. In preprocess mode we're only detecting invalid
9064 * commands, and determining whether it was a modification or a no-op.
9065 * In prepare mode we're actually updating the pending state.
9066 */
9067 int OSDMonitor::_command_pool_application(const string &prefix,
9068 const cmdmap_t& cmdmap,
9069 stringstream& ss,
9070 bool *modified,
9071 bool preparing)
9072 {
9073 string pool_name;
9074 cmd_getval(cmdmap, "pool", pool_name);
9075 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
9076 if (pool < 0) {
9077 ss << "unrecognized pool '" << pool_name << "'";
9078 return -ENOENT;
9079 }
9080
9081 pg_pool_t p = *osdmap.get_pg_pool(pool);
9082 if (preparing) {
9083 if (pending_inc.new_pools.count(pool)) {
9084 p = pending_inc.new_pools[pool];
9085 }
9086 }
9087
9088 string app;
9089 cmd_getval(cmdmap, "app", app);
9090 bool app_exists = (p.application_metadata.count(app) > 0);
9091
9092 string key;
9093 cmd_getval(cmdmap, "key", key);
9094 if (key == "all") {
9095 ss << "key cannot be 'all'";
9096 return -EINVAL;
9097 }
9098
9099 string value;
9100 cmd_getval(cmdmap, "value", value);
9101 if (value == "all") {
9102 ss << "value cannot be 'all'";
9103 return -EINVAL;
9104 }
9105
9106 if (boost::algorithm::ends_with(prefix, "enable")) {
9107 if (app.empty()) {
9108 ss << "application name must be provided";
9109 return -EINVAL;
9110 }
9111
9112 if (p.is_tier()) {
9113 ss << "application must be enabled on base tier";
9114 return -EINVAL;
9115 }
9116
9117 bool force = false;
9118 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9119
9120 if (!app_exists && !p.application_metadata.empty() && !force) {
9121 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9122 << "application; pass --yes-i-really-mean-it to proceed anyway";
9123 return -EPERM;
9124 }
9125
9126 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9127 ss << "too many enabled applications on pool '" << pool_name << "'; "
9128 << "max " << MAX_POOL_APPLICATIONS;
9129 return -EINVAL;
9130 }
9131
9132 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9133 ss << "application name '" << app << "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH;
9135 return -EINVAL;
9136 }
9137
9138 if (!app_exists) {
9139 p.application_metadata[app] = {};
9140 }
9141 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9142
9143 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9144 bool force = false;
9145 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9146
9147 if (!force) {
9148 ss << "Are you SURE? Disabling an application within a pool might result "
9149 << "in loss of application functionality; pass "
9150 << "--yes-i-really-mean-it to proceed anyway";
9151 return -EPERM;
9152 }
9153
9154 if (!app_exists) {
9155 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9156 << "'";
9157 return 0; // idempotent
9158 }
9159
9160 p.application_metadata.erase(app);
9161 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9162
9163 } else if (boost::algorithm::ends_with(prefix, "set")) {
9164 if (p.is_tier()) {
9165 ss << "application metadata must be set on base tier";
9166 return -EINVAL;
9167 }
9168
9169 if (!app_exists) {
9170 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9171 << "'";
9172 return -ENOENT;
9173 }
9174
9175 string key;
9176 cmd_getval(cmdmap, "key", key);
9177
9178 if (key.empty()) {
9179 ss << "key must be provided";
9180 return -EINVAL;
9181 }
9182
9183 auto &app_keys = p.application_metadata[app];
9184 if (app_keys.count(key) == 0 &&
9185 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9186 ss << "too many keys set for application '" << app << "' on pool '"
9187 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9188 return -EINVAL;
9189 }
9190
9191 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9192 ss << "key '" << app << "' too long; max length "
9193 << MAX_POOL_APPLICATION_LENGTH;
9194 return -EINVAL;
9195 }
9196
9197 string value;
9198 cmd_getval(cmdmap, "value", value);
9199 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9200 ss << "value '" << value << "' too long; max length "
9201 << MAX_POOL_APPLICATION_LENGTH;
9202 return -EINVAL;
9203 }
9204
9205 p.application_metadata[app][key] = value;
9206 ss << "set application '" << app << "' key '" << key << "' to '"
9207 << value << "' on pool '" << pool_name << "'";
9208 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9209 if (!app_exists) {
9210 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9211 << "'";
9212 return -ENOENT;
9213 }
9214
9215 string key;
9216 cmd_getval(cmdmap, "key", key);
9217 auto it = p.application_metadata[app].find(key);
9218 if (it == p.application_metadata[app].end()) {
9219 ss << "application '" << app << "' on pool '" << pool_name
9220 << "' does not have key '" << key << "'";
9221 return 0; // idempotent
9222 }
9223
9224 p.application_metadata[app].erase(it);
9225 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9226 << pool_name << "'";
9227 } else {
9228 ceph_abort();
9229 }
9230
9231 if (preparing) {
9232 p.last_change = pending_inc.epoch;
9233 pending_inc.new_pools[pool] = p;
9234 }
9235
9236 // Because we fell through this far, we didn't hit no-op cases,
9237 // so pool was definitely modified
9238 if (modified != nullptr) {
9239 *modified = true;
9240 }
9241
9242 return 0;
9243 }
9244
9245 int OSDMonitor::_prepare_command_osd_crush_remove(
9246 CrushWrapper &newcrush,
9247 int32_t id,
9248 int32_t ancestor,
9249 bool has_ancestor,
9250 bool unlink_only)
9251 {
9252 int err = 0;
9253
9254 if (has_ancestor) {
9255 err = newcrush.remove_item_under(cct, id, ancestor,
9256 unlink_only);
9257 } else {
9258 err = newcrush.remove_item(cct, id, unlink_only);
9259 }
9260 return err;
9261 }
9262
// Stage an (already modified) crush map: re-encode it into the pending
// incremental, replacing any previously staged crush blob, so it is
// committed with the next proposal.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
}
9268
9269 int OSDMonitor::prepare_command_osd_crush_remove(
9270 CrushWrapper &newcrush,
9271 int32_t id,
9272 int32_t ancestor,
9273 bool has_ancestor,
9274 bool unlink_only)
9275 {
9276 int err = _prepare_command_osd_crush_remove(
9277 newcrush, id, ancestor,
9278 has_ancestor, unlink_only);
9279
9280 if (err < 0)
9281 return err;
9282
9283 ceph_assert(err == 0);
9284 do_osd_crush_remove(newcrush);
9285
9286 return 0;
9287 }
9288
9289 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9290 {
9291 if (osdmap.is_up(id)) {
9292 return -EBUSY;
9293 }
9294
9295 pending_inc.new_state[id] = osdmap.get_state(id);
9296 pending_inc.new_uuid[id] = uuid_d();
9297 pending_metadata_rm.insert(id);
9298 pending_metadata.erase(id);
9299
9300 return 0;
9301 }
9302
9303 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9304 {
9305 ceph_assert(existing_id);
9306 *existing_id = -1;
9307
9308 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9309 if (!osdmap.exists(i) &&
9310 pending_inc.new_up_client.count(i) == 0 &&
9311 (pending_inc.new_state.count(i) == 0 ||
9312 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9313 *existing_id = i;
9314 return -1;
9315 }
9316 }
9317
9318 if (pending_inc.new_max_osd < 0) {
9319 return osdmap.get_max_osd();
9320 }
9321 return pending_inc.new_max_osd;
9322 }
9323
// Create (or look up) an osd for the given id/uuid, staging the result in
// pending_inc; optionally place the osd in the given crush device class.
// Callers are expected to have run validate_osd_create() first.
// On return, *new_id holds the id actually used.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd: reuse that id (idempotent/replay path)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // assign the requested crush device class to the new osd
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure (pending) max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  // weight CEPH_OSD_IN for the new osd
  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9413
9414 int OSDMonitor::validate_osd_create(
9415 const int32_t id,
9416 const uuid_d& uuid,
9417 const bool check_osd_exists,
9418 int32_t* existing_id,
9419 stringstream& ss)
9420 {
9421
9422 dout(10) << __func__ << " id " << id << " uuid " << uuid
9423 << " check_osd_exists " << check_osd_exists << dendl;
9424
9425 ceph_assert(existing_id);
9426
9427 if (id < 0 && uuid.is_zero()) {
9428 // we have nothing to validate
9429 *existing_id = -1;
9430 return 0;
9431 } else if (uuid.is_zero()) {
9432 // we have an id but we will ignore it - because that's what
9433 // `osd create` does.
9434 return 0;
9435 }
9436
9437 /*
9438 * This function will be used to validate whether we are able to
9439 * create a new osd when the `uuid` is specified.
9440 *
9441 * It will be used by both `osd create` and `osd new`, as the checks
9442 * are basically the same when it pertains to osd id and uuid validation.
9443 * However, `osd create` presumes an `uuid` is optional, for legacy
9444 * reasons, while `osd new` requires the `uuid` to be provided. This
9445 * means that `osd create` will not be idempotent if an `uuid` is not
9446 * provided, but we will always guarantee the idempotency of `osd new`.
9447 */
9448
9449 ceph_assert(!uuid.is_zero());
9450 if (pending_inc.identify_osd(uuid) >= 0) {
9451 // osd is about to exist
9452 return -EAGAIN;
9453 }
9454
9455 int32_t i = osdmap.identify_osd(uuid);
9456 if (i >= 0) {
9457 // osd already exists
9458 if (id >= 0 && i != id) {
9459 ss << "uuid " << uuid << " already in use for different id " << i;
9460 return -EEXIST;
9461 }
9462 // return a positive errno to distinguish between a blocking error
9463 // and an error we consider to not be a problem (i.e., this would be
9464 // an idempotent operation).
9465 *existing_id = i;
9466 return EEXIST;
9467 }
9468 // i < 0
9469 if (id >= 0) {
9470 if (pending_inc.new_state.count(id)) {
9471 // osd is about to exist
9472 return -EAGAIN;
9473 }
9474 // we may not care if an osd exists if we are recreating a previously
9475 // destroyed osd.
9476 if (check_osd_exists && osdmap.exists(id)) {
9477 ss << "id " << id << " already in use and does not match uuid "
9478 << uuid;
9479 return -EINVAL;
9480 }
9481 }
9482 return 0;
9483 }
9484
9485 int OSDMonitor::prepare_command_osd_create(
9486 const int32_t id,
9487 const uuid_d& uuid,
9488 int32_t* existing_id,
9489 stringstream& ss)
9490 {
9491 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9492 ceph_assert(existing_id);
9493 if (osdmap.is_destroyed(id)) {
9494 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9495 "instead.";
9496 return -EINVAL;
9497 }
9498
9499 if (uuid.is_zero()) {
9500 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9501 }
9502
9503 return validate_osd_create(id, uuid, true, existing_id, ss);
9504 }
9505
/**
 * Handle `osd new`: create a new osd, or recreate a previously
 * destroyed one, staging osdmap / auth / kv changes as needed.
 *
 * @param op      the originating monitor command request
 * @param cmdmap  parsed command arguments (`uuid` required, `id` optional)
 * @param params  optional secrets/attributes supplied via `-i` json
 *                (cephx_secret, cephx_lockbox_secret, dmcrypt_key,
 *                crush_device_class)
 * @param ss      human-readable output (the osd id on success)
 * @param f       optional formatter for structured output
 * @return 0 on success, positive EEXIST when the command is an
 *         idempotent no-op, or a negative errno on failure.
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // a negative return means a free slot was found; use it
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory once any secret is supplied
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key must be supplied together
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      // the dm-crypt key lives in the kv (config-key) service
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    // NOTE(review): new_state bits appear to be applied as an xor over the
    // current state, so setting CEPH_OSD_DESTROYED here clears the flag on
    // apply -- confirm against OSDMap::Incremental before relying on this.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9769
9770 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9771 {
9772 op->mark_osdmon_event(__func__);
9773 auto m = op->get_req<MMonCommand>();
9774 stringstream ss;
9775 cmdmap_t cmdmap;
9776 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9777 string rs = ss.str();
9778 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9779 return false; /* nothing to propose */
9780 }
9781
9782 MonSession *session = op->get_session();
9783 if (!session) {
9784 derr << __func__ << " no session" << dendl;
9785 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9786 return false; /* nothing to propose */
9787 }
9788
9789 return prepare_command_impl(op, cmdmap);
9790 }
9791
9792 static int parse_reweights(CephContext *cct,
9793 const cmdmap_t& cmdmap,
9794 const OSDMap& osdmap,
9795 map<int32_t, uint32_t>* weights)
9796 {
9797 string weights_str;
9798 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9799 return -EINVAL;
9800 }
9801 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9802 json_spirit::mValue json_value;
9803 if (!json_spirit::read(weights_str, json_value)) {
9804 return -EINVAL;
9805 }
9806 if (json_value.type() != json_spirit::obj_type) {
9807 return -EINVAL;
9808 }
9809 const auto obj = json_value.get_obj();
9810 try {
9811 for (auto& osd_weight : obj) {
9812 auto osd_id = std::stoi(osd_weight.first);
9813 if (!osdmap.exists(osd_id)) {
9814 return -ENOENT;
9815 }
9816 if (osd_weight.second.type() != json_spirit::str_type) {
9817 return -EINVAL;
9818 }
9819 auto weight = std::stoul(osd_weight.second.get_str());
9820 weights->insert({osd_id, weight});
9821 }
9822 } catch (const std::logic_error& e) {
9823 return -EINVAL;
9824 }
9825 return 0;
9826 }
9827
// Stage the destruction of osd.<id> in pending_inc: mark it DESTROYED,
// zero its uuid, and remove its cephx/lockbox auth entities and its
// config-key entries.  Returns -ENOENT if the osd does not exist (for
// the benefit of `osd purge`), 0 on success or if the osd was already
// destroyed.  Requires paxos to be plugged; the caller is responsible
// for proposing the pending map (see the comment at the bottom).
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  // idempotent_auth / idempotent_cks: set when the auth entities or
  // config-key entries are already gone, so the corresponding removal
  // step below can be skipped without treating it as an error.
  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already removed; nothing to do on the auth side
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // -ENOENT is the only failure validate_osd_destroy can report here;
    // it means the config-key entries are already gone
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: flag the osd DESTROYED and wipe its uuid
  // so the id can be reused by a future `osd new`
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9899
// Stage a full purge of osd.<id>: remove it from crush, destroy it
// (auth, config-keys, DESTROYED flag) and remove it from the osdmap.
// Returns -ENOENT when the osd is already fully gone (idempotent replay),
// 0 on success, or a negative error before any update has been staged.
// Requires paxos to be plugged and the osd to be down; the caller
// proposes the pending map.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, on success this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  // tracks whether every step so far was a no-op; if so and the osd is
  // also gone from the map, this is a replay and we report -ENOENT below
  bool may_be_idempotent = false;

  // step 1: validate the crush removal up front (on a copy); the actual
  // commit of this change is deferred to do_osd_crush_remove() at the end
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: destroy auth/config-key state and stage the DESTROYED flag
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: commit the crush removal validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
9967
// Parse and validate the "pgid" command argument into `pgid`.
// Returns 0 on success, -EINVAL if the argument is missing or does not
// parse as a pgid, and -ENOENT if the pg does not exist in the current
// osdmap.
// NOTE(review): `pgids` is taken *by value*, so the assignment below
// mutates only this function's local copy and can never be observed by
// the caller -- if it is meant to hand the raw pgid string back, the
// parameter should be a pointer or reference; confirm against callers.
int OSDMonitor::parse_pgid(const cmdmap_t& cmdmap, stringstream &ss,
                           /* out */ pg_t &pgid, std::optional<string> pgids) {
  string pgidstr;
  if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
    ss << "unable to parse 'pgid' value '"
       << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
    return -EINVAL;
  }
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    return -EINVAL;
  }
  if (!osdmap.pg_exists(pgid)) {
    ss << "pgid '" << pgid << "' does not exist";
    return -ENOENT;
  }
  if (pgids.has_value())
    pgids.value() = pgidstr;  // see NOTE above: writes to a local copy only
  return 0;
}
9988
9989 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9990 const cmdmap_t& cmdmap)
9991 {
9992 op->mark_osdmon_event(__func__);
9993 auto m = op->get_req<MMonCommand>();
9994 stringstream ss;
9995 string rs;
9996 bufferlist rdata;
9997 int err = 0;
9998
9999 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
10000 boost::scoped_ptr<Formatter> f(Formatter::create(format));
10001
10002 string prefix;
10003 cmd_getval(cmdmap, "prefix", prefix);
10004
10005 int64_t osdid;
10006 string osd_name;
10007 bool osdid_present = false;
10008 if (prefix != "osd pg-temp" &&
10009 prefix != "osd pg-upmap" &&
10010 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
10011 osdid_present = cmd_getval(cmdmap, "id", osdid);
10012 }
10013 if (osdid_present) {
10014 ostringstream oss;
10015 oss << "osd." << osdid;
10016 osd_name = oss.str();
10017 }
10018
10019 // Even if there's a pending state with changes that could affect
10020 // a command, considering that said state isn't yet committed, we
10021 // just don't care about those changes if the command currently being
10022 // handled acts as a no-op against the current committed state.
10023 // In a nutshell, we assume this command happens *before*.
10024 //
10025 // Let me make this clearer:
10026 //
10027 // - If we have only one client, and that client issues some
10028 // operation that would conflict with this operation but is
10029 // still on the pending state, then we would be sure that said
10030 // operation wouldn't have returned yet, so the client wouldn't
10031 // issue this operation (unless the client didn't wait for the
10032 // operation to finish, and that would be the client's own fault).
10033 //
10034 // - If we have more than one client, each client will observe
10035 // whatever is the state at the moment of the commit. So, if we
10036 // have two clients, one issuing an unlink and another issuing a
10037 // link, and if the link happens while the unlink is still on the
10038 // pending state, from the link's point-of-view this is a no-op.
10039 // If different clients are issuing conflicting operations and
10040 // they care about that, then the clients should make sure they
10041 // enforce some kind of concurrency mechanism -- from our
10042 // perspective that's what Douglas Adams would call an SEP.
10043 //
10044 // This should be used as a general guideline for most commands handled
10045 // in this function. Adapt as you see fit, but please bear in mind that
10046 // this is the expected behavior.
10047
10048
10049 if (prefix == "osd setcrushmap" ||
10050 (prefix == "osd crush set" && !osdid_present)) {
10051 if (pending_inc.crush.length()) {
10052 dout(10) << __func__ << " waiting for pending crush update " << dendl;
10053 goto wait;
10054 }
10055 dout(10) << "prepare_command setting new crush map" << dendl;
10056 bufferlist data(m->get_data());
10057 CrushWrapper crush;
10058 try {
10059 auto bl = data.cbegin();
10060 crush.decode(bl);
10061 }
10062 catch (const std::exception &e) {
10063 err = -EINVAL;
10064 ss << "Failed to parse crushmap: " << e.what();
10065 goto reply_no_propose;
10066 }
10067
10068 int64_t prior_version = 0;
10069 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
10070 if (prior_version == osdmap.get_crush_version() - 1) {
10071 // see if we are a resend of the last update. this is imperfect
10072 // (multiple racing updaters may not both get reliable success)
10073 // but we expect crush updaters (via this interface) to be rare-ish.
10074 bufferlist current, proposed;
10075 osdmap.crush->encode(current, mon.get_quorum_con_features());
10076 crush.encode(proposed, mon.get_quorum_con_features());
10077 if (current.contents_equal(proposed)) {
10078 dout(10) << __func__
10079 << " proposed matches current and version equals previous"
10080 << dendl;
10081 err = 0;
10082 ss << osdmap.get_crush_version();
10083 goto reply_no_propose;
10084 }
10085 }
10086 if (prior_version != osdmap.get_crush_version()) {
10087 err = -EPERM;
10088 ss << "prior_version " << prior_version << " != crush version "
10089 << osdmap.get_crush_version();
10090 goto reply_no_propose;
10091 }
10092 }
10093
10094 if (!validate_crush_against_features(&crush, ss)) {
10095 err = -EINVAL;
10096 goto reply_no_propose;
10097 }
10098
10099 err = osdmap.validate_crush_rules(&crush, &ss);
10100 if (err < 0) {
10101 goto reply_no_propose;
10102 }
10103
10104 if (g_conf()->mon_osd_crush_smoke_test) {
10105 // sanity check: test some inputs to make sure this map isn't
10106 // totally broken
10107 dout(10) << " testing map" << dendl;
10108 stringstream ess;
10109 CrushTester tester(crush, ess);
10110 tester.set_min_x(0);
10111 tester.set_max_x(50);
10112 tester.set_num_rep(3); // arbitrary
10113 auto start = ceph::coarse_mono_clock::now();
10114 int r = tester.test_with_fork(cct, g_conf()->mon_lease);
10115 auto duration = ceph::coarse_mono_clock::now() - start;
10116 if (r < 0) {
10117 dout(10) << " tester.test_with_fork returns " << r
10118 << ": " << ess.str() << dendl;
10119 ss << "crush smoke test failed with " << r << ": " << ess.str();
10120 err = r;
10121 goto reply_no_propose;
10122 }
10123 dout(10) << __func__ << " crush somke test duration: "
10124 << duration << ", result: " << ess.str() << dendl;
10125 }
10126
10127 pending_inc.crush = data;
10128 ss << osdmap.get_crush_version() + 1;
10129 goto update;
10130
10131 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10132 CrushWrapper newcrush = _get_pending_crush();
10133 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10134 int bid = -1 - b;
10135 if (newcrush.bucket_exists(bid) &&
10136 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10137 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10138 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10139 }
10140 }
10141 if (!validate_crush_against_features(&newcrush, ss)) {
10142 err = -EINVAL;
10143 goto reply_no_propose;
10144 }
10145 pending_inc.crush.clear();
10146 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10147 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10148 get_last_committed() + 1));
10149 return true;
10150 } else if (prefix == "osd crush set-device-class") {
10151 string device_class;
10152 if (!cmd_getval(cmdmap, "class", device_class)) {
10153 err = -EINVAL; // no value!
10154 goto reply_no_propose;
10155 }
10156
10157 bool stop = false;
10158 vector<string> idvec;
10159 cmd_getval(cmdmap, "ids", idvec);
10160 CrushWrapper newcrush = _get_pending_crush();
10161 set<int> updated;
10162 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10163 set<int> osds;
10164 // wildcard?
10165 if (j == 0 &&
10166 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10167 osdmap.get_all_osds(osds);
10168 stop = true;
10169 } else {
10170 // try traditional single osd way
10171 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10172 if (osd < 0) {
10173 // ss has reason for failure
10174 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10175 err = -EINVAL;
10176 continue;
10177 }
10178 osds.insert(osd);
10179 }
10180
10181 for (auto &osd : osds) {
10182 if (!osdmap.exists(osd)) {
10183 ss << "osd." << osd << " does not exist. ";
10184 continue;
10185 }
10186
10187 ostringstream oss;
10188 oss << "osd." << osd;
10189 string name = oss.str();
10190
10191 if (newcrush.get_max_devices() < osd + 1) {
10192 newcrush.set_max_devices(osd + 1);
10193 }
10194 string action;
10195 if (newcrush.item_exists(osd)) {
10196 action = "updating";
10197 } else {
10198 action = "creating";
10199 newcrush.set_item_name(osd, name);
10200 }
10201
10202 dout(5) << action << " crush item id " << osd << " name '" << name
10203 << "' device_class '" << device_class << "'"
10204 << dendl;
10205 err = newcrush.update_device_class(osd, device_class, name, &ss);
10206 if (err < 0) {
10207 goto reply_no_propose;
10208 }
10209 if (err == 0 && !_have_pending_crush()) {
10210 if (!stop) {
10211 // for single osd only, wildcard makes too much noise
10212 ss << "set-device-class item id " << osd << " name '" << name
10213 << "' device_class '" << device_class << "': no change. ";
10214 }
10215 } else {
10216 updated.insert(osd);
10217 }
10218 }
10219 }
10220
10221 pending_inc.crush.clear();
10222 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10223 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10224 getline(ss, rs);
10225 wait_for_finished_proposal(
10226 op,
10227 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10228 return true;
10229 } else if (prefix == "osd crush rm-device-class") {
10230 bool stop = false;
10231 vector<string> idvec;
10232 cmd_getval(cmdmap, "ids", idvec);
10233 CrushWrapper newcrush = _get_pending_crush();
10234 set<int> updated;
10235
10236 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10237 set<int> osds;
10238
10239 // wildcard?
10240 if (j == 0 &&
10241 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10242 osdmap.get_all_osds(osds);
10243 stop = true;
10244 } else {
10245 // try traditional single osd way
10246 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10247 if (osd < 0) {
10248 // ss has reason for failure
10249 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10250 err = -EINVAL;
10251 goto reply_no_propose;
10252 }
10253 osds.insert(osd);
10254 }
10255
10256 for (auto &osd : osds) {
10257 if (!osdmap.exists(osd)) {
10258 ss << "osd." << osd << " does not exist. ";
10259 continue;
10260 }
10261
10262 auto class_name = newcrush.get_item_class(osd);
10263 if (!class_name) {
10264 ss << "osd." << osd << " belongs to no class, ";
10265 continue;
10266 }
10267 // note that we do not verify if class_is_in_use here
10268 // in case the device is misclassified and user wants
10269 // to overridely reset...
10270
10271 err = newcrush.remove_device_class(cct, osd, &ss);
10272 if (err < 0) {
10273 // ss has reason for failure
10274 goto reply_no_propose;
10275 }
10276 updated.insert(osd);
10277 }
10278 }
10279
10280 pending_inc.crush.clear();
10281 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10282 ss << "done removing class of osd(s): " << updated;
10283 getline(ss, rs);
10284 wait_for_finished_proposal(
10285 op,
10286 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10287 return true;
10288 } else if (prefix == "osd crush class create") {
10289 string device_class;
10290 if (!cmd_getval(cmdmap, "class", device_class)) {
10291 err = -EINVAL; // no value!
10292 goto reply_no_propose;
10293 }
10294 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10295 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10296 << "luminous' before using crush device classes";
10297 err = -EPERM;
10298 goto reply_no_propose;
10299 }
10300 if (!_have_pending_crush() &&
10301 _get_stable_crush().class_exists(device_class)) {
10302 ss << "class '" << device_class << "' already exists";
10303 goto reply_no_propose;
10304 }
10305 CrushWrapper newcrush = _get_pending_crush();
10306 if (newcrush.class_exists(device_class)) {
10307 ss << "class '" << device_class << "' already exists";
10308 goto update;
10309 }
10310 int class_id = newcrush.get_or_create_class_id(device_class);
10311 pending_inc.crush.clear();
10312 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10313 ss << "created class " << device_class << " with id " << class_id
10314 << " to crush map";
10315 goto update;
10316 } else if (prefix == "osd crush class rm") {
10317 string device_class;
10318 if (!cmd_getval(cmdmap, "class", device_class)) {
10319 err = -EINVAL; // no value!
10320 goto reply_no_propose;
10321 }
10322 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10323 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10324 << "luminous' before using crush device classes";
10325 err = -EPERM;
10326 goto reply_no_propose;
10327 }
10328
10329 if (!osdmap.crush->class_exists(device_class)) {
10330 err = 0;
10331 goto reply_no_propose;
10332 }
10333
10334 CrushWrapper newcrush = _get_pending_crush();
10335 if (!newcrush.class_exists(device_class)) {
10336 err = 0; // make command idempotent
10337 goto wait;
10338 }
10339 int class_id = newcrush.get_class_id(device_class);
10340 stringstream ts;
10341 if (newcrush.class_is_in_use(class_id, &ts)) {
10342 err = -EBUSY;
10343 ss << "class '" << device_class << "' " << ts.str();
10344 goto reply_no_propose;
10345 }
10346
10347 // check if class is used by any erasure-code-profiles
10348 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10349 osdmap.get_erasure_code_profiles();
10350 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10351 #ifdef HAVE_STDLIB_MAP_SPLICING
10352 ec_profiles.merge(old_ec_profiles);
10353 #else
10354 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10355 make_move_iterator(end(old_ec_profiles)));
10356 #endif
10357 list<string> referenced_by;
10358 for (auto &i: ec_profiles) {
10359 for (auto &j: i.second) {
10360 if ("crush-device-class" == j.first && device_class == j.second) {
10361 referenced_by.push_back(i.first);
10362 }
10363 }
10364 }
10365 if (!referenced_by.empty()) {
10366 err = -EBUSY;
10367 ss << "class '" << device_class
10368 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10369 goto reply_no_propose;
10370 }
10371
10372 set<int> osds;
10373 newcrush.get_devices_by_class(device_class, &osds);
10374 for (auto& p: osds) {
10375 err = newcrush.remove_device_class(cct, p, &ss);
10376 if (err < 0) {
10377 // ss has reason for failure
10378 goto reply_no_propose;
10379 }
10380 }
10381
10382 if (osds.empty()) {
10383 // empty class, remove directly
10384 err = newcrush.remove_class_name(device_class);
10385 if (err < 0) {
10386 ss << "class '" << device_class << "' cannot be removed '"
10387 << cpp_strerror(err) << "'";
10388 goto reply_no_propose;
10389 }
10390 }
10391
10392 pending_inc.crush.clear();
10393 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10394 ss << "removed class " << device_class << " with id " << class_id
10395 << " from crush map";
10396 goto update;
10397 } else if (prefix == "osd crush class rename") {
10398 string srcname, dstname;
10399 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10400 err = -EINVAL;
10401 goto reply_no_propose;
10402 }
10403 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10404 err = -EINVAL;
10405 goto reply_no_propose;
10406 }
10407
10408 CrushWrapper newcrush = _get_pending_crush();
10409 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10410 // suppose this is a replay and return success
10411 // so command is idempotent
10412 ss << "already renamed to '" << dstname << "'";
10413 err = 0;
10414 goto reply_no_propose;
10415 }
10416
10417 err = newcrush.rename_class(srcname, dstname);
10418 if (err < 0) {
10419 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10420 << cpp_strerror(err);
10421 goto reply_no_propose;
10422 }
10423
10424 pending_inc.crush.clear();
10425 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10426 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10427 goto update;
10428 } else if (prefix == "osd crush add-bucket") {
10429 // os crush add-bucket <name> <type>
10430 string name, typestr;
10431 vector<string> argvec;
10432 cmd_getval(cmdmap, "name", name);
10433 cmd_getval(cmdmap, "type", typestr);
10434 cmd_getval(cmdmap, "args", argvec);
10435 map<string,string> loc;
10436 if (!argvec.empty()) {
10437 CrushWrapper::parse_loc_map(argvec, &loc);
10438 dout(0) << "will create and move bucket '" << name
10439 << "' to location " << loc << dendl;
10440 }
10441
10442 if (!_have_pending_crush() &&
10443 _get_stable_crush().name_exists(name)) {
10444 ss << "bucket '" << name << "' already exists";
10445 goto reply_no_propose;
10446 }
10447
10448 CrushWrapper newcrush = _get_pending_crush();
10449
10450 if (newcrush.name_exists(name)) {
10451 ss << "bucket '" << name << "' already exists";
10452 goto update;
10453 }
10454 int type = newcrush.get_type_id(typestr);
10455 if (type < 0) {
10456 ss << "type '" << typestr << "' does not exist";
10457 err = -EINVAL;
10458 goto reply_no_propose;
10459 }
10460 if (type == 0) {
10461 ss << "type '" << typestr << "' is for devices, not buckets";
10462 err = -EINVAL;
10463 goto reply_no_propose;
10464 }
10465 int bucketno;
10466 err = newcrush.add_bucket(0, 0,
10467 CRUSH_HASH_DEFAULT, type, 0, NULL,
10468 NULL, &bucketno);
10469 if (err < 0) {
10470 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10471 goto reply_no_propose;
10472 }
10473 err = newcrush.set_item_name(bucketno, name);
10474 if (err < 0) {
10475 ss << "error setting bucket name to '" << name << "'";
10476 goto reply_no_propose;
10477 }
10478
10479 if (!loc.empty()) {
10480 if (!newcrush.check_item_loc(cct, bucketno, loc,
10481 (int *)NULL)) {
10482 err = newcrush.move_bucket(cct, bucketno, loc);
10483 if (err < 0) {
10484 ss << "error moving bucket '" << name << "' to location " << loc;
10485 goto reply_no_propose;
10486 }
10487 } else {
10488 ss << "no need to move item id " << bucketno << " name '" << name
10489 << "' to location " << loc << " in crush map";
10490 }
10491 }
10492
10493 pending_inc.crush.clear();
10494 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10495 if (loc.empty()) {
10496 ss << "added bucket " << name << " type " << typestr
10497 << " to crush map";
10498 } else {
10499 ss << "added bucket " << name << " type " << typestr
10500 << " to location " << loc;
10501 }
10502 goto update;
10503 } else if (prefix == "osd crush rename-bucket") {
10504 string srcname, dstname;
10505 cmd_getval(cmdmap, "srcname", srcname);
10506 cmd_getval(cmdmap, "dstname", dstname);
10507
10508 err = crush_rename_bucket(srcname, dstname, &ss);
10509 if (err) {
10510 // equivalent to success for idempotency
10511 if (err == -EALREADY) {
10512 err = 0;
10513 }
10514 goto reply_no_propose;
10515 } else {
10516 goto update;
10517 }
10518 } else if (prefix == "osd crush weight-set create" ||
10519 prefix == "osd crush weight-set create-compat") {
10520 if (_have_pending_crush()) {
10521 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10522 goto wait;
10523 }
10524 CrushWrapper newcrush = _get_pending_crush();
10525 int64_t pool;
10526 int positions;
10527 if (newcrush.has_non_straw2_buckets()) {
10528 ss << "crush map contains one or more bucket(s) that are not straw2";
10529 err = -EPERM;
10530 goto reply_no_propose;
10531 }
10532 if (prefix == "osd crush weight-set create") {
10533 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10534 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10535 ss << "require_min_compat_client "
10536 << osdmap.require_min_compat_client
10537 << " < luminous, which is required for per-pool weight-sets. "
10538 << "Try 'ceph osd set-require-min-compat-client luminous' "
10539 << "before using the new interface";
10540 err = -EPERM;
10541 goto reply_no_propose;
10542 }
10543 string poolname, mode;
10544 cmd_getval(cmdmap, "pool", poolname);
10545 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10546 if (pool < 0) {
10547 ss << "pool '" << poolname << "' not found";
10548 err = -ENOENT;
10549 goto reply_no_propose;
10550 }
10551 cmd_getval(cmdmap, "mode", mode);
10552 if (mode != "flat" && mode != "positional") {
10553 ss << "unrecognized weight-set mode '" << mode << "'";
10554 err = -EINVAL;
10555 goto reply_no_propose;
10556 }
10557 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10558 } else {
10559 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10560 positions = 1;
10561 }
10562 if (!newcrush.create_choose_args(pool, positions)) {
10563 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10564 ss << "compat weight-set already created";
10565 } else {
10566 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10567 << "' already created";
10568 }
10569 goto reply_no_propose;
10570 }
10571 pending_inc.crush.clear();
10572 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10573 goto update;
10574
10575 } else if (prefix == "osd crush weight-set rm" ||
10576 prefix == "osd crush weight-set rm-compat") {
10577 CrushWrapper newcrush = _get_pending_crush();
10578 int64_t pool;
10579 if (prefix == "osd crush weight-set rm") {
10580 string poolname;
10581 cmd_getval(cmdmap, "pool", poolname);
10582 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10583 if (pool < 0) {
10584 ss << "pool '" << poolname << "' not found";
10585 err = -ENOENT;
10586 goto reply_no_propose;
10587 }
10588 } else {
10589 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10590 }
10591 newcrush.rm_choose_args(pool);
10592 pending_inc.crush.clear();
10593 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10594 goto update;
10595
10596 } else if (prefix == "osd crush weight-set reweight" ||
10597 prefix == "osd crush weight-set reweight-compat") {
10598 string poolname, item;
10599 vector<double> weight;
10600 cmd_getval(cmdmap, "pool", poolname);
10601 cmd_getval(cmdmap, "item", item);
10602 cmd_getval(cmdmap, "weight", weight);
10603 CrushWrapper newcrush = _get_pending_crush();
10604 int64_t pool;
10605 if (prefix == "osd crush weight-set reweight") {
10606 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10607 if (pool < 0) {
10608 ss << "pool '" << poolname << "' not found";
10609 err = -ENOENT;
10610 goto reply_no_propose;
10611 }
10612 if (!newcrush.have_choose_args(pool)) {
10613 ss << "no weight-set for pool '" << poolname << "'";
10614 err = -ENOENT;
10615 goto reply_no_propose;
10616 }
10617 auto arg_map = newcrush.choose_args_get(pool);
10618 int positions = newcrush.get_choose_args_positions(arg_map);
10619 if (weight.size() != (size_t)positions) {
10620 ss << "must specify exact " << positions << " weight values";
10621 err = -EINVAL;
10622 goto reply_no_propose;
10623 }
10624 } else {
10625 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10626 if (!newcrush.have_choose_args(pool)) {
10627 ss << "no backward-compatible weight-set";
10628 err = -ENOENT;
10629 goto reply_no_propose;
10630 }
10631 }
10632 if (!newcrush.name_exists(item)) {
10633 ss << "item '" << item << "' does not exist";
10634 err = -ENOENT;
10635 goto reply_no_propose;
10636 }
10637 err = newcrush.choose_args_adjust_item_weightf(
10638 cct,
10639 newcrush.choose_args_get(pool),
10640 newcrush.get_item_id(item),
10641 weight,
10642 &ss);
10643 if (err < 0) {
10644 goto reply_no_propose;
10645 }
10646 err = 0;
10647 pending_inc.crush.clear();
10648 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10649 goto update;
10650 } else if (osdid_present &&
10651 (prefix == "osd crush set" || prefix == "osd crush add")) {
10652 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10653 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10654 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10655
10656 if (!osdmap.exists(osdid)) {
10657 err = -ENOENT;
10658 ss << osd_name
10659 << " does not exist. Create it before updating the crush map";
10660 goto reply_no_propose;
10661 }
10662
10663 double weight;
10664 if (!cmd_getval(cmdmap, "weight", weight)) {
10665 ss << "unable to parse weight value '"
10666 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10667 err = -EINVAL;
10668 goto reply_no_propose;
10669 }
10670
10671 string args;
10672 vector<string> argvec;
10673 cmd_getval(cmdmap, "args", argvec);
10674 map<string,string> loc;
10675 CrushWrapper::parse_loc_map(argvec, &loc);
10676
10677 if (prefix == "osd crush set"
10678 && !_get_stable_crush().item_exists(osdid)) {
10679 err = -ENOENT;
10680 ss << "unable to set item id " << osdid << " name '" << osd_name
10681 << "' weight " << weight << " at location " << loc
10682 << ": does not exist";
10683 goto reply_no_propose;
10684 }
10685
10686 dout(5) << "adding/updating crush item id " << osdid << " name '"
10687 << osd_name << "' weight " << weight << " at location "
10688 << loc << dendl;
10689 CrushWrapper newcrush = _get_pending_crush();
10690
10691 string action;
10692 if (prefix == "osd crush set" ||
10693 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10694 action = "set";
10695 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10696 } else {
10697 action = "add";
10698 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10699 if (err == 0)
10700 err = 1;
10701 }
10702
10703 if (err < 0)
10704 goto reply_no_propose;
10705
10706 if (err == 0 && !_have_pending_crush()) {
10707 ss << action << " item id " << osdid << " name '" << osd_name
10708 << "' weight " << weight << " at location " << loc << ": no change";
10709 goto reply_no_propose;
10710 }
10711
10712 pending_inc.crush.clear();
10713 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10714 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10715 << weight << " at location " << loc << " to crush map";
10716 getline(ss, rs);
10717 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10718 get_last_committed() + 1));
10719 return true;
10720
10721 } else if (prefix == "osd crush create-or-move") {
10722 do {
10723 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10724 if (!osdmap.exists(osdid)) {
10725 err = -ENOENT;
10726 ss << osd_name
10727 << " does not exist. create it before updating the crush map";
10728 goto reply_no_propose;
10729 }
10730
10731 double weight;
10732 if (!cmd_getval(cmdmap, "weight", weight)) {
10733 ss << "unable to parse weight value '"
10734 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10735 err = -EINVAL;
10736 goto reply_no_propose;
10737 }
10738
10739 string args;
10740 vector<string> argvec;
10741 cmd_getval(cmdmap, "args", argvec);
10742 map<string,string> loc;
10743 CrushWrapper::parse_loc_map(argvec, &loc);
10744
10745 dout(0) << "create-or-move crush item name '" << osd_name
10746 << "' initial_weight " << weight << " at location " << loc
10747 << dendl;
10748
10749 CrushWrapper newcrush = _get_pending_crush();
10750
10751 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10752 g_conf()->osd_crush_update_weight_set);
10753 if (err == 0) {
10754 ss << "create-or-move updated item name '" << osd_name
10755 << "' weight " << weight
10756 << " at location " << loc << " to crush map";
10757 break;
10758 }
10759 if (err > 0) {
10760 pending_inc.crush.clear();
10761 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10762 ss << "create-or-move updating item name '" << osd_name
10763 << "' weight " << weight
10764 << " at location " << loc << " to crush map";
10765 getline(ss, rs);
10766 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10767 get_last_committed() + 1));
10768 return true;
10769 }
10770 } while (false);
10771
10772 } else if (prefix == "osd crush move") {
10773 do {
10774 // osd crush move <name> <loc1> [<loc2> ...]
10775 string name;
10776 vector<string> argvec;
10777 cmd_getval(cmdmap, "name", name);
10778 cmd_getval(cmdmap, "args", argvec);
10779 map<string,string> loc;
10780 CrushWrapper::parse_loc_map(argvec, &loc);
10781
10782 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10783 CrushWrapper newcrush = _get_pending_crush();
10784
10785 if (!newcrush.name_exists(name)) {
10786 err = -ENOENT;
10787 ss << "item " << name << " does not exist";
10788 break;
10789 }
10790 int id = newcrush.get_item_id(name);
10791
10792 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10793 if (id >= 0) {
10794 err = newcrush.create_or_move_item(
10795 cct, id, 0, name, loc,
10796 g_conf()->osd_crush_update_weight_set);
10797 } else {
10798 err = newcrush.move_bucket(cct, id, loc);
10799 }
10800 if (err >= 0) {
10801 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10802 pending_inc.crush.clear();
10803 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10804 getline(ss, rs);
10805 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10806 get_last_committed() + 1));
10807 return true;
10808 }
10809 } else {
10810 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10811 err = 0;
10812 }
10813 } while (false);
10814 } else if (prefix == "osd crush swap-bucket") {
10815 string source, dest;
10816 cmd_getval(cmdmap, "source", source);
10817 cmd_getval(cmdmap, "dest", dest);
10818
10819 bool force = false;
10820 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10821
10822 CrushWrapper newcrush = _get_pending_crush();
10823 if (!newcrush.name_exists(source)) {
10824 ss << "source item " << source << " does not exist";
10825 err = -ENOENT;
10826 goto reply_no_propose;
10827 }
10828 if (!newcrush.name_exists(dest)) {
10829 ss << "dest item " << dest << " does not exist";
10830 err = -ENOENT;
10831 goto reply_no_propose;
10832 }
10833 int sid = newcrush.get_item_id(source);
10834 int did = newcrush.get_item_id(dest);
10835 int sparent;
10836 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10837 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10838 err = -EPERM;
10839 goto reply_no_propose;
10840 }
10841 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10842 !force) {
10843 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10844 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10845 << "; pass --yes-i-really-mean-it to proceed anyway";
10846 err = -EPERM;
10847 goto reply_no_propose;
10848 }
10849 int r = newcrush.swap_bucket(cct, sid, did);
10850 if (r < 0) {
10851 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10852 err = r;
10853 goto reply_no_propose;
10854 }
10855 ss << "swapped bucket of " << source << " to " << dest;
10856 pending_inc.crush.clear();
10857 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10858 wait_for_finished_proposal(op,
10859 new Monitor::C_Command(mon, op, err, ss.str(),
10860 get_last_committed() + 1));
10861 return true;
10862 } else if (prefix == "osd crush link") {
10863 // osd crush link <name> <loc1> [<loc2> ...]
10864 string name;
10865 cmd_getval(cmdmap, "name", name);
10866 vector<string> argvec;
10867 cmd_getval(cmdmap, "args", argvec);
10868 map<string,string> loc;
10869 CrushWrapper::parse_loc_map(argvec, &loc);
10870
10871 // Need an explicit check for name_exists because get_item_id returns
10872 // 0 when the name is not found.
10873 int id = osdmap.crush->get_item_id(name);
10874 if (!osdmap.crush->name_exists(name)) {
10875 err = -ENOENT;
10876 ss << "item " << name << " does not exist";
10877 goto reply_no_propose;
10878 } else {
10879 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10880 }
10881 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10882 ss << "no need to move item id " << id << " name '" << name
10883 << "' to location " << loc << " in crush map";
10884 err = 0;
10885 goto reply_no_propose;
10886 }
10887
10888 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10889 CrushWrapper newcrush = _get_pending_crush();
10890
10891 if (!newcrush.name_exists(name)) {
10892 err = -ENOENT;
10893 ss << "item " << name << " does not exist";
10894 goto reply_no_propose;
10895 } else {
10896 int id = newcrush.get_item_id(name);
10897 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10898 err = newcrush.link_bucket(cct, id, loc);
10899 if (err >= 0) {
10900 ss << "linked item id " << id << " name '" << name
10901 << "' to location " << loc << " in crush map";
10902 pending_inc.crush.clear();
10903 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10904 } else {
10905 ss << "cannot link item id " << id << " name '" << name
10906 << "' to location " << loc;
10907 goto reply_no_propose;
10908 }
10909 } else {
10910 ss << "no need to move item id " << id << " name '" << name
10911 << "' to location " << loc << " in crush map";
10912 err = 0;
10913 }
10914 }
10915 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10916 get_last_committed() + 1));
10917 return true;
10918 } else if (prefix == "osd crush rm" ||
10919 prefix == "osd crush remove" ||
10920 prefix == "osd crush unlink") {
10921 do {
10922 // osd crush rm <name> [ancestor]
10923 CrushWrapper newcrush = _get_pending_crush();
10924
10925 string name;
10926 cmd_getval(cmdmap, "name", name);
10927
10928 if (!osdmap.crush->name_exists(name)) {
10929 err = 0;
10930 ss << "device '" << name << "' does not appear in the crush map";
10931 break;
10932 }
10933 if (!newcrush.name_exists(name)) {
10934 err = 0;
10935 ss << "device '" << name << "' does not appear in the crush map";
10936 getline(ss, rs);
10937 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10938 get_last_committed() + 1));
10939 return true;
10940 }
10941 int id = newcrush.get_item_id(name);
10942 int ancestor = 0;
10943
10944 bool unlink_only = prefix == "osd crush unlink";
10945 string ancestor_str;
10946 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10947 if (!newcrush.name_exists(ancestor_str)) {
10948 err = -ENOENT;
10949 ss << "ancestor item '" << ancestor_str
10950 << "' does not appear in the crush map";
10951 break;
10952 }
10953 ancestor = newcrush.get_item_id(ancestor_str);
10954 }
10955
10956 err = prepare_command_osd_crush_remove(
10957 newcrush,
10958 id, ancestor,
10959 (ancestor < 0), unlink_only);
10960
10961 if (err == -ENOENT) {
10962 ss << "item " << id << " does not appear in that position";
10963 err = 0;
10964 break;
10965 }
10966 if (err == 0) {
10967 if (!unlink_only)
10968 pending_inc.new_crush_node_flags[id] = 0;
10969 ss << "removed item id " << id << " name '" << name << "' from crush map";
10970 getline(ss, rs);
10971 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10972 get_last_committed() + 1));
10973 return true;
10974 }
10975 } while (false);
10976
10977 } else if (prefix == "osd crush reweight-all") {
10978 CrushWrapper newcrush = _get_pending_crush();
10979
10980 newcrush.reweight(cct);
10981 pending_inc.crush.clear();
10982 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10983 ss << "reweighted crush hierarchy";
10984 getline(ss, rs);
10985 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10986 get_last_committed() + 1));
10987 return true;
10988 } else if (prefix == "osd crush reweight") {
10989 // osd crush reweight <name> <weight>
10990 CrushWrapper newcrush = _get_pending_crush();
10991
10992 string name;
10993 cmd_getval(cmdmap, "name", name);
10994 if (!newcrush.name_exists(name)) {
10995 err = -ENOENT;
10996 ss << "device '" << name << "' does not appear in the crush map";
10997 goto reply_no_propose;
10998 }
10999
11000 int id = newcrush.get_item_id(name);
11001 if (id < 0) {
11002 ss << "device '" << name << "' is not a leaf in the crush map";
11003 err = -EINVAL;
11004 goto reply_no_propose;
11005 }
11006 double w;
11007 if (!cmd_getval(cmdmap, "weight", w)) {
11008 ss << "unable to parse weight value '"
11009 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11010 err = -EINVAL;
11011 goto reply_no_propose;
11012 }
11013
11014 err = newcrush.adjust_item_weightf(cct, id, w,
11015 g_conf()->osd_crush_update_weight_set);
11016 if (err < 0)
11017 goto reply_no_propose;
11018 pending_inc.crush.clear();
11019 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11020 ss << "reweighted item id " << id << " name '" << name << "' to " << w
11021 << " in crush map";
11022 getline(ss, rs);
11023 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11024 get_last_committed() + 1));
11025 return true;
11026 } else if (prefix == "osd crush reweight-subtree") {
11027 // osd crush reweight-subtree <name> <weight>
11028 CrushWrapper newcrush = _get_pending_crush();
11029
11030 string name;
11031 cmd_getval(cmdmap, "name", name);
11032 if (!newcrush.name_exists(name)) {
11033 err = -ENOENT;
11034 ss << "device '" << name << "' does not appear in the crush map";
11035 goto reply_no_propose;
11036 }
11037
11038 int id = newcrush.get_item_id(name);
11039 if (id >= 0) {
11040 ss << "device '" << name << "' is not a subtree in the crush map";
11041 err = -EINVAL;
11042 goto reply_no_propose;
11043 }
11044 double w;
11045 if (!cmd_getval(cmdmap, "weight", w)) {
11046 ss << "unable to parse weight value '"
11047 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11048 err = -EINVAL;
11049 goto reply_no_propose;
11050 }
11051
11052 err = newcrush.adjust_subtree_weightf(cct, id, w,
11053 g_conf()->osd_crush_update_weight_set);
11054 if (err < 0)
11055 goto reply_no_propose;
11056 pending_inc.crush.clear();
11057 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11058 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
11059 << " in crush map";
11060 getline(ss, rs);
11061 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11062 get_last_committed() + 1));
11063 return true;
11064 } else if (prefix == "osd crush tunables") {
11065 CrushWrapper newcrush = _get_pending_crush();
11066
11067 err = 0;
11068 string profile;
11069 cmd_getval(cmdmap, "profile", profile);
11070 if (profile == "legacy" || profile == "argonaut") {
11071 newcrush.set_tunables_legacy();
11072 } else if (profile == "bobtail") {
11073 newcrush.set_tunables_bobtail();
11074 } else if (profile == "firefly") {
11075 newcrush.set_tunables_firefly();
11076 } else if (profile == "hammer") {
11077 newcrush.set_tunables_hammer();
11078 } else if (profile == "jewel") {
11079 newcrush.set_tunables_jewel();
11080 } else if (profile == "optimal") {
11081 newcrush.set_tunables_optimal();
11082 } else if (profile == "default") {
11083 newcrush.set_tunables_default();
11084 } else {
11085 ss << "unrecognized profile '" << profile << "'";
11086 err = -EINVAL;
11087 goto reply_no_propose;
11088 }
11089
11090 if (!validate_crush_against_features(&newcrush, ss)) {
11091 err = -EINVAL;
11092 goto reply_no_propose;
11093 }
11094
11095 pending_inc.crush.clear();
11096 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11097 ss << "adjusted tunables profile to " << profile;
11098 getline(ss, rs);
11099 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11100 get_last_committed() + 1));
11101 return true;
11102 } else if (prefix == "osd crush set-tunable") {
11103 CrushWrapper newcrush = _get_pending_crush();
11104
11105 err = 0;
11106 string tunable;
11107 cmd_getval(cmdmap, "tunable", tunable);
11108
11109 int64_t value = -1;
11110 if (!cmd_getval(cmdmap, "value", value)) {
11111 err = -EINVAL;
11112 ss << "failed to parse integer value "
11113 << cmd_vartype_stringify(cmdmap.at("value"));
11114 goto reply_no_propose;
11115 }
11116
11117 if (tunable == "straw_calc_version") {
11118 if (value != 0 && value != 1) {
11119 ss << "value must be 0 or 1; got " << value;
11120 err = -EINVAL;
11121 goto reply_no_propose;
11122 }
11123 newcrush.set_straw_calc_version(value);
11124 } else {
11125 ss << "unrecognized tunable '" << tunable << "'";
11126 err = -EINVAL;
11127 goto reply_no_propose;
11128 }
11129
11130 if (!validate_crush_against_features(&newcrush, ss)) {
11131 err = -EINVAL;
11132 goto reply_no_propose;
11133 }
11134
11135 pending_inc.crush.clear();
11136 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11137 ss << "adjusted tunable " << tunable << " to " << value;
11138 getline(ss, rs);
11139 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11140 get_last_committed() + 1));
11141 return true;
11142
11143 } else if (prefix == "osd crush rule create-simple") {
11144 string name, root, type, mode;
11145 cmd_getval(cmdmap, "name", name);
11146 cmd_getval(cmdmap, "root", root);
11147 cmd_getval(cmdmap, "type", type);
11148 cmd_getval(cmdmap, "mode", mode);
11149 if (mode == "")
11150 mode = "firstn";
11151
11152 if (osdmap.crush->rule_exists(name)) {
11153 // The name is uniquely associated with a ruleid and the rule it contains.
11154 // From the user's point of view, the rule is more meaningful.
11155 ss << "rule " << name << " already exists";
11156 err = 0;
11157 goto reply_no_propose;
11158 }
11159
11160 CrushWrapper newcrush = _get_pending_crush();
11161
11162 if (newcrush.rule_exists(name)) {
11163 // The name is uniquely associated with a ruleid and the rule it contains.
11164 // From the user's point of view, the rule is more meaningful.
11165 ss << "rule " << name << " already exists";
11166 err = 0;
11167 } else {
11168 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11169 pg_pool_t::TYPE_REPLICATED, &ss);
11170 if (ruleno < 0) {
11171 err = ruleno;
11172 goto reply_no_propose;
11173 }
11174
11175 pending_inc.crush.clear();
11176 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11177 }
11178 getline(ss, rs);
11179 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11180 get_last_committed() + 1));
11181 return true;
11182
11183 } else if (prefix == "osd crush rule create-replicated") {
11184 string name, root, type, device_class;
11185 cmd_getval(cmdmap, "name", name);
11186 cmd_getval(cmdmap, "root", root);
11187 cmd_getval(cmdmap, "type", type);
11188 cmd_getval(cmdmap, "class", device_class);
11189
11190 if (osdmap.crush->rule_exists(name)) {
11191 // The name is uniquely associated with a ruleid and the rule it contains.
11192 // From the user's point of view, the rule is more meaningful.
11193 ss << "rule " << name << " already exists";
11194 err = 0;
11195 goto reply_no_propose;
11196 }
11197
11198 CrushWrapper newcrush = _get_pending_crush();
11199
11200 if (newcrush.rule_exists(name)) {
11201 // The name is uniquely associated with a ruleid and the rule it contains.
11202 // From the user's point of view, the rule is more meaningful.
11203 ss << "rule " << name << " already exists";
11204 err = 0;
11205 } else {
11206 int ruleno = newcrush.add_simple_rule(
11207 name, root, type, device_class,
11208 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11209 if (ruleno < 0) {
11210 err = ruleno;
11211 goto reply_no_propose;
11212 }
11213
11214 pending_inc.crush.clear();
11215 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11216 }
11217 getline(ss, rs);
11218 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11219 get_last_committed() + 1));
11220 return true;
11221
11222 } else if (prefix == "osd erasure-code-profile rm") {
11223 string name;
11224 cmd_getval(cmdmap, "name", name);
11225
11226 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11227 goto wait;
11228
11229 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11230 err = -EBUSY;
11231 goto reply_no_propose;
11232 }
11233
11234 if (osdmap.has_erasure_code_profile(name) ||
11235 pending_inc.new_erasure_code_profiles.count(name)) {
11236 if (osdmap.has_erasure_code_profile(name)) {
11237 pending_inc.old_erasure_code_profiles.push_back(name);
11238 } else {
11239 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11240 pending_inc.new_erasure_code_profiles.erase(name);
11241 }
11242
11243 getline(ss, rs);
11244 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11245 get_last_committed() + 1));
11246 return true;
11247 } else {
11248 ss << "erasure-code-profile " << name << " does not exist";
11249 err = 0;
11250 goto reply_no_propose;
11251 }
11252
11253 } else if (prefix == "osd erasure-code-profile set") {
11254 string name;
11255 cmd_getval(cmdmap, "name", name);
11256 vector<string> profile;
11257 cmd_getval(cmdmap, "profile", profile);
11258
11259 bool force = false;
11260 cmd_getval(cmdmap, "force", force);
11261
11262 map<string,string> profile_map;
11263 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11264 if (err)
11265 goto reply_no_propose;
11266 if (auto found = profile_map.find("crush-failure-domain");
11267 found != profile_map.end()) {
11268 const auto& failure_domain = found->second;
11269 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11270 if (failure_domain_type < 0) {
11271 ss << "erasure-code-profile " << profile_map
11272 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11273 err = -EINVAL;
11274 goto reply_no_propose;
11275 }
11276 }
11277
11278 if (profile_map.find("plugin") == profile_map.end()) {
11279 ss << "erasure-code-profile " << profile_map
11280 << " must contain a plugin entry" << std::endl;
11281 err = -EINVAL;
11282 goto reply_no_propose;
11283 }
11284 string plugin = profile_map["plugin"];
11285
11286 if (pending_inc.has_erasure_code_profile(name)) {
11287 dout(20) << "erasure code profile " << name << " try again" << dendl;
11288 goto wait;
11289 } else {
11290 err = normalize_profile(name, profile_map, force, &ss);
11291 if (err)
11292 goto reply_no_propose;
11293
11294 if (osdmap.has_erasure_code_profile(name)) {
11295 ErasureCodeProfile existing_profile_map =
11296 osdmap.get_erasure_code_profile(name);
11297 err = normalize_profile(name, existing_profile_map, force, &ss);
11298 if (err)
11299 goto reply_no_propose;
11300
11301 if (existing_profile_map == profile_map) {
11302 err = 0;
11303 goto reply_no_propose;
11304 }
11305 if (!force) {
11306 err = -EPERM;
11307 ss << "will not override erasure code profile " << name
11308 << " because the existing profile "
11309 << existing_profile_map
11310 << " is different from the proposed profile "
11311 << profile_map;
11312 goto reply_no_propose;
11313 }
11314 }
11315
11316 dout(20) << "erasure code profile set " << name << "="
11317 << profile_map << dendl;
11318 pending_inc.set_erasure_code_profile(name, profile_map);
11319 }
11320
11321 getline(ss, rs);
11322 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11323 get_last_committed() + 1));
11324 return true;
11325
11326 } else if (prefix == "osd crush rule create-erasure") {
11327 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11328 if (err == -EAGAIN)
11329 goto wait;
11330 if (err)
11331 goto reply_no_propose;
11332 string name, poolstr;
11333 cmd_getval(cmdmap, "name", name);
11334 string profile;
11335 cmd_getval(cmdmap, "profile", profile);
11336 if (profile == "")
11337 profile = "default";
11338 if (profile == "default") {
11339 if (!osdmap.has_erasure_code_profile(profile)) {
11340 if (pending_inc.has_erasure_code_profile(profile)) {
11341 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11342 goto wait;
11343 }
11344
11345 map<string,string> profile_map;
11346 err = osdmap.get_erasure_code_profile_default(cct,
11347 profile_map,
11348 &ss);
11349 if (err)
11350 goto reply_no_propose;
11351 err = normalize_profile(name, profile_map, true, &ss);
11352 if (err)
11353 goto reply_no_propose;
11354 dout(20) << "erasure code profile set " << profile << "="
11355 << profile_map << dendl;
11356 pending_inc.set_erasure_code_profile(profile, profile_map);
11357 goto wait;
11358 }
11359 }
11360
11361 int rule;
11362 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11363 if (err < 0) {
11364 switch(err) {
11365 case -EEXIST: // return immediately
11366 ss << "rule " << name << " already exists";
11367 err = 0;
11368 goto reply_no_propose;
11369 case -EALREADY: // wait for pending to be proposed
11370 ss << "rule " << name << " already exists";
11371 err = 0;
11372 break;
11373 default: // non recoverable error
11374 goto reply_no_propose;
11375 }
11376 } else {
11377 ss << "created rule " << name << " at " << rule;
11378 }
11379
11380 getline(ss, rs);
11381 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11382 get_last_committed() + 1));
11383 return true;
11384
11385 } else if (prefix == "osd crush rule rm") {
11386 string name;
11387 cmd_getval(cmdmap, "name", name);
11388
11389 if (!osdmap.crush->rule_exists(name)) {
11390 ss << "rule " << name << " does not exist";
11391 err = 0;
11392 goto reply_no_propose;
11393 }
11394
11395 CrushWrapper newcrush = _get_pending_crush();
11396
11397 if (!newcrush.rule_exists(name)) {
11398 ss << "rule " << name << " does not exist";
11399 err = 0;
11400 } else {
11401 int ruleno = newcrush.get_rule_id(name);
11402 ceph_assert(ruleno >= 0);
11403
11404 // make sure it is not in use.
11405 // FIXME: this is ok in some situations, but let's not bother with that
11406 // complexity now.
11407 if (osdmap.crush_rule_in_use(ruleno)) {
11408 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11409 err = -EBUSY;
11410 goto reply_no_propose;
11411 }
11412
11413 err = newcrush.remove_rule(ruleno);
11414 if (err < 0) {
11415 goto reply_no_propose;
11416 }
11417
11418 pending_inc.crush.clear();
11419 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11420 }
11421 getline(ss, rs);
11422 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11423 get_last_committed() + 1));
11424 return true;
11425
11426 } else if (prefix == "osd crush rule rename") {
11427 string srcname;
11428 string dstname;
11429 cmd_getval(cmdmap, "srcname", srcname);
11430 cmd_getval(cmdmap, "dstname", dstname);
11431 if (srcname.empty() || dstname.empty()) {
11432 ss << "must specify both source rule name and destination rule name";
11433 err = -EINVAL;
11434 goto reply_no_propose;
11435 }
11436 if (srcname == dstname) {
11437 ss << "destination rule name is equal to source rule name";
11438 err = 0;
11439 goto reply_no_propose;
11440 }
11441
11442 CrushWrapper newcrush = _get_pending_crush();
11443 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11444 // srcname does not exist and dstname already exists
11445 // suppose this is a replay and return success
11446 // (so this command is idempotent)
11447 ss << "already renamed to '" << dstname << "'";
11448 err = 0;
11449 goto reply_no_propose;
11450 }
11451
11452 err = newcrush.rename_rule(srcname, dstname, &ss);
11453 if (err < 0) {
11454 // ss has reason for failure
11455 goto reply_no_propose;
11456 }
11457 pending_inc.crush.clear();
11458 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11459 getline(ss, rs);
11460 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11461 get_last_committed() + 1));
11462 return true;
11463
11464 } else if (prefix == "osd setmaxosd") {
11465 int64_t newmax;
11466 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11467 ss << "unable to parse 'newmax' value '"
11468 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11469 err = -EINVAL;
11470 goto reply_no_propose;
11471 }
11472
11473 if (newmax > g_conf()->mon_max_osd) {
11474 err = -ERANGE;
11475 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11476 << g_conf()->mon_max_osd << ")";
11477 goto reply_no_propose;
11478 }
11479
11480 // Don't allow shrinking OSD number as this will cause data loss
11481 // and may cause kernel crashes.
11482 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11483 if (newmax < osdmap.get_max_osd()) {
11484 // Check if the OSDs exist between current max and new value.
11485 // If there are any OSDs exist, then don't allow shrinking number
11486 // of OSDs.
11487 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11488 if (osdmap.exists(i)) {
11489 err = -EBUSY;
11490 ss << "cannot shrink max_osd to " << newmax
11491 << " because osd." << i << " (and possibly others) still in use";
11492 goto reply_no_propose;
11493 }
11494 }
11495 }
11496
11497 pending_inc.new_max_osd = newmax;
11498 ss << "set new max_osd = " << pending_inc.new_max_osd;
11499 getline(ss, rs);
11500 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11501 get_last_committed() + 1));
11502 return true;
11503
11504 } else if (prefix == "osd set-full-ratio" ||
11505 prefix == "osd set-backfillfull-ratio" ||
11506 prefix == "osd set-nearfull-ratio") {
11507 double n;
11508 if (!cmd_getval(cmdmap, "ratio", n)) {
11509 ss << "unable to parse 'ratio' value '"
11510 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11511 err = -EINVAL;
11512 goto reply_no_propose;
11513 }
11514 if (prefix == "osd set-full-ratio")
11515 pending_inc.new_full_ratio = n;
11516 else if (prefix == "osd set-backfillfull-ratio")
11517 pending_inc.new_backfillfull_ratio = n;
11518 else if (prefix == "osd set-nearfull-ratio")
11519 pending_inc.new_nearfull_ratio = n;
11520 ss << prefix << " " << n;
11521 getline(ss, rs);
11522 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11523 get_last_committed() + 1));
11524 return true;
11525 } else if (prefix == "osd set-require-min-compat-client") {
11526 string v;
11527 cmd_getval(cmdmap, "version", v);
11528 ceph_release_t vno = ceph_release_from_name(v);
11529 if (!vno) {
11530 ss << "version " << v << " is not recognized";
11531 err = -EINVAL;
11532 goto reply_no_propose;
11533 }
11534 OSDMap newmap;
11535 newmap.deepish_copy_from(osdmap);
11536 newmap.apply_incremental(pending_inc);
11537 newmap.require_min_compat_client = vno;
11538 auto mvno = newmap.get_min_compat_client();
11539 if (vno < mvno) {
11540 ss << "osdmap current utilizes features that require " << mvno
11541 << "; cannot set require_min_compat_client below that to " << vno;
11542 err = -EPERM;
11543 goto reply_no_propose;
11544 }
11545 bool sure = false;
11546 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11547 if (!sure) {
11548 FeatureMap m;
11549 mon.get_combined_feature_map(&m);
11550 uint64_t features = ceph_release_features(to_integer<int>(vno));
11551 bool first = true;
11552 bool ok = true;
11553 for (int type : {
11554 CEPH_ENTITY_TYPE_CLIENT,
11555 CEPH_ENTITY_TYPE_MDS,
11556 CEPH_ENTITY_TYPE_MGR }) {
11557 auto p = m.m.find(type);
11558 if (p == m.m.end()) {
11559 continue;
11560 }
11561 for (auto& q : p->second) {
11562 uint64_t missing = ~q.first & features;
11563 if (missing) {
11564 if (first) {
11565 ss << "cannot set require_min_compat_client to " << v << ": ";
11566 } else {
11567 ss << "; ";
11568 }
11569 first = false;
11570 ss << q.second << " connected " << ceph_entity_type_name(type)
11571 << "(s) look like " << ceph_release_name(
11572 ceph_release_from_features(q.first))
11573 << " (missing 0x" << std::hex << missing << std::dec << ")";
11574 ok = false;
11575 }
11576 }
11577 }
11578 if (!ok) {
11579 ss << "; add --yes-i-really-mean-it to do it anyway";
11580 err = -EPERM;
11581 goto reply_no_propose;
11582 }
11583 }
11584 ss << "set require_min_compat_client to " << vno;
11585 pending_inc.new_require_min_compat_client = vno;
11586 getline(ss, rs);
11587 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11588 get_last_committed() + 1));
11589 return true;
11590 } else if (prefix == "osd pause") {
11591 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11592
11593 } else if (prefix == "osd unpause") {
11594 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11595
11596 } else if (prefix == "osd set") {
11597 bool sure = false;
11598 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11599
11600 string key;
11601 cmd_getval(cmdmap, "key", key);
11602 if (key == "pause")
11603 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11604 else if (key == "noup")
11605 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11606 else if (key == "nodown")
11607 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11608 else if (key == "noout")
11609 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11610 else if (key == "noin")
11611 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11612 else if (key == "nobackfill")
11613 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11614 else if (key == "norebalance")
11615 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11616 else if (key == "norecover")
11617 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11618 else if (key == "noscrub")
11619 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11620 else if (key == "nodeep-scrub")
11621 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11622 else if (key == "notieragent")
11623 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11624 else if (key == "nosnaptrim")
11625 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11626 else if (key == "pglog_hardlimit") {
11627 if (!osdmap.get_num_up_osds() && !sure) {
11628 ss << "Not advisable to continue since no OSDs are up. Pass "
11629 << "--yes-i-really-mean-it if you really wish to continue.";
11630 err = -EPERM;
11631 goto reply_no_propose;
11632 }
11633 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11634 // we are reusing a jewel feature bit that was retired in luminous.
11635 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11636 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11637 || sure)) {
11638 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11639 } else {
11640 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11641 err = -EPERM;
11642 goto reply_no_propose;
11643 }
11644 } else if (key == "noautoscale") {
11645 return prepare_set_flag(op, CEPH_OSDMAP_NOAUTOSCALE);
11646 } else {
11647 ss << "unrecognized flag '" << key << "'";
11648 err = -EINVAL;
11649 }
11650
11651 } else if (prefix == "osd unset") {
11652 string key;
11653 cmd_getval(cmdmap, "key", key);
11654 if (key == "pause")
11655 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11656 else if (key == "noup")
11657 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11658 else if (key == "nodown")
11659 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11660 else if (key == "noout")
11661 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11662 else if (key == "noin")
11663 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11664 else if (key == "nobackfill")
11665 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11666 else if (key == "norebalance")
11667 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11668 else if (key == "norecover")
11669 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11670 else if (key == "noscrub")
11671 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11672 else if (key == "nodeep-scrub")
11673 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11674 else if (key == "notieragent")
11675 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11676 else if (key == "nosnaptrim")
11677 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11678 else if (key == "noautoscale")
11679 return prepare_unset_flag(op, CEPH_OSDMAP_NOAUTOSCALE);
11680 else {
11681 ss << "unrecognized flag '" << key << "'";
11682 err = -EINVAL;
11683 }
11684
  } else if (prefix == "osd require-osd-release") {
    // Raise the minimum OSD release the cluster requires.  Gated on: a
    // recognized release name, mons and up OSDs advertising the matching
    // feature bits, and (unless --yes-i-really-mean-it) at least one OSD
    // being up.  Lowering the value is never allowed (checked below).
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply_no_propose;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply_no_propose;
    }
    if (osdmap.require_osd_release < ceph_release_t::pacific && !sure) {
      ss << "Not advisable to continue since current 'require_osd_release' "
         << "refers to a very old Ceph release. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply_no_propose;
    }
    if (!osdmap.get_num_up_osds() && !sure) {
      // With no OSDs up we cannot verify OSD feature bits below.
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply_no_propose;
    }
    // Per-release gate: all mons must have the release's mon feature, and
    // (unless --yes-i-really-mean-it) all up OSDs its server feature bit.
    if (rel == ceph_release_t::pacific) {
      if (!mon.monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_PACIFIC)) {
        ss << "not all mons are pacific";
        err = -EPERM;
        goto reply_no_propose;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
           && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
        err = -EPERM;
        goto reply_no_propose;
      }
    } else if (rel == ceph_release_t::quincy) {
      if (!mon.monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_QUINCY)) {
        ss << "not all mons are quincy";
        err = -EPERM;
        goto reply_no_propose;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
           && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
        err = -EPERM;
        goto reply_no_propose;
      }
    } else if (rel == ceph_release_t::reef) {
      if (!mon.monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_REEF)) {
        ss << "not all mons are reef";
        err = -EPERM;
        goto reply_no_propose;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_REEF))
           && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
        err = -EPERM;
        goto reply_no_propose;
      }
    } else {
      // Only the releases enumerated above may be targeted.
      ss << "not supported for this release";
      err = -EPERM;
      goto reply_no_propose;
    }
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply_no_propose;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
11765 } else if (prefix == "osd down" ||
11766 prefix == "osd out" ||
11767 prefix == "osd in" ||
11768 prefix == "osd rm" ||
11769 prefix == "osd stop") {
11770
11771 bool any = false;
11772 bool stop = false;
11773 bool verbose = true;
11774 bool definitely_dead = false;
11775
11776 vector<string> idvec;
11777 cmd_getval(cmdmap, "ids", idvec);
11778 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11779 derr << "definitely_dead " << (int)definitely_dead << dendl;
11780 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11781 set<int> osds;
11782
11783 // wildcard?
11784 if (j == 0 &&
11785 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11786 if (prefix == "osd in") {
11787 // touch out osds only
11788 osdmap.get_out_existing_osds(osds);
11789 } else {
11790 osdmap.get_all_osds(osds);
11791 }
11792 stop = true;
11793 verbose = false; // so the output is less noisy.
11794 } else {
11795 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11796 if (osd < 0) {
11797 ss << "invalid osd id" << osd;
11798 err = -EINVAL;
11799 continue;
11800 } else if (!osdmap.exists(osd)) {
11801 ss << "osd." << osd << " does not exist. ";
11802 continue;
11803 }
11804
11805 osds.insert(osd);
11806 }
11807
11808 for (auto &osd : osds) {
11809 if (prefix == "osd down") {
11810 if (osdmap.is_down(osd)) {
11811 if (verbose)
11812 ss << "osd." << osd << " is already down. ";
11813 } else {
11814 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11815 ss << "marked down osd." << osd << ". ";
11816 any = true;
11817 }
11818 if (definitely_dead) {
11819 if (!pending_inc.new_xinfo.count(osd)) {
11820 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11821 }
11822 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11823 any = true;
11824 }
11825 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11826 }
11827 } else if (prefix == "osd out") {
11828 if (osdmap.is_out(osd)) {
11829 if (verbose)
11830 ss << "osd." << osd << " is already out. ";
11831 } else {
11832 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11833 if (osdmap.osd_weight[osd]) {
11834 if (pending_inc.new_xinfo.count(osd) == 0) {
11835 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11836 }
11837 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11838 }
11839 ss << "marked out osd." << osd << ". ";
11840 std::ostringstream msg;
11841 msg << "Client " << op->get_session()->entity_name
11842 << " marked osd." << osd << " out";
11843 if (osdmap.is_up(osd)) {
11844 msg << ", while it was still marked up";
11845 } else {
11846 auto period = ceph_clock_now() - down_pending_out[osd];
11847 msg << ", after it was down for " << int(period.sec())
11848 << " seconds";
11849 }
11850
11851 mon.clog->info() << msg.str();
11852 any = true;
11853 }
11854 } else if (prefix == "osd in") {
11855 if (osdmap.is_in(osd)) {
11856 if (verbose)
11857 ss << "osd." << osd << " is already in. ";
11858 } else {
11859 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11860 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11861 if (pending_inc.new_xinfo.count(osd) == 0) {
11862 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11863 }
11864 pending_inc.new_xinfo[osd].old_weight = 0;
11865 } else {
11866 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11867 }
11868 ss << "marked in osd." << osd << ". ";
11869 any = true;
11870 }
11871 } else if (prefix == "osd rm") {
11872 err = prepare_command_osd_remove(osd);
11873
11874 if (err == -EBUSY) {
11875 if (any)
11876 ss << ", ";
11877 ss << "osd." << osd << " is still up; must be down before removal. ";
11878 } else {
11879 ceph_assert(err == 0);
11880 if (any) {
11881 ss << ", osd." << osd;
11882 } else {
11883 ss << "removed osd." << osd;
11884 }
11885 any = true;
11886 }
11887 } else if (prefix == "osd stop") {
11888 if (osdmap.is_stop(osd)) {
11889 if (verbose)
11890 ss << "osd." << osd << " is already stopped. ";
11891 } else if (osdmap.is_down(osd)) {
11892 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11893 ss << "stop down osd." << osd << ". ";
11894 any = true;
11895 } else {
11896 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11897 ss << "stop osd." << osd << ". ";
11898 any = true;
11899 }
11900 }
11901 }
11902 }
11903 if (any) {
11904 getline(ss, rs);
11905 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11906 get_last_committed() + 1));
11907 return true;
11908 }
11909 } else if (prefix == "osd set-group" ||
11910 prefix == "osd unset-group" ||
11911 prefix == "osd add-noup" ||
11912 prefix == "osd add-nodown" ||
11913 prefix == "osd add-noin" ||
11914 prefix == "osd add-noout" ||
11915 prefix == "osd rm-noup" ||
11916 prefix == "osd rm-nodown" ||
11917 prefix == "osd rm-noin" ||
11918 prefix == "osd rm-noout") {
11919 bool do_set = prefix == "osd set-group" ||
11920 prefix.find("add") != string::npos;
11921 string flag_str;
11922 unsigned flags = 0;
11923 vector<string> who;
11924 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11925 cmd_getval(cmdmap, "flags", flag_str);
11926 cmd_getval(cmdmap, "who", who);
11927 vector<string> raw_flags;
11928 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11929 for (auto& f : raw_flags) {
11930 if (f == "noup")
11931 flags |= CEPH_OSD_NOUP;
11932 else if (f == "nodown")
11933 flags |= CEPH_OSD_NODOWN;
11934 else if (f == "noin")
11935 flags |= CEPH_OSD_NOIN;
11936 else if (f == "noout")
11937 flags |= CEPH_OSD_NOOUT;
11938 else {
11939 ss << "unrecognized flag '" << f << "', must be one of "
11940 << "{noup,nodown,noin,noout}";
11941 err = -EINVAL;
11942 goto reply_no_propose;
11943 }
11944 }
11945 } else {
11946 cmd_getval(cmdmap, "ids", who);
11947 if (prefix.find("noup") != string::npos)
11948 flags = CEPH_OSD_NOUP;
11949 else if (prefix.find("nodown") != string::npos)
11950 flags = CEPH_OSD_NODOWN;
11951 else if (prefix.find("noin") != string::npos)
11952 flags = CEPH_OSD_NOIN;
11953 else if (prefix.find("noout") != string::npos)
11954 flags = CEPH_OSD_NOOUT;
11955 else
11956 ceph_assert(0 == "Unreachable!");
11957 }
11958 if (flags == 0) {
11959 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11960 err = -EINVAL;
11961 goto reply_no_propose;
11962 }
11963 if (who.empty()) {
11964 ss << "must specify at least one or more targets to set/unset";
11965 err = -EINVAL;
11966 goto reply_no_propose;
11967 }
11968 set<int> osds;
11969 set<int> crush_nodes;
11970 set<int> device_classes;
11971 for (auto& w : who) {
11972 if (w == "any" || w == "all" || w == "*") {
11973 osdmap.get_all_osds(osds);
11974 break;
11975 }
11976 std::stringstream ts;
11977 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11978 osds.insert(osd);
11979 } else if (osdmap.crush->name_exists(w)) {
11980 crush_nodes.insert(osdmap.crush->get_item_id(w));
11981 } else if (osdmap.crush->class_exists(w)) {
11982 device_classes.insert(osdmap.crush->get_class_id(w));
11983 } else {
11984 ss << "unable to parse osd id or crush node or device class: "
11985 << "\"" << w << "\". ";
11986 }
11987 }
11988 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11989 // ss has reason for failure
11990 err = -EINVAL;
11991 goto reply_no_propose;
11992 }
11993 bool any = false;
11994 for (auto osd : osds) {
11995 if (!osdmap.exists(osd)) {
11996 ss << "osd." << osd << " does not exist. ";
11997 continue;
11998 }
11999 if (do_set) {
12000 if (flags & CEPH_OSD_NOUP) {
12001 any |= osdmap.is_noup_by_osd(osd) ?
12002 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
12003 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
12004 }
12005 if (flags & CEPH_OSD_NODOWN) {
12006 any |= osdmap.is_nodown_by_osd(osd) ?
12007 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
12008 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
12009 }
12010 if (flags & CEPH_OSD_NOIN) {
12011 any |= osdmap.is_noin_by_osd(osd) ?
12012 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
12013 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
12014 }
12015 if (flags & CEPH_OSD_NOOUT) {
12016 any |= osdmap.is_noout_by_osd(osd) ?
12017 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
12018 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
12019 }
12020 } else {
12021 if (flags & CEPH_OSD_NOUP) {
12022 any |= osdmap.is_noup_by_osd(osd) ?
12023 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
12024 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
12025 }
12026 if (flags & CEPH_OSD_NODOWN) {
12027 any |= osdmap.is_nodown_by_osd(osd) ?
12028 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
12029 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
12030 }
12031 if (flags & CEPH_OSD_NOIN) {
12032 any |= osdmap.is_noin_by_osd(osd) ?
12033 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
12034 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
12035 }
12036 if (flags & CEPH_OSD_NOOUT) {
12037 any |= osdmap.is_noout_by_osd(osd) ?
12038 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
12039 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
12040 }
12041 }
12042 }
12043 for (auto& id : crush_nodes) {
12044 auto old_flags = osdmap.get_crush_node_flags(id);
12045 auto& pending_flags = pending_inc.new_crush_node_flags[id];
12046 pending_flags |= old_flags; // adopt existing flags first!
12047 if (do_set) {
12048 pending_flags |= flags;
12049 } else {
12050 pending_flags &= ~flags;
12051 }
12052 any = true;
12053 }
12054 for (auto& id : device_classes) {
12055 auto old_flags = osdmap.get_device_class_flags(id);
12056 auto& pending_flags = pending_inc.new_device_class_flags[id];
12057 pending_flags |= old_flags;
12058 if (do_set) {
12059 pending_flags |= flags;
12060 } else {
12061 pending_flags &= ~flags;
12062 }
12063 any = true;
12064 }
12065 if (any) {
12066 getline(ss, rs);
12067 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
12068 get_last_committed() + 1));
12069 return true;
12070 }
  } else if (prefix == "osd pg-temp") {
    // Manually set — or, with an empty id list, clear — the pg_temp mapping
    // for a single pg.
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;
    if (pending_inc.new_pg_temp.count(pgid)) {
      // A change for this pg is already queued; retry after it commits.
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      goto wait;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty()) {
      // Empty vector means "remove the pg_temp entry".
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply_no_propose;
      }
      new_pg_temp.push_back(osd);
    }

    // The mapping must satisfy the pool's min_size..size bounds.
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply_no_propose;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply_no_propose;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp" ||
             prefix == "osd rm-primary-temp") {
    // Set (or clear, via osd = -1) a temporary primary override for a pg.
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;

    int64_t osd;
    if (prefix == "osd primary-temp") {
      if (!cmd_getval(cmdmap, "id", osd)) {
        ss << "unable to parse 'id' value '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "'";
        err = -EINVAL;
        goto reply_no_propose;
      }
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply_no_propose;
      }
    }
    else if (prefix == "osd rm-primary-temp") {
      // -1 removes the primary_temp mapping.
      osd = -1;
    }
    else {
      ceph_assert(0 == "Unreachable!");
    }

    // primary_temp requires clients new enough to understand it (firefly+).
    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply_no_propose;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a pg to re-peer by perturbing its pg_temp mapping.
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply_no_propose;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change.  Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        // Candidate must exist, be up, and not already be the primary.
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply_no_propose;
      }
    }
    goto update;
  } else if (prefix == "osd pg-upmap" ||
             prefix == "osd rm-pg-upmap" ||
             prefix == "osd pg-upmap-items" ||
             prefix == "osd rm-pg-upmap-items" ||
             prefix == "osd pg-upmap-primary" ||
             prefix == "osd rm-pg-upmap-primary") {
    // Manage explicit per-pg mapping overrides:
    //   pg-upmap(-items)  — remap a pg wholesale or via from->to osd pairs
    //   pg-upmap-primary  — override which acting-set member is primary
    // Each flavor is gated on a minimum client compat release and on the
    // cluster-wide OSDMAP_PG_UPMAP feature.
    enum {
      OP_PG_UPMAP,
      OP_RM_PG_UPMAP,
      OP_PG_UPMAP_ITEMS,
      OP_RM_PG_UPMAP_ITEMS,
      OP_PG_UPMAP_PRIMARY,
      OP_RM_PG_UPMAP_PRIMARY,
    } upmap_option;

    if (prefix == "osd pg-upmap") {
      upmap_option = OP_PG_UPMAP;
    } else if (prefix == "osd rm-pg-upmap") {
      upmap_option = OP_RM_PG_UPMAP;
    } else if (prefix == "osd pg-upmap-items") {
      upmap_option = OP_PG_UPMAP_ITEMS;
    } else if (prefix == "osd rm-pg-upmap-items") {
      upmap_option = OP_RM_PG_UPMAP_ITEMS;
    } else if (prefix == "osd pg-upmap-primary") {
      upmap_option = OP_PG_UPMAP_PRIMARY;
    } else if (prefix == "osd rm-pg-upmap-primary") {
      upmap_option = OP_RM_PG_UPMAP_PRIMARY;
    } else {
      ceph_abort_msg("invalid upmap option");
    }

    // Pick the compat floor: plain upmap needs luminous clients,
    // pg-upmap-primary needs reef.
    ceph_release_t min_release = ceph_release_t::unknown;
    string feature_name = "unknown";
    switch (upmap_option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP: // fall through
    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS:
      min_release = ceph_release_t::luminous;
      feature_name = "pg-upmap";
      break;

    case OP_PG_UPMAP_PRIMARY: // fall through
    case OP_RM_PG_UPMAP_PRIMARY:
      min_release = ceph_release_t::reef;
      feature_name = "pg-upmap-primary";
      break;

    default:
      ceph_abort_msg("invalid upmap option");
    }
    uint64_t min_feature = CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
    string min_release_name = ceph_release_name(static_cast<int>(min_release));

    if (osdmap.require_min_compat_client < min_release) {
      ss << "min_compat_client "
         << osdmap.require_min_compat_client
         << " < " << min_release_name << ", which is required for " << feature_name << ". "
         << "Try 'ceph osd set-require-min-compat-client " << min_release_name << "' "
         << "before using the new interface";
      err = -EPERM;
      goto reply_no_propose;
    }

    //TODO: Should I add feature and test for upmap-primary?
    err = check_cluster_features(min_feature, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply_no_propose;
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;
    if (pending_inc.old_pools.count(pgid.pool())) {
      ss << "pool of " << pgid << " is pending removal";
      err = -ENOENT;
      getline(ss, rs);
      wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
      return true;
    }

    // check pending upmap changes
    switch (upmap_option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP:
      if (pending_inc.new_pg_upmap.count(pgid) ||
          pending_inc.old_pg_upmap.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        goto wait;
      }
      break;

    case OP_PG_UPMAP_PRIMARY: // fall through
    case OP_RM_PG_UPMAP_PRIMARY:
      {
        // pg-upmap-primary only makes sense for replicated pools.
        const pg_pool_t *pt = osdmap.get_pg_pool(pgid.pool());
        if (! pt->is_replicated()) {
          ss << "pg-upmap-primary is only supported for replicated pools";
          err = -EINVAL;
          goto reply_no_propose;
        }
      }
      // fall through
    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS: // fall through
      if (pending_inc.new_pg_upmap_items.count(pgid) ||
          pending_inc.old_pg_upmap_items.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        goto wait;
      }
      break;

    default:
      ceph_abort_msg("invalid upmap option");
    }

    // Execute the requested operation.
    switch (upmap_option) {
    case OP_PG_UPMAP:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply_no_propose;
        }

        // The new mapping must respect the pool's min_size..size bounds.
        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
        if ((int)id_vec.size() < pool_min_size) {
          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
             << pool_min_size << ")";
          err = -EINVAL;
          goto reply_no_propose;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)id_vec.size() > pool_size) {
          ss << "num of osds (" << id_vec.size() <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply_no_propose;
        }

        vector<int32_t> new_pg_upmap;
        for (auto osd : id_vec) {
          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
            ss << "osd." << osd << " does not exist";
            err = -ENOENT;
            goto reply_no_propose;
          }
          // Duplicate osd ids are reported and skipped, not fatal.
          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
          if (it != new_pg_upmap.end()) {
            ss << "osd." << osd << " already exists, ";
            continue;
          }
          new_pg_upmap.push_back(osd);
        }

        if (new_pg_upmap.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply_no_propose;
        }

        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
      }
      break;

    case OP_RM_PG_UPMAP:
      {
        pending_inc.old_pg_upmap.insert(pgid);
        ss << "clear " << pgid << " pg_upmap mapping";
      }
      break;

    case OP_PG_UPMAP_ITEMS:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply_no_propose;
        }

        // ids arrive as flat from,to,from,to... so the count must be even.
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply_no_propose;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply_no_propose;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        ostringstream items;
        items << "[";
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
          int from = *p++;
          int to = *p;
          if (from == to) {
            // Self-mapping is pointless; report and skip the pair.
            ss << "from osd." << from << " == to osd." << to << ", ";
            continue;
          }
          if (!osdmap.exists(from)) {
            ss << "osd." << from << " does not exist";
            err = -ENOENT;
            goto reply_no_propose;
          }
          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
            ss << "osd." << to << " does not exist";
            err = -ENOENT;
            goto reply_no_propose;
          }
          pair<int32_t,int32_t> entry = make_pair(from, to);
          auto it = std::find(new_pg_upmap_items.begin(),
            new_pg_upmap_items.end(), entry);
          if (it != new_pg_upmap_items.end()) {
            ss << "osd." << from << " -> osd." << to << " already exists, ";
            continue;
          }
          new_pg_upmap_items.push_back(entry);
          items << from << "->" << to << ",";
        }
        string out(items.str());
        out.resize(out.size() - 1); // drop last ','
        out += "]";

        if (new_pg_upmap_items.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply_no_propose;
        }

        pending_inc.new_pg_upmap_items[pgid] =
          mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
      }
      break;

    case OP_RM_PG_UPMAP_ITEMS:
      {
        pending_inc.old_pg_upmap_items.insert(pgid);
        ss << "clear " << pgid << " pg_upmap_items mapping";
      }
      break;

    case OP_PG_UPMAP_PRIMARY:
      {
        int64_t id;
        if (!cmd_getval(cmdmap, "id", id)) {
          ss << "invalid osd id value '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply_no_propose;
        }
        if (id != CRUSH_ITEM_NONE && !osdmap.exists(id)) {
          ss << "osd." << id << " does not exist";
          err = -ENOENT;
          goto reply_no_propose;
        }
        vector<int> acting;
        int primary;
        osdmap.pg_to_acting_osds(pgid, &acting, &primary);
        if (id == primary) {
          ss << "osd." << id << " is already primary for pg " << pgid;
          err = -EINVAL;
          goto reply_no_propose;
        }
        // The requested osd must be a non-primary member of the acting set.
        int found_idx = 0;
        for (int i = 1 ; i < (int)acting.size(); i++) {  // skip 0 on purpose
          if (acting[i] == id) {
            found_idx = i;
            break;
          }
        }
        if (found_idx == 0) {
          ss << "osd." << id << " is not in acting set for pg " << pgid;
          err = -EINVAL;
          goto reply_no_propose;
        }
        // Swap the requested osd into slot 0 and verify the result is a
        // legal mapping under the pool's crush rule before committing.
        vector<int> new_acting(acting);
        new_acting[found_idx] = new_acting[0];
        new_acting[0] = id;
        int pool_size = osdmap.get_pg_pool_size(pgid);
        if (osdmap.crush->verify_upmap(cct, osdmap.get_pg_pool_crush_rule(pgid),
            pool_size, new_acting) >= 0) {
          ss << "change primary for pg " << pgid << " to osd." << id;
        }
        else {
          ss << "can't change primary for pg " << pgid << " to osd." << id
             << " - illegal pg after the change";
          err = -EINVAL;
          goto reply_no_propose;
        }
        pending_inc.new_pg_upmap_primary[pgid] = id;
        //TO-REMOVE:
        ldout(cct, 20) << "pg " << pgid << ": set pg_upmap_primary to " << id << dendl;
      }
      break;

    case OP_RM_PG_UPMAP_PRIMARY:
      {
        pending_inc.old_pg_upmap_primary.insert(pgid);
        ss << "clear " << pgid << " pg_upmap_primary mapping";
      }
      break;

    default:
      ceph_abort_msg("invalid upmap option");
    }

    goto update;
12519 } else if (prefix == "osd primary-affinity") {
12520 int64_t id;
12521 if (!cmd_getval(cmdmap, "id", id)) {
12522 ss << "invalid osd id value '"
12523 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12524 err = -EINVAL;
12525 goto reply_no_propose;
12526 }
12527 double w;
12528 if (!cmd_getval(cmdmap, "weight", w)) {
12529 ss << "unable to parse 'weight' value '"
12530 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12531 err = -EINVAL;
12532 goto reply_no_propose;
12533 }
12534 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12535 if (ww < 0L) {
12536 ss << "weight must be >= 0";
12537 err = -EINVAL;
12538 goto reply_no_propose;
12539 }
12540 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12541 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12542 ss << "require_min_compat_client "
12543 << osdmap.require_min_compat_client
12544 << " < firefly, which is required for primary-affinity";
12545 err = -EPERM;
12546 goto reply_no_propose;
12547 }
12548 if (osdmap.exists(id)) {
12549 pending_inc.new_primary_affinity[id] = ww;
12550 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12551 getline(ss, rs);
12552 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12553 get_last_committed() + 1));
12554 return true;
12555 } else {
12556 ss << "osd." << id << " does not exist";
12557 err = -ENOENT;
12558 goto reply_no_propose;
12559 }
  } else if (prefix == "osd reweight") {
    // Set an osd's in/out weight, given as a float and stored as a
    // fixed-point fraction of CEPH_OSD_IN.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply_no_propose;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply_no_propose;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply_no_propose;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply_no_propose;
    }
  } else if (prefix == "osd reweightn") {
    // Batch form of "osd reweight": parse an id->weight map and apply all
    // entries in a single proposal.
    map<int32_t, uint32_t> weights;
    err = parse_reweights(cct, cmdmap, osdmap, &weights);
    if (err) {
      ss << "unable to parse 'weights' value '"
         << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
      goto reply_no_propose;
    }
    pending_inc.new_weight.insert(weights.begin(), weights.end());
    wait_for_finished_proposal(
      op,
      new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
    return true;
  } else if (prefix == "osd lost") {
    // Declare a down osd permanently lost, recording the epoch at which it
    // went down.  Destructive, so it demands --yes-i-really-mean-it.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply_no_propose;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "are you SURE? this might mean real, permanent data loss. pass "
         "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply_no_propose;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply_no_propose;
    } else if (!osdmap.is_down(id)) {
      // An up osd cannot be declared lost.
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply_no_propose;
    } else {
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    }
12638
12639 } else if (prefix == "osd destroy-actual" ||
12640 prefix == "osd purge-actual" ||
12641 prefix == "osd purge-new") {
12642 /* Destroying an OSD means that we don't expect to further make use of
12643 * the OSDs data (which may even become unreadable after this operation),
12644 * and that we are okay with scrubbing all its cephx keys and config-key
12645 * data (which may include lockbox keys, thus rendering the osd's data
12646 * unreadable).
12647 *
12648 * The OSD will not be removed. Instead, we will mark it as destroyed,
12649 * such that a subsequent call to `create` will not reuse the osd id.
12650 * This will play into being able to recreate the OSD, at the same
12651 * crush location, with minimal data movement.
12652 */
12653
12654 // make sure authmon is writeable.
12655 if (!mon.authmon()->is_writeable()) {
12656 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12657 << "osd destroy" << dendl;
12658 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12659 return false;
12660 }
12661
12662 int64_t id;
12663 if (!cmd_getval(cmdmap, "id", id)) {
12664 auto p = cmdmap.find("id");
12665 if (p == cmdmap.end()) {
12666 ss << "no osd id specified";
12667 } else {
12668 ss << "unable to parse osd id value '"
12669 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12670 }
12671 err = -EINVAL;
12672 goto reply_no_propose;
12673 }
12674
12675 bool is_destroy = (prefix == "osd destroy-actual");
12676 if (!is_destroy) {
12677 ceph_assert("osd purge-actual" == prefix ||
12678 "osd purge-new" == prefix);
12679 }
12680
12681 bool sure = false;
12682 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12683 if (!sure) {
12684 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12685 << "This will mean real, permanent data loss, as well "
12686 << "as deletion of cephx and lockbox keys. "
12687 << "Pass --yes-i-really-mean-it if you really do.";
12688 err = -EPERM;
12689 goto reply_no_propose;
12690 } else if (!osdmap.exists(id)) {
12691 ss << "osd." << id << " does not exist";
12692 err = 0; // idempotent
12693 goto reply_no_propose;
12694 } else if (osdmap.is_up(id)) {
12695 ss << "osd." << id << " is not `down`.";
12696 err = -EBUSY;
12697 goto reply_no_propose;
12698 } else if (is_destroy && osdmap.is_destroyed(id)) {
12699 ss << "destroyed osd." << id;
12700 err = 0;
12701 goto reply_no_propose;
12702 }
12703
12704 if (prefix == "osd purge-new" &&
12705 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12706 ss << "osd." << id << " is not new";
12707 err = -EPERM;
12708 goto reply_no_propose;
12709 }
12710
12711 bool goto_reply = false;
12712
12713 paxos.plug();
12714 if (is_destroy) {
12715 err = prepare_command_osd_destroy(id, ss);
12716 // we checked above that it should exist.
12717 ceph_assert(err != -ENOENT);
12718 } else {
12719 err = prepare_command_osd_purge(id, ss);
12720 if (err == -ENOENT) {
12721 err = 0;
12722 ss << "osd." << id << " does not exist.";
12723 goto_reply = true;
12724 }
12725 }
12726 paxos.unplug();
12727
12728 if (err < 0 || goto_reply) {
12729 goto reply_no_propose;
12730 }
12731
12732 if (is_destroy) {
12733 ss << "destroyed osd." << id;
12734 } else {
12735 ss << "purged osd." << id;
12736 }
12737
12738 getline(ss, rs);
12739 wait_for_finished_proposal(op,
12740 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12741 force_immediate_propose();
12742 return true;
12743
12744 } else if (prefix == "osd new") {
12745
12746 // make sure authmon is writeable.
12747 if (!mon.authmon()->is_writeable()) {
12748 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12749 << "osd new" << dendl;
12750 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12751 return false;
12752 }
12753
12754 // make sure kvmon is writeable.
12755 if (!mon.kvmon()->is_writeable()) {
12756 dout(10) << __func__ << " waiting for kv mon to be writeable for "
12757 << "osd new" << dendl;
12758 mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12759 return false;
12760 }
12761
12762 map<string,string> param_map;
12763
12764 bufferlist bl = m->get_data();
12765 string param_json = bl.to_str();
12766 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12767
12768 err = get_json_str_map(param_json, ss, &param_map);
12769 if (err < 0)
12770 goto reply_no_propose;
12771
12772 dout(20) << __func__ << " osd new params " << param_map << dendl;
12773
12774 paxos.plug();
12775 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12776 paxos.unplug();
12777
12778 if (err < 0) {
12779 goto reply_no_propose;
12780 }
12781
12782 if (f) {
12783 f->flush(rdata);
12784 } else {
12785 rdata.append(ss);
12786 }
12787
12788 if (err == EEXIST) {
12789 // idempotent operation
12790 err = 0;
12791 goto reply_no_propose;
12792 }
12793
12794 wait_for_finished_proposal(op,
12795 new Monitor::C_Command(mon, op, 0, rs, rdata,
12796 get_last_committed() + 1));
12797 force_immediate_propose();
12798 return true;
12799
12800 } else if (prefix == "osd create") {
12801
12802 // optional id provided?
12803 int64_t id = -1, cmd_id = -1;
12804 if (cmd_getval(cmdmap, "id", cmd_id)) {
12805 if (cmd_id < 0) {
12806 ss << "invalid osd id value '" << cmd_id << "'";
12807 err = -EINVAL;
12808 goto reply_no_propose;
12809 }
12810 dout(10) << " osd create got id " << cmd_id << dendl;
12811 }
12812
12813 uuid_d uuid;
12814 string uuidstr;
12815 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12816 if (!uuid.parse(uuidstr.c_str())) {
12817 ss << "invalid uuid value '" << uuidstr << "'";
12818 err = -EINVAL;
12819 goto reply_no_propose;
12820 }
12821 // we only care about the id if we also have the uuid, to
12822 // ensure the operation's idempotency.
12823 id = cmd_id;
12824 }
12825
12826 int32_t new_id = -1;
12827 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12828 if (err < 0) {
12829 if (err == -EAGAIN) {
12830 goto wait;
12831 }
12832 // a check has failed; reply to the user.
12833 goto reply_no_propose;
12834
12835 } else if (err == EEXIST) {
12836 // this is an idempotent operation; we can go ahead and reply.
12837 if (f) {
12838 f->open_object_section("created_osd");
12839 f->dump_int("osdid", new_id);
12840 f->close_section();
12841 f->flush(rdata);
12842 } else {
12843 ss << new_id;
12844 rdata.append(ss);
12845 }
12846 err = 0;
12847 goto reply_no_propose;
12848 }
12849
12850 string empty_device_class;
12851 do_osd_create(id, uuid, empty_device_class, &new_id);
12852
12853 if (f) {
12854 f->open_object_section("created_osd");
12855 f->dump_int("osdid", new_id);
12856 f->close_section();
12857 f->flush(rdata);
12858 } else {
12859 ss << new_id;
12860 rdata.append(ss);
12861 }
12862 wait_for_finished_proposal(op,
12863 new Monitor::C_Command(mon, op, 0, rs, rdata,
12864 get_last_committed() + 1));
12865 return true;
12866
12867 } else if (prefix == "osd blocklist clear" ||
12868 prefix == "osd blacklist clear") {
12869 pending_inc.new_blocklist.clear();
12870 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12871 std::list<std::pair<entity_addr_t,utime_t > > range_b;
12872 osdmap.get_blocklist(&blocklist, &range_b);
12873 for (const auto &entry : blocklist) {
12874 pending_inc.old_blocklist.push_back(entry.first);
12875 }
12876 for (const auto &entry : range_b) {
12877 pending_inc.old_range_blocklist.push_back(entry.first);
12878 }
12879 ss << " removed all blocklist entries";
12880 getline(ss, rs);
12881 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12882 get_last_committed() + 1));
12883 return true;
12884 } else if (prefix == "osd blocklist" ||
12885 prefix == "osd blacklist") {
12886 string addrstr, rangestr;
12887 bool range = false;
12888 cmd_getval(cmdmap, "addr", addrstr);
12889 if (cmd_getval(cmdmap, "range", rangestr)) {
12890 if (rangestr == "range") {
12891 range = true;
12892 } else {
12893 ss << "Did you mean to specify \"osd blocklist range\"?";
12894 err = -EINVAL;
12895 goto reply_no_propose;
12896 }
12897 }
12898 entity_addr_t addr;
12899 if (!addr.parse(addrstr)) {
12900 ss << "unable to parse address " << addrstr;
12901 err = -EINVAL;
12902 goto reply_no_propose;
12903 }
12904 else {
12905 if (range) {
12906 if (!addr.maybe_cidr()) {
12907 ss << "You specified a range command, but " << addr
12908 << " does not parse as a CIDR range";
12909 err = -EINVAL;
12910 goto reply_no_propose;
12911 }
12912 addr.type = entity_addr_t::TYPE_CIDR;
12913 err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
12914 if (err) {
12915 goto reply_no_propose;
12916 }
12917 if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
12918 (addr.is_ipv6() && addr.get_nonce() > 128)) {
12919 ss << "Too many bits in range for that protocol!";
12920 err = -EINVAL;
12921 goto reply_no_propose;
12922 }
12923 } else {
12924 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12925 // always blocklist type ANY
12926 addr.set_type(entity_addr_t::TYPE_ANY);
12927 } else {
12928 addr.set_type(entity_addr_t::TYPE_LEGACY);
12929 }
12930 }
12931
12932 string blocklistop;
12933 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12934 cmd_getval(cmdmap, "blacklistop", blocklistop);
12935 }
12936 if (blocklistop == "add") {
12937 utime_t expires = ceph_clock_now();
12938 // default one hour
12939 double d = cmd_getval_or<double>(cmdmap, "expire",
12940 g_conf()->mon_osd_blocklist_default_expire);
12941 expires += d;
12942
12943 auto add_to_pending_blocklists = [](auto& nb, auto& ob,
12944 const auto& addr,
12945 const auto& expires) {
12946 nb[addr] = expires;
12947 // cancel any pending un-blocklisting request too
12948 auto it = std::find(ob.begin(),
12949 ob.end(), addr);
12950 if (it != ob.end()) {
12951 ob.erase(it);
12952 }
12953 };
12954 if (range) {
12955 add_to_pending_blocklists(pending_inc.new_range_blocklist,
12956 pending_inc.old_range_blocklist,
12957 addr, expires);
12958
12959 } else {
12960 add_to_pending_blocklists(pending_inc.new_blocklist,
12961 pending_inc.old_blocklist,
12962 addr, expires);
12963 }
12964
12965 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12966 getline(ss, rs);
12967 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12968 get_last_committed() + 1));
12969 return true;
12970 } else if (blocklistop == "rm") {
12971 auto rm_from_pending_blocklists = [](const auto& addr,
12972 auto& blocklist,
12973 auto& ob, auto& pb) {
12974 if (blocklist.count(addr)) {
12975 ob.push_back(addr);
12976 return true;
12977 } else if (pb.count(addr)) {
12978 pb.erase(addr);
12979 return true;
12980 }
12981 return false;
12982 };
12983 if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
12984 pending_inc.old_blocklist,
12985 pending_inc.new_blocklist)) ||
12986 (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
12987 pending_inc.old_range_blocklist,
12988 pending_inc.new_range_blocklist))) {
12989 ss << "un-blocklisting " << addr;
12990 getline(ss, rs);
12991 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12992 get_last_committed() + 1));
12993 return true;
12994 }
12995 ss << addr << " isn't blocklisted";
12996 err = 0;
12997 goto reply_no_propose;
12998 }
12999 }
13000 } else if (prefix == "osd pool mksnap") {
13001 string poolstr;
13002 cmd_getval(cmdmap, "pool", poolstr);
13003 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13004 if (pool < 0) {
13005 ss << "unrecognized pool '" << poolstr << "'";
13006 err = -ENOENT;
13007 goto reply_no_propose;
13008 }
13009 string snapname;
13010 cmd_getval(cmdmap, "snap", snapname);
13011 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13012 if (p->is_unmanaged_snaps_mode()) {
13013 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13014 err = -EINVAL;
13015 goto reply_no_propose;
13016 } else if (p->snap_exists(snapname.c_str())) {
13017 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13018 err = 0;
13019 goto reply_no_propose;
13020 } else if (p->is_tier()) {
13021 ss << "pool " << poolstr << " is a cache tier";
13022 err = -EINVAL;
13023 goto reply_no_propose;
13024 }
13025 pg_pool_t *pp = 0;
13026 if (pending_inc.new_pools.count(pool))
13027 pp = &pending_inc.new_pools[pool];
13028 if (!pp) {
13029 pp = &pending_inc.new_pools[pool];
13030 *pp = *p;
13031 }
13032 if (pp->snap_exists(snapname.c_str())) {
13033 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13034 } else {
13035 if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(pool)) {
13036 dout(20) << "pool-level snapshots have been disabled for pools "
13037 "attached to an fs - poolid:" << pool << dendl;
13038 err = -EOPNOTSUPP;
13039 goto reply_no_propose;
13040 }
13041 pp->add_snap(snapname.c_str(), ceph_clock_now());
13042 pp->set_snap_epoch(pending_inc.epoch);
13043 ss << "created pool " << poolstr << " snap " << snapname;
13044 }
13045 getline(ss, rs);
13046 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13047 get_last_committed() + 1));
13048 return true;
13049 } else if (prefix == "osd pool rmsnap") {
13050 string poolstr;
13051 cmd_getval(cmdmap, "pool", poolstr);
13052 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13053 if (pool < 0) {
13054 ss << "unrecognized pool '" << poolstr << "'";
13055 err = -ENOENT;
13056 goto reply_no_propose;
13057 }
13058 string snapname;
13059 cmd_getval(cmdmap, "snap", snapname);
13060 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13061 if (p->is_unmanaged_snaps_mode()) {
13062 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13063 err = -EINVAL;
13064 goto reply_no_propose;
13065 } else if (!p->snap_exists(snapname.c_str())) {
13066 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
13067 err = 0;
13068 goto reply_no_propose;
13069 }
13070 pg_pool_t *pp = 0;
13071 if (pending_inc.new_pools.count(pool))
13072 pp = &pending_inc.new_pools[pool];
13073 if (!pp) {
13074 pp = &pending_inc.new_pools[pool];
13075 *pp = *p;
13076 }
13077 snapid_t sn = pp->snap_exists(snapname.c_str());
13078 if (sn) {
13079 pp->remove_snap(sn);
13080 pp->set_snap_epoch(pending_inc.epoch);
13081 ss << "removed pool " << poolstr << " snap " << snapname;
13082 } else {
13083 ss << "already removed pool " << poolstr << " snap " << snapname;
13084 }
13085 getline(ss, rs);
13086 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13087 get_last_committed() + 1));
13088 return true;
13089 } else if (prefix == "osd pool create") {
13090 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
13091 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
13092 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
13093 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
13094 string pool_type_str;
13095 cmd_getval(cmdmap, "pool_type", pool_type_str);
13096 if (pool_type_str.empty())
13097 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
13098
13099 string poolstr;
13100 cmd_getval(cmdmap, "pool", poolstr);
13101 bool confirm = false;
13102 //confirmation may be set to true only by internal operations.
13103 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13104 if (poolstr[0] == '.' && !confirm) {
13105 ss << "pool names beginning with . are not allowed";
13106 err = 0;
13107 goto reply_no_propose;
13108 }
13109 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13110 if (pool_id >= 0) {
13111 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13112 if (pool_type_str != p->get_type_name()) {
13113 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
13114 err = -EINVAL;
13115 } else {
13116 ss << "pool '" << poolstr << "' already exists";
13117 err = 0;
13118 }
13119 goto reply_no_propose;
13120 }
13121
13122 int pool_type;
13123 if (pool_type_str == "replicated") {
13124 pool_type = pg_pool_t::TYPE_REPLICATED;
13125 } else if (pool_type_str == "erasure") {
13126 pool_type = pg_pool_t::TYPE_ERASURE;
13127 } else {
13128 ss << "unknown pool type '" << pool_type_str << "'";
13129 err = -EINVAL;
13130 goto reply_no_propose;
13131 }
13132
13133 bool implicit_rule_creation = false;
13134 int64_t expected_num_objects = 0;
13135 string rule_name;
13136 cmd_getval(cmdmap, "rule", rule_name);
13137 string erasure_code_profile;
13138 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
13139
13140 if (pool_type == pg_pool_t::TYPE_ERASURE) {
13141 if (erasure_code_profile == "")
13142 erasure_code_profile = "default";
13143 //handle the erasure code profile
13144 if (erasure_code_profile == "default") {
13145 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
13146 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
13147 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
13148 goto wait;
13149 }
13150
13151 map<string,string> profile_map;
13152 err = osdmap.get_erasure_code_profile_default(cct,
13153 profile_map,
13154 &ss);
13155 if (err)
13156 goto reply_no_propose;
13157 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
13158 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
13159 goto wait;
13160 }
13161 }
13162 if (rule_name == "") {
13163 implicit_rule_creation = true;
13164 if (erasure_code_profile == "default") {
13165 rule_name = "erasure-code";
13166 } else {
13167 dout(1) << "implicitly use rule named after the pool: "
13168 << poolstr << dendl;
13169 rule_name = poolstr;
13170 }
13171 }
13172 expected_num_objects =
13173 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13174 } else {
13175 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13176 // and put expected_num_objects to rule field
13177 if (erasure_code_profile != "") { // cmd is from CLI
13178 if (rule_name != "") {
13179 string interr;
13180 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
13181 if (interr.length()) {
13182 ss << "error parsing integer value '" << rule_name << "': " << interr;
13183 err = -EINVAL;
13184 goto reply_no_propose;
13185 }
13186 }
13187 rule_name = erasure_code_profile;
13188 } else { // cmd is well-formed
13189 expected_num_objects =
13190 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13191 }
13192 }
13193
13194 if (!implicit_rule_creation && rule_name != "") {
13195 int rule;
13196 err = get_crush_rule(rule_name, &rule, &ss);
13197 if (err == -EAGAIN) {
13198 goto wait;
13199 }
13200 if (err)
13201 goto reply_no_propose;
13202 }
13203
13204 if (expected_num_objects < 0) {
13205 ss << "'expected_num_objects' must be non-negative";
13206 err = -EINVAL;
13207 goto reply_no_propose;
13208 }
13209
13210 set<int32_t> osds;
13211 osdmap.get_all_osds(osds);
13212 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
13213 string type;
13214 if (!get_osd_objectstore_type(osd, &type)) {
13215 return type == "filestore";
13216 } else {
13217 return false;
13218 }
13219 });
13220
13221 if (has_filestore_osd &&
13222 expected_num_objects > 0 &&
13223 cct->_conf->filestore_merge_threshold > 0) {
13224 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13225 err = -EINVAL;
13226 goto reply_no_propose;
13227 }
13228
13229 if (has_filestore_osd &&
13230 expected_num_objects == 0 &&
13231 cct->_conf->filestore_merge_threshold < 0) {
13232 int osds = osdmap.get_num_osds();
13233 bool sure = false;
13234 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13235 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
13236 ss << "For better initial performance on pools expected to store a "
13237 << "large number of objects, consider supplying the "
13238 << "expected_num_objects parameter when creating the pool."
13239 << " Pass --yes-i-really-mean-it to ignore it";
13240 err = -EPERM;
13241 goto reply_no_propose;
13242 }
13243 }
13244
13245 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
13246 FastReadType fast_read = FAST_READ_DEFAULT;
13247 if (fast_read_param == 0)
13248 fast_read = FAST_READ_OFF;
13249 else if (fast_read_param > 0)
13250 fast_read = FAST_READ_ON;
13251
13252 int64_t repl_size = 0;
13253 cmd_getval(cmdmap, "size", repl_size);
13254 int64_t target_size_bytes = 0;
13255 double target_size_ratio = 0.0;
13256 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
13257 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
13258
13259 string pg_autoscale_mode;
13260 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
13261
13262 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
13263
13264 bool crimson = cmd_getval_or<bool>(cmdmap, "crimson", false) ||
13265 cct->_conf.get_val<bool>("osd_pool_default_crimson");
13266
13267 err = prepare_new_pool(poolstr,
13268 -1, // default crush rule
13269 rule_name,
13270 pg_num, pgp_num, pg_num_min, pg_num_max,
13271 repl_size, target_size_bytes, target_size_ratio,
13272 erasure_code_profile, pool_type,
13273 (uint64_t)expected_num_objects,
13274 fast_read,
13275 pg_autoscale_mode,
13276 bulk,
13277 crimson,
13278 &ss);
13279 if (err < 0) {
13280 switch(err) {
13281 case -EEXIST:
13282 ss << "pool '" << poolstr << "' already exists";
13283 err = 0;
13284 goto reply_no_propose;
13285 case -EAGAIN:
13286 goto wait;
13287 case -ERANGE:
13288 goto reply_no_propose;
13289 default:
13290 goto reply_no_propose;
13291 }
13292 } else {
13293 ss << "pool '" << poolstr << "' created";
13294 }
13295 getline(ss, rs);
13296 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13297 get_last_committed() + 1));
13298 return true;
13299
13300 } else if (prefix == "osd pool delete" ||
13301 prefix == "osd pool rm") {
13302 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13303 string poolstr, poolstr2, sure;
13304 cmd_getval(cmdmap, "pool", poolstr);
13305 cmd_getval(cmdmap, "pool2", poolstr2);
13306 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13307 if (pool < 0) {
13308 ss << "pool '" << poolstr << "' does not exist";
13309 err = 0;
13310 goto reply_no_propose;
13311 }
13312
13313 bool force_no_fake = false;
13314 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13315 bool force = false;
13316 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13317 if (poolstr2 != poolstr ||
13318 (!force && !force_no_fake)) {
13319 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13320 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13321 << "followed by --yes-i-really-really-mean-it.";
13322 err = -EPERM;
13323 goto reply_no_propose;
13324 }
13325 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13326 if (err == -EAGAIN) {
13327 goto wait;
13328 }
13329 if (err < 0)
13330 goto reply_no_propose;
13331 goto update;
13332 } else if (prefix == "osd pool rename") {
13333 string srcpoolstr, destpoolstr;
13334 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13335 cmd_getval(cmdmap, "destpool", destpoolstr);
13336 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13337 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13338 bool confirm = false;
13339 //confirmation may be set to true only by internal operations.
13340 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13341 if (destpoolstr[0] == '.' && !confirm) {
13342 ss << "pool names beginning with . are not allowed";
13343 err = 0;
13344 goto reply_no_propose;
13345 }
13346 if (pool_src < 0) {
13347 if (pool_dst >= 0) {
13348 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13349 // of operations, assume this rename succeeded, as it is not changing
13350 // the current state. Make sure we output something understandable
13351 // for whoever is issuing the command, if they are paying attention,
13352 // in case it was not intentional; or to avoid a "wtf?" and a bug
13353 // report in case it was intentional, while expecting a failure.
13354 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13355 << destpoolstr << "' does -- assuming successful rename";
13356 err = 0;
13357 } else {
13358 ss << "unrecognized pool '" << srcpoolstr << "'";
13359 err = -ENOENT;
13360 }
13361 goto reply_no_propose;
13362 } else if (pool_dst >= 0) {
13363 // source pool exists and so does the destination pool
13364 ss << "pool '" << destpoolstr << "' already exists";
13365 err = -EEXIST;
13366 goto reply_no_propose;
13367 }
13368
13369 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13370 if (ret == 0) {
13371 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13372 } else {
13373 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13374 << cpp_strerror(ret);
13375 }
13376 getline(ss, rs);
13377 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13378 get_last_committed() + 1));
13379 return true;
13380
13381 } else if (prefix == "osd pool set") {
13382 err = prepare_command_pool_set(cmdmap, ss);
13383 if (err == -EAGAIN)
13384 goto wait;
13385 if (err < 0)
13386 goto reply_no_propose;
13387
13388 getline(ss, rs);
13389 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13390 get_last_committed() + 1));
13391 return true;
13392 } else if (prefix == "osd tier add") {
13393 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13394 if (err == -EAGAIN)
13395 goto wait;
13396 if (err)
13397 goto reply_no_propose;
13398 string poolstr;
13399 cmd_getval(cmdmap, "pool", poolstr);
13400 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13401 if (pool_id < 0) {
13402 ss << "unrecognized pool '" << poolstr << "'";
13403 err = -ENOENT;
13404 goto reply_no_propose;
13405 }
13406 string tierpoolstr;
13407 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13408 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13409 if (tierpool_id < 0) {
13410 ss << "unrecognized pool '" << tierpoolstr << "'";
13411 err = -ENOENT;
13412 goto reply_no_propose;
13413 }
13414 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13415 ceph_assert(p);
13416 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13417 ceph_assert(tp);
13418
13419 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13420 goto reply_no_propose;
13421 }
13422
13423 // make sure new tier is empty
13424 bool force_nonempty = false;
13425 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13426 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13427 if (pstats && pstats->stats.sum.num_objects != 0 &&
13428 !force_nonempty) {
13429 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13430 err = -ENOTEMPTY;
13431 goto reply_no_propose;
13432 }
13433 if (tp->is_erasure()) {
13434 ss << "tier pool '" << tierpoolstr
13435 << "' is an ec pool, which cannot be a tier";
13436 err = -ENOTSUP;
13437 goto reply_no_propose;
13438 }
13439 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13440 (!force_nonempty ||
13441 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13442 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13443 err = -ENOTEMPTY;
13444 goto reply_no_propose;
13445 }
13446 // go
13447 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13448 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13449 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13450 goto wait;
13451 }
13452 np->tiers.insert(tierpool_id);
13453 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13454 ntp->tier_of = pool_id;
13455 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13456 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13457 get_last_committed() + 1));
13458 return true;
13459 } else if (prefix == "osd tier remove" ||
13460 prefix == "osd tier rm") {
13461 string poolstr;
13462 cmd_getval(cmdmap, "pool", poolstr);
13463 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13464 if (pool_id < 0) {
13465 ss << "unrecognized pool '" << poolstr << "'";
13466 err = -ENOENT;
13467 goto reply_no_propose;
13468 }
13469 string tierpoolstr;
13470 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13471 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13472 if (tierpool_id < 0) {
13473 ss << "unrecognized pool '" << tierpoolstr << "'";
13474 err = -ENOENT;
13475 goto reply_no_propose;
13476 }
13477 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13478 ceph_assert(p);
13479 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13480 ceph_assert(tp);
13481
13482 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13483 goto reply_no_propose;
13484 }
13485
13486 if (p->tiers.count(tierpool_id) == 0) {
13487 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13488 err = 0;
13489 goto reply_no_propose;
13490 }
13491 if (tp->tier_of != pool_id) {
13492 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13493 << osdmap.get_pool_name(tp->tier_of) << "': "
13494 // be scary about it; this is an inconsistency and bells must go off
13495 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13496 err = -EINVAL;
13497 goto reply_no_propose;
13498 }
13499 if (p->read_tier == tierpool_id) {
13500 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13501 err = -EBUSY;
13502 goto reply_no_propose;
13503 }
13504 // go
13505 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13506 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13507 if (np->tiers.count(tierpool_id) == 0 ||
13508 ntp->tier_of != pool_id ||
13509 np->read_tier == tierpool_id) {
13510 goto wait;
13511 }
13512 np->tiers.erase(tierpool_id);
13513 ntp->clear_tier();
13514 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13515 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13516 get_last_committed() + 1));
13517 return true;
13518 } else if (prefix == "osd tier set-overlay") {
13519 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13520 if (err == -EAGAIN)
13521 goto wait;
13522 if (err)
13523 goto reply_no_propose;
13524 string poolstr;
13525 cmd_getval(cmdmap, "pool", poolstr);
13526 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13527 if (pool_id < 0) {
13528 ss << "unrecognized pool '" << poolstr << "'";
13529 err = -ENOENT;
13530 goto reply_no_propose;
13531 }
13532 string overlaypoolstr;
13533 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13534 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13535 if (overlaypool_id < 0) {
13536 ss << "unrecognized pool '" << overlaypoolstr << "'";
13537 err = -ENOENT;
13538 goto reply_no_propose;
13539 }
13540 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13541 ceph_assert(p);
13542 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13543 ceph_assert(overlay_p);
13544 if (p->tiers.count(overlaypool_id) == 0) {
13545 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13546 err = -EINVAL;
13547 goto reply_no_propose;
13548 }
13549 if (p->read_tier == overlaypool_id) {
13550 err = 0;
13551 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13552 goto reply_no_propose;
13553 }
13554 if (p->has_read_tier()) {
13555 ss << "pool '" << poolstr << "' has overlay '"
13556 << osdmap.get_pool_name(p->read_tier)
13557 << "'; please remove-overlay first";
13558 err = -EINVAL;
13559 goto reply_no_propose;
13560 }
13561
13562 // go
13563 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13564 np->read_tier = overlaypool_id;
13565 np->write_tier = overlaypool_id;
13566 np->set_last_force_op_resend(pending_inc.epoch);
13567 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13568 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13569 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13570 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13571 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13572 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13573 get_last_committed() + 1));
13574 return true;
13575 } else if (prefix == "osd tier remove-overlay" ||
13576 prefix == "osd tier rm-overlay") {
13577 string poolstr;
13578 cmd_getval(cmdmap, "pool", poolstr);
13579 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13580 if (pool_id < 0) {
13581 ss << "unrecognized pool '" << poolstr << "'";
13582 err = -ENOENT;
13583 goto reply_no_propose;
13584 }
13585 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13586 ceph_assert(p);
13587 if (!p->has_read_tier()) {
13588 err = 0;
13589 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13590 goto reply_no_propose;
13591 }
13592
13593 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13594 goto reply_no_propose;
13595 }
13596
13597 // go
13598 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13599 if (np->has_read_tier()) {
13600 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13601 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13602 nop->set_last_force_op_resend(pending_inc.epoch);
13603 }
13604 if (np->has_write_tier()) {
13605 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13606 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13607 nop->set_last_force_op_resend(pending_inc.epoch);
13608 }
13609 np->clear_read_tier();
13610 np->clear_write_tier();
13611 np->set_last_force_op_resend(pending_inc.epoch);
13612 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13613 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13614 get_last_committed() + 1));
13615 return true;
13616 } else if (prefix == "osd tier cache-mode") {
13617 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13618 if (err == -EAGAIN)
13619 goto wait;
13620 if (err)
13621 goto reply_no_propose;
13622 string poolstr;
13623 cmd_getval(cmdmap, "pool", poolstr);
13624 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13625 if (pool_id < 0) {
13626 ss << "unrecognized pool '" << poolstr << "'";
13627 err = -ENOENT;
13628 goto reply_no_propose;
13629 }
13630 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13631 ceph_assert(p);
13632 if (!p->is_tier()) {
13633 ss << "pool '" << poolstr << "' is not a tier";
13634 err = -EINVAL;
13635 goto reply_no_propose;
13636 }
13637 string modestr;
13638 cmd_getval(cmdmap, "mode", modestr);
13639 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13640 if (int(mode) < 0) {
13641 ss << "'" << modestr << "' is not a valid cache mode";
13642 err = -EINVAL;
13643 goto reply_no_propose;
13644 }
13645
13646 bool sure = false;
13647 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13648
13649 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13650 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13651 ss << "'" << modestr << "' is no longer a supported cache mode";
13652 err = -EPERM;
13653 goto reply_no_propose;
13654 }
13655 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13656 mode != pg_pool_t::CACHEMODE_NONE &&
13657 mode != pg_pool_t::CACHEMODE_PROXY &&
13658 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13659 !sure) {
13660 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13661 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13662 err = -EPERM;
13663 goto reply_no_propose;
13664 }
13665
13666 // pool already has this cache-mode set and there are no pending changes
13667 if (p->cache_mode == mode &&
13668 (pending_inc.new_pools.count(pool_id) == 0 ||
13669 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13670 ss << "set cache-mode for pool '" << poolstr << "'"
13671 << " to " << pg_pool_t::get_cache_mode_name(mode);
13672 err = 0;
13673 goto reply_no_propose;
13674 }
13675
13676 /* Mode description:
13677 *
13678 * none: No cache-mode defined
13679 * forward: Forward all reads and writes to base pool [removed]
13680 * writeback: Cache writes, promote reads from base pool
13681 * readonly: Forward writes to base pool
13682 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13683 * proxy: Proxy all reads and writes to base pool
13684 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13685 *
13686 * Hence, these are the allowed transitions:
13687 *
13688 * none -> any
13689 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13690 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13691 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13692 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13693 * writeback -> readproxy || proxy
13694 * readonly -> any
13695 */
13696
13697 // We check if the transition is valid against the current pool mode, as
13698 // it is the only committed state thus far. We will blantly squash
13699 // whatever mode is on the pending state.
13700
13701 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13702 (mode != pg_pool_t::CACHEMODE_PROXY &&
13703 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13704 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13705 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13706 << "' pool; only '"
13707 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
13708 << "','"
13709 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13710 << "' allowed.";
13711 err = -EINVAL;
13712 goto reply_no_propose;
13713 }
13714 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13715 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13716 mode != pg_pool_t::CACHEMODE_PROXY &&
13717 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13718
13719 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13720 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13721 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13722
13723 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13724 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13725 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13726
13727 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13728 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13729 mode != pg_pool_t::CACHEMODE_PROXY &&
13730 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13731
13732 const pool_stat_t* pstats =
13733 mon.mgrstatmon()->get_pool_stat(pool_id);
13734
13735 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13736 ss << "unable to set cache-mode '"
13737 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13738 << "': dirty objects found";
13739 err = -EBUSY;
13740 goto reply_no_propose;
13741 }
13742 }
13743 // go
13744 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13745 np->cache_mode = mode;
13746 // set this both when moving to and from cache_mode NONE. this is to
13747 // capture legacy pools that were set up before this flag existed.
13748 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13749 ss << "set cache-mode for pool '" << poolstr
13750 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13751 if (mode == pg_pool_t::CACHEMODE_NONE) {
13752 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13753 ceph_assert(base_pool);
13754 if (base_pool->read_tier == pool_id ||
13755 base_pool->write_tier == pool_id)
13756 ss <<" (WARNING: pool is still configured as read or write tier)";
13757 }
13758 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13759 get_last_committed() + 1));
13760 return true;
13761 } else if (prefix == "osd tier add-cache") {
13762 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13763 if (err == -EAGAIN)
13764 goto wait;
13765 if (err)
13766 goto reply_no_propose;
13767 string poolstr;
13768 cmd_getval(cmdmap, "pool", poolstr);
13769 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13770 if (pool_id < 0) {
13771 ss << "unrecognized pool '" << poolstr << "'";
13772 err = -ENOENT;
13773 goto reply_no_propose;
13774 }
13775 string tierpoolstr;
13776 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13777 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13778 if (tierpool_id < 0) {
13779 ss << "unrecognized pool '" << tierpoolstr << "'";
13780 err = -ENOENT;
13781 goto reply_no_propose;
13782 }
13783 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13784 ceph_assert(p);
13785 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13786 ceph_assert(tp);
13787
13788 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13789 goto reply_no_propose;
13790 }
13791
13792 int64_t size = 0;
13793 if (!cmd_getval(cmdmap, "size", size)) {
13794 ss << "unable to parse 'size' value '"
13795 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13796 err = -EINVAL;
13797 goto reply_no_propose;
13798 }
13799 // make sure new tier is empty
13800 const pool_stat_t *pstats =
13801 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13802 if (pstats && pstats->stats.sum.num_objects != 0) {
13803 ss << "tier pool '" << tierpoolstr << "' is not empty";
13804 err = -ENOTEMPTY;
13805 goto reply_no_propose;
13806 }
13807 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13808 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13809 if (int(mode) < 0) {
13810 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13811 err = -EINVAL;
13812 goto reply_no_propose;
13813 }
13814 HitSet::Params hsp;
13815 auto& cache_hit_set_type =
13816 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13817 if (cache_hit_set_type == "bloom") {
13818 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13819 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13820 hsp = HitSet::Params(bsp);
13821 } else if (cache_hit_set_type == "explicit_hash") {
13822 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13823 } else if (cache_hit_set_type == "explicit_object") {
13824 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13825 } else {
13826 ss << "osd tier cache default hit set type '"
13827 << cache_hit_set_type << "' is not a known type";
13828 err = -EINVAL;
13829 goto reply_no_propose;
13830 }
13831 // go
13832 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13833 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13834 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13835 goto wait;
13836 }
13837 np->tiers.insert(tierpool_id);
13838 np->read_tier = np->write_tier = tierpool_id;
13839 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13840 np->set_last_force_op_resend(pending_inc.epoch);
13841 ntp->set_last_force_op_resend(pending_inc.epoch);
13842 ntp->tier_of = pool_id;
13843 ntp->cache_mode = mode;
13844 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13845 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13846 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13847 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13848 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13849 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13850 ntp->hit_set_params = hsp;
13851 ntp->target_max_bytes = size;
13852 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13853 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13854 get_last_committed() + 1));
13855 return true;
13856 } else if (prefix == "osd pool set-quota") {
13857 string poolstr;
13858 cmd_getval(cmdmap, "pool", poolstr);
13859 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13860 if (pool_id < 0) {
13861 ss << "unrecognized pool '" << poolstr << "'";
13862 err = -ENOENT;
13863 goto reply_no_propose;
13864 }
13865
13866 string field;
13867 cmd_getval(cmdmap, "field", field);
13868 if (field != "max_objects" && field != "max_bytes") {
13869 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13870 err = -EINVAL;
13871 goto reply_no_propose;
13872 }
13873
13874 // val could contain unit designations, so we treat as a string
13875 string val;
13876 cmd_getval(cmdmap, "val", val);
13877 string tss;
13878 int64_t value;
13879 if (field == "max_objects") {
13880 value = strict_si_cast<uint64_t>(val, &tss);
13881 } else if (field == "max_bytes") {
13882 value = strict_iecstrtoll(val, &tss);
13883 } else {
13884 ceph_abort_msg("unrecognized option");
13885 }
13886 if (!tss.empty()) {
13887 ss << "error parsing value '" << val << "': " << tss;
13888 err = -EINVAL;
13889 goto reply_no_propose;
13890 }
13891
13892 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13893 if (field == "max_objects") {
13894 pi->quota_max_objects = value;
13895 } else if (field == "max_bytes") {
13896 pi->quota_max_bytes = value;
13897 } else {
13898 ceph_abort_msg("unrecognized option");
13899 }
13900 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13901 rs = ss.str();
13902 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13903 get_last_committed() + 1));
13904 return true;
13905 } else if (prefix == "osd pool application enable" ||
13906 prefix == "osd pool application disable" ||
13907 prefix == "osd pool application set" ||
13908 prefix == "osd pool application rm") {
13909 err = prepare_command_pool_application(prefix, cmdmap, ss);
13910 if (err == -EAGAIN) {
13911 goto wait;
13912 } else if (err < 0) {
13913 goto reply_no_propose;
13914 } else {
13915 goto update;
13916 }
13917 } else if (prefix == "osd force-create-pg") {
13918 pg_t pgid;
13919 string pgidstr;
13920 err = parse_pgid(cmdmap, ss, pgid, pgidstr);
13921 if (err < 0)
13922 goto reply_no_propose;
13923 bool sure = false;
13924 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13925 if (!sure) {
13926 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13927 << "that the cluster will give up ever trying to recover the lost data. Do this "
13928 << "only if you are certain that all copies of the PG are in fact lost and you are "
13929 << "willing to accept that the data is permanently destroyed. Pass "
13930 << "--yes-i-really-mean-it to proceed.";
13931 err = -EPERM;
13932 goto reply_no_propose;
13933 }
13934 bool creating_now;
13935 {
13936 std::lock_guard<std::mutex> l(creating_pgs_lock);
13937 auto emplaced = creating_pgs.pgs.emplace(
13938 pgid,
13939 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13940 ceph_clock_now()));
13941 creating_now = emplaced.second;
13942 }
13943 if (creating_now) {
13944 ss << "pg " << pgidstr << " now creating, ok";
13945 // set the pool's CREATING flag so that (1) the osd won't ignore our
13946 // create message and (2) we won't propose any future pg_num changes
13947 // until after the PG has been instantiated.
13948 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13949 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13950 }
13951 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13952 err = 0;
13953 goto update;
13954 } else {
13955 ss << "pg " << pgid << " already creating";
13956 err = 0;
13957 goto reply_no_propose;
13958 }
13959 } else if (prefix == "osd force_healthy_stretch_mode") {
13960 bool sure = false;
13961 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13962 if (!sure) {
13963 ss << "This command will require peering across multiple CRUSH buckets "
13964 "(probably two data centers or availability zones?) and may result in PGs "
13965 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13966 err = -EPERM;
13967 goto reply_no_propose;
13968 }
13969 try_end_recovery_stretch_mode(true);
13970 ss << "Triggering healthy stretch mode";
13971 err = 0;
13972 goto reply_no_propose;
13973 } else if (prefix == "osd force_recovery_stretch_mode") {
13974 bool sure = false;
13975 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13976 if (!sure) {
13977 ss << "This command will increase pool sizes to try and spread them "
13978 "across multiple CRUSH buckets (probably two data centers or "
13979 "availability zones?) and should have happened automatically"
13980 "Pass --yes-i-really-mean-it to proceed.";
13981 err = -EPERM;
13982 goto reply_no_propose;
13983 }
13984 mon.go_recovery_stretch_mode();
13985 ss << "Triggering recovery stretch mode";
13986 err = 0;
13987 goto reply_no_propose;
13988 } else if (prefix == "osd set-allow-crimson") {
13989
13990 bool sure = false;
13991 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13992
13993 bool experimental_enabled =
13994 g_ceph_context->check_experimental_feature_enabled("crimson");
13995 if (!sure || !experimental_enabled) {
13996 ss << "This command will allow usage of crimson-osd osd daemons. "
13997 << "crimson-osd is not considered stable and will likely cause "
13998 << "crashes or data corruption. At this time, crimson-osd is mainly "
13999 << "useful for performance evaluation, testing, and development. "
14000 << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
14001 << "the experimental features config. This setting is irrevocable.";
14002 err = -EPERM;
14003 goto reply_no_propose;
14004 }
14005
14006 err = 0;
14007 if (osdmap.get_allow_crimson()) {
14008 goto reply_no_propose;
14009 } else {
14010 pending_inc.set_allow_crimson();
14011 goto update;
14012 }
14013 } else {
14014 err = -EINVAL;
14015 }
14016
14017 reply_no_propose:
14018 getline(ss, rs);
14019 if (err < 0 && rs.length() == 0)
14020 rs = cpp_strerror(err);
14021 mon.reply_command(op, err, rs, rdata, get_last_committed());
14022 return false; /* nothing to propose */
14023
14024 update:
14025 getline(ss, rs);
14026 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
14027 get_last_committed() + 1));
14028 return true;
14029
14030 wait:
14031 // XXX
14032 // Some osd commands split changes across two epochs.
14033 // It seems this is mostly for crush rule changes. It doesn't need
14034 // to be this way but it's a bit of work to fix that. For now,
14035 // trigger a proposal by returning true and then retry the command
14036 // to complete the operation.
14037 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14038 return true;
14039 }
14040
14041 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
14042 {
14043 op->mark_osdmon_event(__func__);
14044
14045 auto m = op->get_req<MPoolOp>();
14046 MonSession *session = op->get_session();
14047 if (!session) {
14048 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14049 return true;
14050 }
14051
14052 switch (m->op) {
14053 case POOL_OP_CREATE_UNMANAGED_SNAP:
14054 case POOL_OP_DELETE_UNMANAGED_SNAP:
14055 {
14056 const std::string* pool_name = nullptr;
14057 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
14058 if (pg_pool != nullptr) {
14059 pool_name = &osdmap.get_pool_name(m->pool);
14060 }
14061
14062 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
14063 session->entity_name, session->caps,
14064 session->get_peer_socket_addr(),
14065 pool_name)) {
14066 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
14067 << "privileges. message: " << *m << std::endl
14068 << "caps: " << session->caps << dendl;
14069 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14070 return true;
14071 }
14072 }
14073 break;
14074 default:
14075 if (!session->is_capable("osd", MON_CAP_W)) {
14076 dout(0) << "got pool op from entity with insufficient privileges. "
14077 << "message: " << *m << std::endl
14078 << "caps: " << session->caps << dendl;
14079 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14080 return true;
14081 }
14082 break;
14083 }
14084
14085 return false;
14086 }
14087
// Fast-path handling for an incoming MPoolOp: answer requests that do not
// require an osdmap change (idempotent successes or invalid combinations)
// directly.  Returns true if a reply was sent and the op is done; returns
// false to forward the op to prepare_pool_op() for a map update.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  if (enforce_pool_op_caps(op)) {
    // insufficient caps; enforce_pool_op_caps() already replied -EPERM
    return true;
  }

  if (m->fsid != mon.monmap->fsid) {
    // message was addressed to a different cluster; reject it
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting a pool that is already gone is an idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool (managed) snaps are incompatible with unmanaged-snaps mode
    // and with cache tier pools
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // snap already present: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // the two snap modes are mutually exclusive per pool
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // snap already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // snap already removed or purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with this name still
    // resolves, rather than when it is absent -- confirm intent against
    // prepare_pool_op_delete() callers.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // auid is no longer supported; prepare path replies -EOPNOTSUPP
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
14175
14176 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
14177 {
14178 if (!osdmap.have_pg_pool(pool)) {
14179 dout(10) << __func__ << " pool " << pool << " snap " << snap
14180 << " - pool dne" << dendl;
14181 return true;
14182 }
14183 if (osdmap.in_removed_snaps_queue(pool, snap)) {
14184 dout(10) << __func__ << " pool " << pool << " snap " << snap
14185 << " - in osdmap removed_snaps_queue" << dendl;
14186 return true;
14187 }
14188 snapid_t begin, end;
14189 int r = lookup_purged_snap(pool, snap, &begin, &end);
14190 if (r == 0) {
14191 dout(10) << __func__ << " pool " << pool << " snap " << snap
14192 << " - purged, [" << begin << "," << end << ")" << dendl;
14193 return true;
14194 }
14195 return false;
14196 }
14197
14198 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
14199 {
14200 if (pending_inc.old_pools.count(pool)) {
14201 dout(10) << __func__ << " pool " << pool << " snap " << snap
14202 << " - pool pending deletion" << dendl;
14203 return true;
14204 }
14205 if (pending_inc.in_new_removed_snaps(pool, snap)) {
14206 dout(10) << __func__ << " pool " << pool << " snap " << snap
14207 << " - in pending new_removed_snaps" << dendl;
14208 return true;
14209 }
14210 return false;
14211 }
14212
14213 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
14214 {
14215 op->mark_osdmon_event(__func__);
14216 auto m = op->get_req<MPoolOp>();
14217 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
14218 if (pool >= 0) {
14219 _pool_op_reply(op, 0, osdmap.get_epoch());
14220 return true;
14221 }
14222
14223 return false;
14224 }
14225
// Apply a pool op that mutates the osdmap (snap create/delete in either
// managed or unmanaged mode).  CREATE/DELETE of whole pools are delegated
// to their own prepare helpers.  Returns true when a proposal was queued
// (reply deferred until commit); false when a reply was already sent.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // snap creation is refused on pools attached to a CephFS filesystem
  if (m->op == POOL_OP_CREATE_SNAP ||
      m->op == POOL_OP_CREATE_UNMANAGED_SNAP) {
    if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(m->pool)) {
      dout(20) << "monitor-managed snapshots have been disabled for pools "
	       " attached to an fs - pool:" << m->pool << dendl;
      _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
      return false;
    }
  }

  // first pass: validate against the *committed* pool state and answer
  // the idempotent / invalid cases without queueing a proposal.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // managed snaps are not allowed on cache tier pools
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // managed snap ops are invalid on an unmanaged-snaps pool
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked against the projected state, which may differ from the
  // committed state already validated above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // second pass: mutate the projected pool info
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      // snap_exists() returns the snapid (0 if absent)
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus maps also track removed snaps in the legacy way
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is returned to the client in the reply payload
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      if (m->snapid > pp.get_snap_seq()) {
        // refuse to remove a snapid that was never allocated
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always refuse
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // publish the projected pool into the pending incremental
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // defer the reply (with ret and any payload) until the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14390
14391 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14392 {
14393 op->mark_osdmon_event(__func__);
14394 int err = prepare_new_pool(op);
14395 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14396 return true;
14397 }
14398
14399 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14400 ostream *ss)
14401 {
14402 const string& poolstr = osdmap.get_pool_name(pool_id);
14403
14404 // If the Pool is in use by CephFS, refuse to delete it
14405 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14406 if (pending_fsmap.pool_in_use(pool_id)) {
14407 *ss << "pool '" << poolstr << "' is in use by CephFS";
14408 return -EBUSY;
14409 }
14410
14411 if (pool.tier_of >= 0) {
14412 *ss << "pool '" << poolstr << "' is a tier of '"
14413 << osdmap.get_pool_name(pool.tier_of) << "'";
14414 return -EBUSY;
14415 }
14416 if (!pool.tiers.empty()) {
14417 *ss << "pool '" << poolstr << "' has tiers";
14418 for(auto tier : pool.tiers) {
14419 *ss << " " << osdmap.get_pool_name(tier);
14420 }
14421 return -EBUSY;
14422 }
14423
14424 if (!g_conf()->mon_allow_pool_delete) {
14425 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14426 return -EPERM;
14427 }
14428
14429 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14430 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14431 return -EPERM;
14432 }
14433
14434 *ss << "pool '" << poolstr << "' removed";
14435 return 0;
14436 }
14437
14438 /**
14439 * Check if it is safe to add a tier to a base pool
14440 *
14441 * @return
14442 * True if the operation should proceed, false if we should abort here
14443 * (abort doesn't necessarily mean error, could be idempotency)
14444 */
14445 bool OSDMonitor::_check_become_tier(
14446 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14447 const int64_t base_pool_id, const pg_pool_t *base_pool,
14448 int *err,
14449 ostream *ss) const
14450 {
14451 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14452 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14453
14454 if (tier_pool->is_crimson()) {
14455 *ss << "pool '" << tier_pool_name << "' is a crimson pool, tiering "
14456 << "features are not supported";
14457 *err = -EINVAL;
14458 return false;
14459 }
14460 if (base_pool->is_crimson()) {
14461 *ss << "pool '" << base_pool_name << "' is a crimson pool, tiering "
14462 << "features are not supported";
14463 *err = -EINVAL;
14464 return false;
14465 }
14466
14467 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14468 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14469 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14470 *err = -EBUSY;
14471 return false;
14472 }
14473
14474 if (base_pool->tiers.count(tier_pool_id)) {
14475 ceph_assert(tier_pool->tier_of == base_pool_id);
14476 *err = 0;
14477 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14478 << base_pool_name << "'";
14479 return false;
14480 }
14481
14482 if (base_pool->is_tier()) {
14483 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14484 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14485 << "multiple tiers are not yet supported.";
14486 *err = -EINVAL;
14487 return false;
14488 }
14489
14490 if (tier_pool->has_tiers()) {
14491 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14492 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14493 it != tier_pool->tiers.end(); ++it)
14494 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14495 *ss << " multiple tiers are not yet supported.";
14496 *err = -EINVAL;
14497 return false;
14498 }
14499
14500 if (tier_pool->is_tier()) {
14501 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14502 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14503 *err = -EINVAL;
14504 return false;
14505 }
14506
14507 *err = 0;
14508 return true;
14509 }
14510
14511
14512 /**
14513 * Check if it is safe to remove a tier from this base pool
14514 *
14515 * @return
14516 * True if the operation should proceed, false if we should abort here
14517 * (abort doesn't necessarily mean error, could be idempotency)
14518 */
14519 bool OSDMonitor::_check_remove_tier(
14520 const int64_t base_pool_id, const pg_pool_t *base_pool,
14521 const pg_pool_t *tier_pool,
14522 int *err, ostream *ss) const
14523 {
14524 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14525
14526 // Apply CephFS-specific checks
14527 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14528 if (pending_fsmap.pool_in_use(base_pool_id)) {
14529 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14530 // If the underlying pool is erasure coded and does not allow EC
14531 // overwrites, we can't permit the removal of the replicated tier that
14532 // CephFS relies on to access it
14533 *ss << "pool '" << base_pool_name <<
14534 "' does not allow EC overwrites and is in use by CephFS"
14535 " via its tier";
14536 *err = -EBUSY;
14537 return false;
14538 }
14539
14540 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14541 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14542 "tier is still in use as a writeback cache. Change the cache "
14543 "mode and flush the cache before removing it";
14544 *err = -EBUSY;
14545 return false;
14546 }
14547 }
14548
14549 *err = 0;
14550 return true;
14551 }
14552
14553 int OSDMonitor::_prepare_remove_pool(
14554 int64_t pool, ostream *ss, bool no_fake)
14555 {
14556 dout(10) << __func__ << " " << pool << dendl;
14557 const pg_pool_t *p = osdmap.get_pg_pool(pool);
14558 int r = _check_remove_pool(pool, *p, ss);
14559 if (r < 0)
14560 return r;
14561
14562 auto new_pool = pending_inc.new_pools.find(pool);
14563 if (new_pool != pending_inc.new_pools.end()) {
14564 // if there is a problem with the pending info, wait and retry
14565 // this op.
14566 const auto& p = new_pool->second;
14567 int r = _check_remove_pool(pool, p, ss);
14568 if (r < 0)
14569 return -EAGAIN;
14570 }
14571
14572 if (pending_inc.old_pools.count(pool)) {
14573 dout(10) << __func__ << " " << pool << " already pending removal"
14574 << dendl;
14575 return 0;
14576 }
14577
14578 if (g_conf()->mon_fake_pool_delete && !no_fake) {
14579 string old_name = osdmap.get_pool_name(pool);
14580 string new_name = old_name + "." + stringify(pool) + ".DELETED";
14581 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
14582 << old_name << " -> " << new_name << dendl;
14583 pending_inc.new_pool_names[pool] = new_name;
14584 return 0;
14585 }
14586
14587 // remove
14588 pending_inc.old_pools.insert(pool);
14589
14590 // remove any pg_temp mappings for this pool
14591 for (auto p = osdmap.pg_temp->begin();
14592 p != osdmap.pg_temp->end();
14593 ++p) {
14594 if (p->first.pool() == pool) {
14595 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
14596 << p->first << dendl;
14597 pending_inc.new_pg_temp[p->first].clear();
14598 }
14599 }
14600 // remove any primary_temp mappings for this pool
14601 for (auto p = osdmap.primary_temp->begin();
14602 p != osdmap.primary_temp->end();
14603 ++p) {
14604 if (p->first.pool() == pool) {
14605 dout(10) << __func__ << " " << pool
14606 << " removing obsolete primary_temp" << p->first << dendl;
14607 pending_inc.new_primary_temp[p->first] = -1;
14608 }
14609 }
14610 // remove any pg_upmap mappings for this pool
14611 for (auto& p : osdmap.pg_upmap) {
14612 if (p.first.pool() == pool) {
14613 dout(10) << __func__ << " " << pool
14614 << " removing obsolete pg_upmap "
14615 << p.first << dendl;
14616 pending_inc.old_pg_upmap.insert(p.first);
14617 }
14618 }
14619 // remove any pending pg_upmap mappings for this pool
14620 {
14621 auto it = pending_inc.new_pg_upmap.begin();
14622 while (it != pending_inc.new_pg_upmap.end()) {
14623 if (it->first.pool() == pool) {
14624 dout(10) << __func__ << " " << pool
14625 << " removing pending pg_upmap "
14626 << it->first << dendl;
14627 it = pending_inc.new_pg_upmap.erase(it);
14628 } else {
14629 it++;
14630 }
14631 }
14632 }
14633 // remove any pg_upmap_items mappings for this pool
14634 for (auto& p : osdmap.pg_upmap_items) {
14635 if (p.first.pool() == pool) {
14636 dout(10) << __func__ << " " << pool
14637 << " removing obsolete pg_upmap_items " << p.first
14638 << dendl;
14639 pending_inc.old_pg_upmap_items.insert(p.first);
14640 }
14641 }
14642 // remove any pending pg_upmap mappings for this pool
14643 {
14644 auto it = pending_inc.new_pg_upmap_items.begin();
14645 while (it != pending_inc.new_pg_upmap_items.end()) {
14646 if (it->first.pool() == pool) {
14647 dout(10) << __func__ << " " << pool
14648 << " removing pending pg_upmap_items "
14649 << it->first << dendl;
14650 it = pending_inc.new_pg_upmap_items.erase(it);
14651 } else {
14652 it++;
14653 }
14654 }
14655 }
14656
14657 // remove any choose_args for this pool
14658 CrushWrapper newcrush = _get_pending_crush();
14659 if (newcrush.have_choose_args(pool)) {
14660 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
14661 newcrush.rm_choose_args(pool);
14662 pending_inc.crush.clear();
14663 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
14664 }
14665 return 0;
14666 }
14667
14668 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14669 {
14670 dout(10) << "_prepare_rename_pool " << pool << dendl;
14671 if (pending_inc.old_pools.count(pool)) {
14672 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14673 return -ENOENT;
14674 }
14675 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14676 p != pending_inc.new_pool_names.end();
14677 ++p) {
14678 if (p->second == newname && p->first != pool) {
14679 return -EEXIST;
14680 }
14681 }
14682
14683 pending_inc.new_pool_names[pool] = newname;
14684 return 0;
14685 }
14686
14687 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14688 {
14689 op->mark_osdmon_event(__func__);
14690 auto m = op->get_req<MPoolOp>();
14691 ostringstream ss;
14692 int ret = _prepare_remove_pool(m->pool, &ss, false);
14693 if (ret == -EAGAIN) {
14694 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14695 return true;
14696 }
14697 if (ret < 0)
14698 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14699 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14700 pending_inc.epoch));
14701 return true;
14702 }
14703
14704 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14705 int ret, epoch_t epoch, bufferlist *blp)
14706 {
14707 op->mark_osdmon_event(__func__);
14708 auto m = op->get_req<MPoolOp>();
14709 dout(20) << "_pool_op_reply " << ret << dendl;
14710 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14711 ret, epoch, get_last_committed(), blp);
14712 mon.send_reply(op, reply);
14713 }
14714
14715 void OSDMonitor::convert_pool_priorities(void)
14716 {
14717 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14718 int64_t max_prio = 0;
14719 int64_t min_prio = 0;
14720 for (const auto &i : osdmap.get_pools()) {
14721 const auto &pool = i.second;
14722
14723 if (pool.opts.is_set(key)) {
14724 int64_t prio = 0;
14725 pool.opts.get(key, &prio);
14726 if (prio > max_prio)
14727 max_prio = prio;
14728 if (prio < min_prio)
14729 min_prio = prio;
14730 }
14731 }
14732 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14733 dout(20) << __func__ << " nothing to fix" << dendl;
14734 return;
14735 }
14736 // Current pool priorities exceeds new maximum
14737 for (const auto &i : osdmap.get_pools()) {
14738 const auto pool_id = i.first;
14739 pg_pool_t pool = i.second;
14740
14741 int64_t prio = 0;
14742 pool.opts.get(key, &prio);
14743 int64_t n;
14744
14745 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14746 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14747 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14748 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14749 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14750 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14751 } else {
14752 continue;
14753 }
14754 if (n == 0) {
14755 pool.opts.unset(key);
14756 } else {
14757 pool.opts.set(key, static_cast<int64_t>(n));
14758 }
14759 dout(10) << __func__ << " pool " << pool_id
14760 << " recovery_priority adjusted "
14761 << prio << " to " << n << dendl;
14762 pool.last_change = pending_inc.epoch;
14763 pending_inc.new_pools[pool_id] = pool;
14764 }
14765 }
14766
14767 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14768 int *errcode,
14769 set<pg_pool_t*>* pools,
14770 const string& new_crush_rule)
14771 {
14772 dout(20) << __func__ << dendl;
14773 *okay = false;
14774 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14775 if (new_crush_rule_result < 0) {
14776 ss << "unrecognized crush rule " << new_crush_rule_result;
14777 *errcode = new_crush_rule_result;
14778 return;
14779 }
14780 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14781 for (const auto& pooli : osdmap.pools) {
14782 int64_t poolid = pooli.first;
14783 const pg_pool_t *p = &pooli.second;
14784 if (!p->is_replicated()) {
14785 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14786 *errcode = -EINVAL;
14787 return;
14788 }
14789 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14790 if ((p->get_size() != default_size ||
14791 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14792 (p->get_crush_rule() != new_rule)) {
14793 ss << "we currently require stretch mode pools start out with the"
14794 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14795 *errcode = -EINVAL;
14796 return;
14797 }
14798 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14799 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14800 // the attempt may fail and then we have these pool updates...but they won't do anything
14801 // if there is a failure, so if it's hard to change the interface, no need to bother
14802 pools->insert(pp);
14803 }
14804 *okay = true;
14805 return;
14806 }
14807
14808 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14809 int *errcode, bool commit,
14810 const string& dividing_bucket,
14811 uint32_t bucket_count,
14812 const set<pg_pool_t*>& pools,
14813 const string& new_crush_rule)
14814 {
14815 dout(20) << __func__ << dendl;
14816 *okay = false;
14817 CrushWrapper crush = _get_pending_crush();
14818 int dividing_id = -1;
14819 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14820 !type_id.has_value()) {
14821 ss << dividing_bucket << " is not a valid crush bucket type";
14822 *errcode = -ENOENT;
14823 ceph_assert(!commit);
14824 return;
14825 } else {
14826 dividing_id = *type_id;
14827 }
14828 vector<int> subtrees;
14829 crush.get_subtree_of_type(dividing_id, &subtrees);
14830 if (subtrees.size() != 2) {
14831 ss << "there are " << subtrees.size() << dividing_bucket
14832 << "'s in the cluster but stretch mode currently only works with 2!";
14833 *errcode = -EINVAL;
14834 ceph_assert(!commit || subtrees.size() == 2);
14835 return;
14836 }
14837
14838 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14839 if (new_crush_rule_result < 0) {
14840 ss << "unrecognized crush rule " << new_crush_rule;
14841 *errcode = new_crush_rule_result;
14842 ceph_assert(!commit || (new_crush_rule_result > 0));
14843 return;
14844 }
14845 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14846
14847 int weight1 = crush.get_item_weight(subtrees[0]);
14848 int weight2 = crush.get_item_weight(subtrees[1]);
14849 if (weight1 != weight2) {
14850 // TODO: I'm really not sure this is a good idea?
14851 ss << "the 2 " << dividing_bucket
14852 << "instances in the cluster have differing weights "
14853 << weight1 << " and " << weight2
14854 <<" but stretch mode currently requires they be the same!";
14855 *errcode = -EINVAL;
14856 ceph_assert(!commit || (weight1 == weight2));
14857 return;
14858 }
14859 if (bucket_count != 2) {
14860 ss << "currently we only support 2-site stretch clusters!";
14861 *errcode = -EINVAL;
14862 ceph_assert(!commit || bucket_count == 2);
14863 return;
14864 }
14865 // TODO: check CRUSH rules for pools so that we are appropriately divided
14866 if (commit) {
14867 for (auto pool : pools) {
14868 pool->crush_rule = new_rule;
14869 pool->peering_crush_bucket_count = bucket_count;
14870 pool->peering_crush_bucket_target = bucket_count;
14871 pool->peering_crush_bucket_barrier = dividing_id;
14872 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14873 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14874 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14875 }
14876 pending_inc.change_stretch_mode = true;
14877 pending_inc.stretch_mode_enabled = true;
14878 pending_inc.new_stretch_bucket_count = bucket_count;
14879 pending_inc.new_degraded_stretch_mode = 0;
14880 pending_inc.new_stretch_mode_bucket = dividing_id;
14881 }
14882 *okay = true;
14883 return;
14884 }
14885
14886 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14887 set<int> *really_down_buckets,
14888 set<string> *really_down_mons)
14889 {
14890 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14891 ceph_assert(is_readable());
14892 if (dead_buckets.empty()) return false;
14893 set<int> down_cache;
14894 bool really_down = false;
14895 for (auto dbi : dead_buckets) {
14896 const string& bucket_name = dbi.first;
14897 ceph_assert(osdmap.crush->name_exists(bucket_name));
14898 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14899 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14900 << " to see if OSDs are also down" << dendl;
14901 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14902 if (subtree_down) {
14903 dout(20) << "subtree is down!" << dendl;
14904 really_down = true;
14905 really_down_buckets->insert(bucket_id);
14906 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14907 }
14908 }
14909 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14910 << " and mons " << *really_down_mons << " are really down" << dendl;
14911 return really_down;
14912 }
14913
14914 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14915 const set<string>& live_zones)
14916 {
14917 dout(20) << __func__ << dendl;
14918 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14919 // update the general OSDMap changes
14920 pending_inc.change_stretch_mode = true;
14921 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14922 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14923 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14924 ceph_assert(new_site_count == 1); // stretch count 2!
14925 pending_inc.new_degraded_stretch_mode = new_site_count;
14926 pending_inc.new_recovering_stretch_mode = 0;
14927 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14928
14929 // and then apply them to all the pg_pool_ts
14930 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14931 const string& remaining_site_name = *(live_zones.begin());
14932 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14933 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14934 for (auto pgi : osdmap.pools) {
14935 if (pgi.second.peering_crush_bucket_count) {
14936 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14937 newp.peering_crush_bucket_count = new_site_count;
14938 newp.peering_crush_mandatory_member = remaining_site;
14939 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14940 newp.set_last_force_op_resend(pending_inc.epoch);
14941 }
14942 }
14943 propose_pending();
14944 }
14945
14946 void OSDMonitor::trigger_recovery_stretch_mode()
14947 {
14948 dout(20) << __func__ << dendl;
14949 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14950 pending_inc.change_stretch_mode = true;
14951 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14952 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14953 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14954 pending_inc.new_recovering_stretch_mode = 1;
14955 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14956
14957 for (auto pgi : osdmap.pools) {
14958 if (pgi.second.peering_crush_bucket_count) {
14959 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14960 newp.set_last_force_op_resend(pending_inc.epoch);
14961 }
14962 }
14963 propose_pending();
14964 }
14965
// Entering degraded stretch mode: clear the recovery-trigger timestamp,
// since the countdown toward ending recovery no longer applies.
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14970
// Record when recovery stretch mode began; try_end_recovery_stretch_mode()
// compares against this timestamp to enforce a minimum wait. Only set it
// if unset so repeated calls don't keep pushing the deadline out.
void OSDMonitor::set_recovery_stretch_mode()
{
  if (stretch_recovery_triggered.is_zero()) {
    stretch_recovery_triggered = ceph_clock_now();
  }
}
14977
// Back to healthy stretch mode: drop the recovery-trigger timestamp.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14982
14983 void OSDMonitor::notify_new_pg_digest()
14984 {
14985 dout(20) << __func__ << dendl;
14986 if (!stretch_recovery_triggered.is_zero()) {
14987 try_end_recovery_stretch_mode(false);
14988 }
14989 }
14990
14991 struct CMonExitRecovery : public Context {
14992 OSDMonitor *m;
14993 bool force;
14994 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14995 void finish(int r) {
14996 m->try_end_recovery_stretch_mode(force);
14997 }
14998 };
14999
// Attempt to leave recovering stretch mode and return to healthy.
// Only the leader may do this, and only while the map is actually in the
// degraded+recovering stretch states; when our own state or the mgr stat
// state isn't readable yet, we queue a CMonExitRecovery to retry later.
// With force=true, skip the minimum-wait and clean-PG requirements.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // retry once our paxos state is readable again
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // NOTE(review): the minimum-wait check subtracts the configured wait
  // (seconds, as double) from the current time and compares against the
  // trigger timestamp — presumably equivalent to "now > trigger + wait";
  // confirm utime_t's operator-(double) semantics.
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // need PG stats to decide; retry when the mgr stat state is readable
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced PGs are tolerated; degraded/inactive/unknown are not
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
15029
15030 void OSDMonitor::trigger_healthy_stretch_mode()
15031 {
15032 ceph_assert(is_writeable());
15033 stretch_recovery_triggered.set_from_double(0);
15034 pending_inc.change_stretch_mode = true;
15035 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
15036 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
15037 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
15038 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
15039 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
15040 for (auto pgi : osdmap.pools) {
15041 if (pgi.second.peering_crush_bucket_count) {
15042 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
15043 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
15044 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
15045 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
15046 newp.set_last_force_op_resend(pending_inc.epoch);
15047 }
15048 }
15049 propose_pending();
15050 }