]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
5d76862e0a8346a593879f0f69b55870a04fc912
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 using std::dec;
95 using std::hex;
96 using std::list;
97 using std::map;
98 using std::make_pair;
99 using std::ostringstream;
100 using std::pair;
101 using std::set;
102 using std::string;
103 using std::stringstream;
104 using std::to_string;
105 using std::vector;
106
107 using ceph::bufferlist;
108 using ceph::decode;
109 using ceph::encode;
110 using ceph::ErasureCodeInterfaceRef;
111 using ceph::ErasureCodePluginRegistry;
112 using ceph::ErasureCodeProfile;
113 using ceph::Formatter;
114 using ceph::JSONFormatter;
115 using ceph::make_message;
116
#define dout_subsys ceph_subsys_mon

// kvstore key prefixes for osdmon-owned data that lives outside the
// paxos service namespace (advertised via get_store_prefixes()).
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
121
122 /*
123
124 OSD snapshot metadata
125 ---------------------
126
127 -- starting with mimic, removed in octopus --
128
129 "removed_epoch_%llu_%08lx" % (pool, epoch)
130 -> interval_set<snapid_t>
131
132 "removed_snap_%llu_%016llx" % (pool, last_snap)
133 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
134
135
136 -- starting with mimic --
137
138 "purged_snap_%llu_%016llx" % (pool, last_snap)
139 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
140
 - note that the {removed,purged}_snap keys put the last snap in the key so
   that we can use forward iteration only to search for an epoch in an
   interval. e.g., to test if epoch N is removed/purged, we'll find a key
   >= N that either does or doesn't contain the given snap.
145
146
147 -- starting with octopus --
148
149 "purged_epoch_%08lx" % epoch
150 -> map<int64_t,interval_set<snapid_t>>
151
152 */
153 using namespace TOPNSPC::common;
154 namespace {
155
// Adapter exposing an osdmon LRU cache to the PriorityCache manager
// (pcm).  The manager periodically asks each registered cache how many
// bytes it wants per priority and later commits a share of the total
// memory budget back to it; subclasses report the bytes actually used
// by the underlying LRU.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;
  // bytes requested/assigned per priority level
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // last size granted by the manager
  double cache_ratio = 0;       // fraction of the budget given to this cache

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes currently held by the underlying LRU (subclass-specific).
  virtual uint64_t _get_used_bytes() const = 0;

  // Additional bytes wanted at priority 'pri'; only PRI1 is meaningful
  // since all cached items are given PRI1.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        // ask only for the shortfall beyond what is already assigned
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assigned bytes across every priority level.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the request down/up to a chunk of the total budget and record
  // it as our committed size.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
      get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Bin-based aging is not used by the osdmon caches; these are no-ops.
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  // Human-readable name used in pcm logging/perf output.
  virtual string get_cache_name() const = 0;
};
229
230 struct IncCache : public OSDMemCache {
231 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
232
233 virtual uint64_t _get_used_bytes() const {
234 return osdmon->inc_osd_cache.get_bytes();
235 }
236
237 virtual string get_cache_name() const {
238 return "OSDMap Inc Cache";
239 }
240
241 uint64_t _get_num_osdmaps() const {
242 return osdmon->inc_osd_cache.get_size();
243 }
244 };
245
246 struct FullCache : public OSDMemCache {
247 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
248
249 virtual uint64_t _get_used_bytes() const {
250 return osdmon->full_osd_cache.get_bytes();
251 }
252
253 virtual string get_cache_name() const {
254 return "OSDMap Full Cache";
255 }
256
257 uint64_t _get_num_osdmaps() const {
258 return osdmon->full_osd_cache.get_size();
259 }
260 };
261
// Shared with the PriorityCache manager (pcm); created in the
// OSDMonitor constructor.
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Hard limits on per-pool application metadata accepted from clients.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
268
269 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
270 // Note: this doesn't include support for the application tag match
271 if ((grant.spec.allow & OSD_CAP_W) != 0) {
272 auto& match = grant.match;
273 if (match.is_match_all()) {
274 return true;
275 } else if (pool_name != nullptr &&
276 !match.pool_namespace.pool_name.empty() &&
277 match.pool_namespace.pool_name == *pool_name) {
278 return true;
279 }
280 }
281 return false;
282 }
283
// Decide whether 'entity_name' may issue unmanaged-snapshot pool ops:
// first via an explicit mon cap for "osd pool op unmanaged-snap", then
// by falling back to the entity's OSD caps stored in the auth db (write
// permission on the pool — or on all pools — is sufficient).
// 'pool_name' may be null when the pool does not exist; an unrestricted
// cap is then required.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
         CommandArgs{} /* pool DNE, require unrestricted cap */ :
         CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // No explicit mon cap: inspect the entity's OSD caps from the auth db.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into a list of grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
351
352 } // anonymous namespace
353
// Record that PG 'ps' of this pool has been clean through
// 'last_epoch_clean'.  Maintains two invariants: 'floor' is the minimum
// lec over all PGs that have reported, and 'next_missing' is the first
// PG index that has not reported yet (lec == 0).
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
                                 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down; recompute
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past every PG that has now reported (lec != 0)
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
387
// Forget all last-epoch-clean state for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
392
// Route a single PG's last-epoch-clean report to its pool's Lec record,
// creating the record on first report for that pool.
void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
                            epoch_t last_epoch_clean)
{
  auto& lec = report_by_pool[pg.pool()];
  return lec.report(pg_num, pg.ps(), last_epoch_clean);
}
399
400 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
401 {
402 auto floor = latest.get_epoch();
403 for (auto& pool : latest.get_pools()) {
404 auto reported = report_by_pool.find(pool.first);
405 if (reported == report_by_pool.end()) {
406 return 0;
407 }
408 if (reported->second.next_missing < pool.second.get_pg_num()) {
409 return 0;
410 }
411 if (reported->second.floor < floor) {
412 floor = reported->second.floor;
413 }
414 }
415 return floor;
416 }
417
418 void LastEpochClean::dump(Formatter *f) const
419 {
420 f->open_array_section("per_pool");
421
422 for (auto& [pool, lec] : report_by_pool) {
423 f->open_object_section("pool");
424 f->dump_unsigned("poolid", pool);
425 f->dump_unsigned("floor", lec.floor);
426 f->close_section();
427 }
428
429 f->close_section();
430 }
431
// Completion context for an async OSDMap->PG mapping job: on success it
// logs the elapsed time, refreshes the creating-PGs state and wakes any
// pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;   // when the mapping job was started (for timing)
  epoch_t epoch;   // epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted (e.g. superseded by a newer epoch)
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
449
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix: "mon.<name>@<rank>(<state>).osd e<epoch> "
static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon.name << "@" << mon.rank
                << "(" << mon.get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
457
// Construct the osd paxos service: wraps the two osdmap LRUs for the
// priority cache manager and registers as a config observer so cache
// sizing reacts to runtime option changes.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // fall back to the fixed-size LRUs configured by mon_osd_cache_size
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
481
// Config keys we subscribe to; changes are delivered to
// handle_conf_change().
const char **OSDMonitor::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "mon_memory_target",
    "mon_memory_autotune",
    "rocksdb_cache_size",
    NULL  // terminator
  };
  return KEYS;
}
492
493 void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
494 const std::set<std::string> &changed)
495 {
496 dout(10) << __func__ << " " << changed << dendl;
497
498 if (changed.count("mon_memory_autotune")) {
499 _set_cache_autotuning();
500 }
501 if (changed.count("mon_memory_target") ||
502 changed.count("rocksdb_cache_size")) {
503 int r = _update_mon_cache_settings();
504 if (r < 0) {
505 derr << __func__ << " mon_memory_target:"
506 << g_conf()->mon_memory_target
507 << " rocksdb_cache_size:"
508 << g_conf()->rocksdb_cache_size
509 << ". Unable to update cache size."
510 << dendl;
511 }
512 }
513 }
514
// React to a change of mon_memory_autotune: tear down or stand up the
// priority cache manager (pcm) accordingly.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
    // NOTE(review): the mon_memory_autotune member flag is not cleared
    // here; it is only set false on a failed re-registration below —
    // confirm whether leaving it true on disable is intended.
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
536
// Apply new mon_memory_target / rocksdb_cache_size values: recompute
// the kv/inc/full cache ratios and, when autotuning is active, push the
// new min/max/target into the priority cache manager.  Returns 0 on
// success, -EINVAL on invalid sizes or ratio failure (in which case the
// previous sizes are restored).
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation before computing the max
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
596
// Initialize cache sizing state from config when autotuning is enabled.
// Returns -EINVAL if the configured target/min sizes are unusable, in
// which case the caller falls back to fixed-size LRUs.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the osd memory model constants for base usage/fragmentation
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
621
// True if the pending incremental carries a new crush map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
626
// The crush map of the committed osdmap (ignores any pending change).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
631
632 CrushWrapper OSDMonitor::_get_pending_crush()
633 {
634 bufferlist bl;
635 if (pending_inc.crush.length())
636 bl = pending_inc.crush;
637 else
638 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
639
640 auto p = bl.cbegin();
641 CrushWrapper crush;
642 crush.decode(p);
643 return crush;
644 }
645
// Build the very first osdmap (epoch 1) for a new cluster: start from a
// user-supplied mkfs map if one was stashed, otherwise a simple default
// map; then set initial flags, full ratios and release requirements,
// and encode it all into the pending incremental.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // a seed osdmap was provided at mkfs time; adopt it (fixing the fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
      derr << __func__ << " mon_debug_no_require_quincy and pacific=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    } else {
      derr << __func__ << " mon_debug_no_require_quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::quincy;
  }

  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
704
705 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
706 {
707 s.insert(service_name);
708 s.insert(OSD_PG_CREATING_PREFIX);
709 s.insert(OSD_METADATA_PREFIX);
710 s.insert(OSD_SNAP_PREFIX);
711 }
712
// Bring the in-memory osdmap up to the latest committed paxos version:
// optionally jump-start from the newest stashed full map, then replay
// each committed incremental, stashing full maps (and cross-checking
// CRCs against the leader's canonical encoding) along the way.  Finally
// refresh derived state: down->out tracking, subscriptions, msgr
// feature bits and stretch-mode transitions.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // remembered so we can detect newly-up OSDs for stretch recovery below
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    // an in-flight mapping job is for an older epoch; cancel it
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the most recent stashed full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // jump-start from the stashed full map rather than replaying every
    // incremental from our current, older epoch
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // NB: each dout(20)..dendl below is a single macro-scoped block,
        // so the two local JSONFormatter declarations do not collide.
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      // the mkfs seed map has been consumed; drop it from the store
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long replay doesn't build one huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd);
      }
    }
    for (const auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
        // manually marked out, so drop it
        osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // reconcile down_pending_out (candidates for auto-out) with the new map
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
        marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
        mon.set_degraded_stretch_mode();
        // begin recovery once enough OSDs have come back up
        if (prev_num_up_osd < osdmap.num_up_osd &&
            (osdmap.num_up_osd / (double)osdmap.num_osd) >
            cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
          // TODO: This works for 2-site clusters when the OSD maps are appropriately
          // trimmed and everything is "normal" but not if you have a lot of out OSDs
          // you're ignoring or in some really degenerate failure cases
          dout(10) << "Enabling recovery stretch mode in this map" << dendl;
          mon.go_recovery_stretch_mode();
        }
      } else {
        mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
        (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
984
// Create the priority cache manager and register the rocksdb kv cache
// plus the inc/full osdmap caches with it.  Returns 0 on success,
// -EINVAL if sizes are invalid, rocksdb has no priority cache, or the
// ratios cannot be computed.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
1034
// Divide the memory budget between the kv (rocksdb) cache and the two
// osdmap caches: kv gets rocksdb_cache_size / mon_memory_target, and
// inc/full split the remainder evenly.  Fails (-EINVAL, kv ratio
// restored) if the kv cache alone would consume the whole target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
1059
1060 void OSDMonitor::start_mapping()
1061 {
1062 // initiate mapping job
1063 if (mapping_job) {
1064 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1065 << dendl;
1066 mapping_job->abort();
1067 }
1068 if (!osdmap.get_pools().empty()) {
1069 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1070 mapping_job = mapping.start_update(osdmap, mapper,
1071 g_conf()->mon_osd_mapping_pgs_per_chunk);
1072 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1073 << " at " << fin->start << dendl;
1074 mapping_job->set_finish_event(fin);
1075 } else {
1076 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1077 mapping_job = nullptr;
1078 }
1079 }
1080
1081 void OSDMonitor::update_msgr_features()
1082 {
1083 const int types[] = {
1084 entity_name_t::TYPE_OSD,
1085 entity_name_t::TYPE_CLIENT,
1086 entity_name_t::TYPE_MDS,
1087 entity_name_t::TYPE_MON
1088 };
1089 for (int type : types) {
1090 uint64_t mask;
1091 uint64_t features = osdmap.get_features(type, &mask);
1092 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1093 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1094 ceph::net::Policy p = mon.messenger->get_policy(type);
1095 p.features_required = (p.features_required & ~mask) | features;
1096 mon.messenger->set_policy(type, p);
1097 }
1098 }
1099 }
1100
1101 void OSDMonitor::on_active()
1102 {
1103 update_logger();
1104
1105 if (mon.is_leader()) {
1106 mon.clog->debug() << "osdmap " << osdmap;
1107 if (!priority_convert) {
1108 // Only do this once at start-up
1109 convert_pool_priorities();
1110 priority_convert = true;
1111 }
1112 } else {
1113 list<MonOpRequestRef> ls;
1114 take_all_failures(ls);
1115 while (!ls.empty()) {
1116 MonOpRequestRef op = ls.front();
1117 op->mark_osdmon_event(__func__);
1118 dispatch(op);
1119 ls.pop_front();
1120 }
1121 }
1122 start_mapping();
1123 }
1124
void OSDMonitor::on_restart()
{
  // Forget when each OSD last reported in; stale timestamps should not
  // carry across a restart of this service.
  last_osd_report.clear();
}
1129
1130 void OSDMonitor::on_shutdown()
1131 {
1132 dout(10) << __func__ << dendl;
1133 if (mapping_job) {
1134 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1135 << dendl;
1136 mapping_job->abort();
1137 }
1138
1139 // discard failure info, waiters
1140 list<MonOpRequestRef> ls;
1141 take_all_failures(ls);
1142 ls.clear();
1143 }
1144
1145 void OSDMonitor::update_logger()
1146 {
1147 dout(10) << "update_logger" << dendl;
1148
1149 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1150 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1151 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1152 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1153 }
1154
1155 void OSDMonitor::create_pending()
1156 {
1157 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1158 pending_inc.fsid = mon.monmap->fsid;
1159 pending_metadata.clear();
1160 pending_metadata_rm.clear();
1161 pending_pseudo_purged_snaps.clear();
1162
1163 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1164
1165 // safety checks (this shouldn't really happen)
1166 {
1167 if (osdmap.backfillfull_ratio <= 0) {
1168 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1169 if (pending_inc.new_backfillfull_ratio > 1.0)
1170 pending_inc.new_backfillfull_ratio /= 100;
1171 dout(1) << __func__ << " setting backfillfull_ratio = "
1172 << pending_inc.new_backfillfull_ratio << dendl;
1173 }
1174 if (osdmap.full_ratio <= 0) {
1175 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1176 if (pending_inc.new_full_ratio > 1.0)
1177 pending_inc.new_full_ratio /= 100;
1178 dout(1) << __func__ << " setting full_ratio = "
1179 << pending_inc.new_full_ratio << dendl;
1180 }
1181 if (osdmap.nearfull_ratio <= 0) {
1182 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1183 if (pending_inc.new_nearfull_ratio > 1.0)
1184 pending_inc.new_nearfull_ratio /= 100;
1185 dout(1) << __func__ << " setting nearfull_ratio = "
1186 << pending_inc.new_nearfull_ratio << dendl;
1187 }
1188 }
1189 }
1190
// Compute the next version of the creating-PGs bookkeeping, given the
// pending incremental (inc) and the map it will produce (nextmap).
// Works on a snapshot of creating_pgs taken under creating_pgs_lock and
// returns the updated copy; the caller is responsible for persisting it.
// Steps: (1) scan for pools added/removed since the last scan epoch,
// (2) drop pgs already reported created, (3) drop pgs that do not exist
// in nextmap, (4) move queued pgs into the creating set up to
// mon_osd_max_creating_pgs, and (5) on octopus+ quorums, advance each
// creating pg's history/past_intervals to reflect mapping changes.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the committed pools and the pools being added by inc
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue: promote queued pg ranges into the creating set,
  // capped at mon_osd_max_creating_pgs in-flight creations at once.
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      // Predicate deciding whether a past interval could have gone
      // active: enough shards to meet the pool's min_size.
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        std::stringstream debug;
        // If the mapping changed enough to start a new interval, update
        // the pg's history epochs and remember the old interval in
        // past_intervals (done inside check_new_interval).
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1377
// Decide which PGs need pg_temp entries primed into pending_inc before
// it is encoded.  If the pending change is broad (new crush map, newly
// up OSDs, or a weight increase) all PGs are examined via a parallel
// PrimeTempJob; otherwise only the PGs mapped to the "interesting" OSDs
// (going down, or weight-reduced) are walked inline, bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // OSDs whose UP bit is being flipped while currently up, i.e. going down
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // weight increases can pull PGs from anywhere; fall back to "all"
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // Estimate the per-OSD workload from the first OSD's pg count; if it
    // would touch too large a fraction of all PGs, do them all instead.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // Build the map as it will look after pending_inc is applied.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    // only check the clock every `chunk` pgs to keep the loop cheap
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;  // already handled via another interesting osd
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1480
// Consider priming pg_temp for a single pg: if its acting set will
// change in `next` and the current acting set is still usable
// (non-empty, and at least pool min_size shards), record the current
// acting set as a pg_temp entry in pending_inc.  When next_up ==
// next_acting an *empty* pg_temp entry is recorded instead, which
// clears any existing pg_temp mapping for the pg.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;  // still being created; nothing to preserve
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the background mapping job)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the post-incremental map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // lock: may be invoked concurrently from PrimeTempJob workers
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1528
1529 /**
1530 * @note receiving a transaction in this function gives a fair amount of
1531 * freedom to the service implementation if it does need it. It shouldn't.
1532 */
1533 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1534 {
1535 dout(10) << "encode_pending e " << pending_inc.epoch
1536 << dendl;
1537
1538 if (do_prune(t)) {
1539 dout(1) << __func__ << " osdmap full prune encoded e"
1540 << pending_inc.epoch << dendl;
1541 }
1542
1543 // finalize up pending_inc
1544 pending_inc.modified = ceph_clock_now();
1545
1546 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1547 ceph_assert(r == 0);
1548
1549 if (mapping_job) {
1550 if (!mapping_job->is_done()) {
1551 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1552 << mapping_job.get() << " did not complete, "
1553 << mapping_job->shards << " left" << dendl;
1554 mapping_job->abort();
1555 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1556 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1557 << mapping_job.get() << " is prior epoch "
1558 << mapping.get_epoch() << dendl;
1559 } else {
1560 if (g_conf()->mon_osd_prime_pg_temp) {
1561 maybe_prime_pg_temp();
1562 }
1563 }
1564 } else if (g_conf()->mon_osd_prime_pg_temp) {
1565 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1566 << dendl;
1567 }
1568 mapping_job.reset();
1569
1570 // ensure we don't have blank new_state updates. these are interrpeted as
1571 // CEPH_OSD_UP (and almost certainly not what we want!).
1572 auto p = pending_inc.new_state.begin();
1573 while (p != pending_inc.new_state.end()) {
1574 if (p->second == 0) {
1575 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1576 p = pending_inc.new_state.erase(p);
1577 } else {
1578 if (p->second & CEPH_OSD_UP) {
1579 pending_inc.new_last_up_change = pending_inc.modified;
1580 }
1581 ++p;
1582 }
1583 }
1584 if (!pending_inc.new_up_client.empty()) {
1585 pending_inc.new_last_up_change = pending_inc.modified;
1586 }
1587 for (auto& i : pending_inc.new_weight) {
1588 if (i.first >= osdmap.max_osd) {
1589 if (i.second) {
1590 // new osd is already marked in
1591 pending_inc.new_last_in_change = pending_inc.modified;
1592 break;
1593 }
1594 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1595 // existing osd marked in or out
1596 pending_inc.new_last_in_change = pending_inc.modified;
1597 break;
1598 }
1599 }
1600
1601 {
1602 OSDMap tmp;
1603 tmp.deepish_copy_from(osdmap);
1604 tmp.apply_incremental(pending_inc);
1605
1606 // clean pg_temp mappings
1607 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1608
1609 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1610 {
1611 // check every upmapped pg for now
1612 // until we could reliably identify certain cases to ignore,
1613 // which is obviously the hard part TBD..
1614 vector<pg_t> pgs_to_check;
1615 tmp.get_upmap_pgs(&pgs_to_check);
1616 if (pgs_to_check.size() <
1617 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1618 // not enough pgs, do it inline
1619 tmp.clean_pg_upmaps(cct, &pending_inc);
1620 } else {
1621 CleanUpmapJob job(cct, tmp, pending_inc);
1622 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1623 job.wait();
1624 }
1625 }
1626
1627 // update creating pgs first so that we can remove the created pgid and
1628 // process the pool flag removal below in the same osdmap epoch.
1629 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1630 bufferlist creatings_bl;
1631 uint64_t features = CEPH_FEATURES_ALL;
1632 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1633 dout(20) << __func__ << " encoding pending pgs without octopus features"
1634 << dendl;
1635 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1636 }
1637 encode(pending_creatings, creatings_bl, features);
1638 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1639
1640 // remove any old (or incompat) POOL_CREATING flags
1641 for (auto& i : tmp.get_pools()) {
1642 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1643 // pre-nautilus OSDMaps shouldn't get this flag.
1644 if (pending_inc.new_pools.count(i.first)) {
1645 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1646 }
1647 }
1648 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1649 !pending_creatings.still_creating_pool(i.first)) {
1650 dout(10) << __func__ << " done creating pool " << i.first
1651 << ", clearing CREATING flag" << dendl;
1652 if (pending_inc.new_pools.count(i.first) == 0) {
1653 pending_inc.new_pools[i.first] = i.second;
1654 }
1655 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1656 }
1657 }
1658
1659 // collect which pools are currently affected by
1660 // the near/backfill/full osd(s),
1661 // and set per-pool near/backfill/full flag instead
1662 set<int64_t> full_pool_ids;
1663 set<int64_t> backfillfull_pool_ids;
1664 set<int64_t> nearfull_pool_ids;
1665 tmp.get_full_pools(cct,
1666 &full_pool_ids,
1667 &backfillfull_pool_ids,
1668 &nearfull_pool_ids);
1669 if (full_pool_ids.empty() ||
1670 backfillfull_pool_ids.empty() ||
1671 nearfull_pool_ids.empty()) {
1672 // normal case - no nearfull, backfillfull or full osds
1673 // try cancel any improper nearfull/backfillfull/full pool
1674 // flags first
1675 for (auto &pool: tmp.get_pools()) {
1676 auto p = pool.first;
1677 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1678 nearfull_pool_ids.empty()) {
1679 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1680 << "'s nearfull flag" << dendl;
1681 if (pending_inc.new_pools.count(p) == 0) {
1682 // load original pool info first!
1683 pending_inc.new_pools[p] = pool.second;
1684 }
1685 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1686 }
1687 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1688 backfillfull_pool_ids.empty()) {
1689 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1690 << "'s backfillfull flag" << dendl;
1691 if (pending_inc.new_pools.count(p) == 0) {
1692 pending_inc.new_pools[p] = pool.second;
1693 }
1694 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1695 }
1696 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1697 full_pool_ids.empty()) {
1698 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1699 // set by EQUOTA, skipping
1700 continue;
1701 }
1702 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1703 << "'s full flag" << dendl;
1704 if (pending_inc.new_pools.count(p) == 0) {
1705 pending_inc.new_pools[p] = pool.second;
1706 }
1707 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1708 }
1709 }
1710 }
1711 if (!full_pool_ids.empty()) {
1712 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1713 << " as full" << dendl;
1714 for (auto &p: full_pool_ids) {
1715 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1716 continue;
1717 }
1718 if (pending_inc.new_pools.count(p) == 0) {
1719 pending_inc.new_pools[p] = tmp.pools[p];
1720 }
1721 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1722 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1723 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1724 }
1725 // cancel FLAG_FULL for pools which are no longer full too
1726 for (auto &pool: tmp.get_pools()) {
1727 auto p = pool.first;
1728 if (full_pool_ids.count(p)) {
1729 // skip pools we have just marked as full above
1730 continue;
1731 }
1732 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1733 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1734 // don't touch if currently is not full
1735 // or is running out of quota (and hence considered as full)
1736 continue;
1737 }
1738 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1739 << "'s full flag" << dendl;
1740 if (pending_inc.new_pools.count(p) == 0) {
1741 pending_inc.new_pools[p] = pool.second;
1742 }
1743 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1744 }
1745 }
1746 if (!backfillfull_pool_ids.empty()) {
1747 for (auto &p: backfillfull_pool_ids) {
1748 if (full_pool_ids.count(p)) {
1749 // skip pools we have already considered as full above
1750 continue;
1751 }
1752 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1753 // make sure FLAG_FULL is truly set, so we are safe not
1754 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1755 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1756 continue;
1757 }
1758 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1759 // don't bother if pool is already marked as backfillfull
1760 continue;
1761 }
1762 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1763 << "'s as backfillfull" << dendl;
1764 if (pending_inc.new_pools.count(p) == 0) {
1765 pending_inc.new_pools[p] = tmp.pools[p];
1766 }
1767 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1768 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1769 }
1770 // cancel FLAG_BACKFILLFULL for pools
1771 // which are no longer backfillfull too
1772 for (auto &pool: tmp.get_pools()) {
1773 auto p = pool.first;
1774 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1775 // skip pools we have just marked as backfillfull/full above
1776 continue;
1777 }
1778 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1779 // and don't touch if currently is not backfillfull
1780 continue;
1781 }
1782 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1783 << "'s backfillfull flag" << dendl;
1784 if (pending_inc.new_pools.count(p) == 0) {
1785 pending_inc.new_pools[p] = pool.second;
1786 }
1787 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1788 }
1789 }
1790 if (!nearfull_pool_ids.empty()) {
1791 for (auto &p: nearfull_pool_ids) {
1792 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1793 continue;
1794 }
1795 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1796 // make sure FLAG_FULL is truly set, so we are safe not
1797 // to set a extra (redundant) FLAG_NEARFULL flag
1798 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1799 continue;
1800 }
1801 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1802 // don't bother if pool is already marked as nearfull
1803 continue;
1804 }
1805 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1806 << "'s as nearfull" << dendl;
1807 if (pending_inc.new_pools.count(p) == 0) {
1808 pending_inc.new_pools[p] = tmp.pools[p];
1809 }
1810 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1811 }
1812 // cancel FLAG_NEARFULL for pools
1813 // which are no longer nearfull too
1814 for (auto &pool: tmp.get_pools()) {
1815 auto p = pool.first;
1816 if (full_pool_ids.count(p) ||
1817 backfillfull_pool_ids.count(p) ||
1818 nearfull_pool_ids.count(p)) {
1819 // skip pools we have just marked as
1820 // nearfull/backfillfull/full above
1821 continue;
1822 }
1823 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1824 // and don't touch if currently is not nearfull
1825 continue;
1826 }
1827 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1828 << "'s nearfull flag" << dendl;
1829 if (pending_inc.new_pools.count(p) == 0) {
1830 pending_inc.new_pools[p] = pool.second;
1831 }
1832 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1833 }
1834 }
1835
1836 // min_compat_client?
1837 if (!tmp.require_min_compat_client) {
1838 auto mv = tmp.get_min_compat_client();
1839 dout(1) << __func__ << " setting require_min_compat_client to currently "
1840 << "required " << mv << dendl;
1841 mon.clog->info() << "setting require_min_compat_client to currently "
1842 << "required " << mv;
1843 pending_inc.new_require_min_compat_client = mv;
1844 }
1845
1846 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1847 tmp.require_osd_release >= ceph_release_t::nautilus) {
1848 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1849 // add creating flags?
1850 for (auto& i : tmp.get_pools()) {
1851 if (pending_creatings.still_creating_pool(i.first)) {
1852 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1853 << dendl;
1854 if (pending_inc.new_pools.count(i.first) == 0) {
1855 pending_inc.new_pools[i.first] = i.second;
1856 }
1857 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1858 }
1859 }
1860 // adjust blocklist items to all be TYPE_ANY
1861 for (auto& i : tmp.blocklist) {
1862 auto a = i.first;
1863 a.set_type(entity_addr_t::TYPE_ANY);
1864 pending_inc.new_blocklist[a] = i.second;
1865 pending_inc.old_blocklist.push_back(i.first);
1866 }
1867 }
1868
1869 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1870 tmp.require_osd_release >= ceph_release_t::octopus) {
1871 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1872
1873 // adjust obsoleted cache modes
1874 for (auto& [poolid, pi] : tmp.pools) {
1875 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1876 if (pending_inc.new_pools.count(poolid) == 0) {
1877 pending_inc.new_pools[poolid] = pi;
1878 }
1879 dout(10) << __func__ << " switching pool " << poolid
1880 << " cachemode from forward -> proxy" << dendl;
1881 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1882 }
1883 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1884 if (pending_inc.new_pools.count(poolid) == 0) {
1885 pending_inc.new_pools[poolid] = pi;
1886 }
1887 dout(10) << __func__ << " switching pool " << poolid
1888 << " cachemode from readforward -> readproxy" << dendl;
1889 pending_inc.new_pools[poolid].cache_mode =
1890 pg_pool_t::CACHEMODE_READPROXY;
1891 }
1892 }
1893
1894 // clear removed_snaps for every pool
1895 for (auto& [poolid, pi] : tmp.pools) {
1896 if (pi.removed_snaps.empty()) {
1897 continue;
1898 }
1899 if (pending_inc.new_pools.count(poolid) == 0) {
1900 pending_inc.new_pools[poolid] = pi;
1901 }
1902 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1903 << dendl;
1904 pending_inc.new_pools[poolid].removed_snaps.clear();
1905 }
1906
1907 // create a combined purged snap epoch key for all purged snaps
1908 // prior to this epoch, and store it in the current epoch (i.e.,
1909 // the last pre-octopus epoch, just prior to the one we're
1910 // encoding now).
1911 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1912 it->lower_bound("purged_snap_");
1913 map<int64_t,snap_interval_set_t> combined;
1914 while (it->valid()) {
1915 if (it->key().find("purged_snap_") != 0) {
1916 break;
1917 }
1918 string k = it->key();
1919 long long unsigned pool;
1920 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1921 if (n != 1) {
1922 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1923 } else {
1924 bufferlist v = it->value();
1925 auto p = v.cbegin();
1926 snapid_t begin, end;
1927 ceph::decode(begin, p);
1928 ceph::decode(end, p);
1929 combined[pool].insert(begin, end - begin);
1930 }
1931 it->next();
1932 }
1933 if (!combined.empty()) {
1934 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1935 bufferlist v;
1936 ceph::encode(combined, v);
1937 t->put(OSD_SNAP_PREFIX, k, v);
1938 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1939 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1940 << dendl;
1941 } else {
1942 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1943 << dendl;
1944 }
1945
1946 // clean out the old removed_snap_ and removed_epoch keys
1947 // ('`' is ASCII '_' + 1)
1948 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1949 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1950 }
1951 }
1952
1953 // tell me about it
1954 for (auto i = pending_inc.new_state.begin();
1955 i != pending_inc.new_state.end();
1956 ++i) {
1957 int s = i->second ? i->second : CEPH_OSD_UP;
1958 if (s & CEPH_OSD_UP) {
1959 dout(2) << " osd." << i->first << " DOWN" << dendl;
1960 // Reset laggy parameters if failure interval exceeds a threshold.
1961 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1962 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1963 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1964 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1965 set_default_laggy_params(i->first);
1966 }
1967 }
1968 }
1969 if (s & CEPH_OSD_EXISTS)
1970 dout(2) << " osd." << i->first << " DNE" << dendl;
1971 }
1972 for (auto i = pending_inc.new_up_client.begin();
1973 i != pending_inc.new_up_client.end();
1974 ++i) {
1975 //FIXME: insert cluster addresses too
1976 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1977 }
1978 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1979 i != pending_inc.new_weight.end();
1980 ++i) {
1981 if (i->second == CEPH_OSD_OUT) {
1982 dout(2) << " osd." << i->first << " OUT" << dendl;
1983 } else if (i->second == CEPH_OSD_IN) {
1984 dout(2) << " osd." << i->first << " IN" << dendl;
1985 } else {
1986 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1987 }
1988 }
1989
1990 // features for osdmap and its incremental
1991 uint64_t features;
1992
1993 // encode full map and determine its crc
1994 OSDMap tmp;
1995 {
1996 tmp.deepish_copy_from(osdmap);
1997 tmp.apply_incremental(pending_inc);
1998
1999 // determine appropriate features
2000 features = tmp.get_encoding_features();
2001 dout(10) << __func__ << " encoding full map with "
2002 << tmp.require_osd_release
2003 << " features " << features << dendl;
2004
2005 // the features should be a subset of the mon quorum's features!
2006 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2007
2008 bufferlist fullbl;
2009 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2010 pending_inc.full_crc = tmp.get_crc();
2011
2012 // include full map in the txn. note that old monitors will
2013 // overwrite this. new ones will now skip the local full map
2014 // encode and reload from this.
2015 put_version_full(t, pending_inc.epoch, fullbl);
2016 }
2017
2018 // encode
2019 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2020 bufferlist bl;
2021 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2022
2023 dout(20) << " full_crc " << tmp.get_crc()
2024 << " inc_crc " << pending_inc.inc_crc << dendl;
2025
2026 /* put everything in the transaction */
2027 put_version(t, pending_inc.epoch, bl);
2028 put_last_committed(t, pending_inc.epoch);
2029
2030 // metadata, too!
2031 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2032 p != pending_metadata.end();
2033 ++p) {
2034 Metadata m;
2035 auto mp = p->second.cbegin();
2036 decode(m, mp);
2037 auto it = m.find("osd_objectstore");
2038 if (it != m.end()) {
2039 if (it->second == "filestore") {
2040 filestore_osds.insert(p->first);
2041 } else {
2042 filestore_osds.erase(p->first);
2043 }
2044 }
2045 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2046 }
2047 for (set<int>::iterator p = pending_metadata_rm.begin();
2048 p != pending_metadata_rm.end();
2049 ++p) {
2050 filestore_osds.erase(*p);
2051 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2052 }
2053 pending_metadata.clear();
2054 pending_metadata_rm.clear();
2055
2056 // purged_snaps
2057 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2058 !pending_inc.new_purged_snaps.empty()) {
2059 // all snaps purged this epoch (across all pools)
2060 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2061 bufferlist v;
2062 encode(pending_inc.new_purged_snaps, v);
2063 t->put(OSD_SNAP_PREFIX, k, v);
2064 }
2065 for (auto& i : pending_inc.new_purged_snaps) {
2066 for (auto q = i.second.begin();
2067 q != i.second.end();
2068 ++q) {
2069 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2070 pending_inc.epoch,
2071 t);
2072 }
2073 }
2074 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2075 for (auto snap : snaps) {
2076 insert_purged_snap_update(pool, snap, snap + 1,
2077 pending_inc.epoch,
2078 t);
2079 }
2080 }
2081
2082 // health
2083 health_check_map_t next;
2084 tmp.check_health(cct, &next);
2085 // OSD_FILESTORE
2086 check_for_filestore_osds(&next);
2087 encode_health(next, t);
2088 }
2089
2090 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2091 {
2092 bufferlist bl;
2093 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2094 if (r < 0)
2095 return r;
2096 try {
2097 auto p = bl.cbegin();
2098 decode(m, p);
2099 }
2100 catch (ceph::buffer::error& e) {
2101 if (err)
2102 *err << "osd." << osd << " metadata is corrupt";
2103 return -EIO;
2104 }
2105 return 0;
2106 }
2107
2108 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2109 {
2110 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2111 if (osdmap.is_up(osd)) {
2112 map<string,string> meta;
2113 load_metadata(osd, meta, nullptr);
2114 auto p = meta.find(field);
2115 if (p == meta.end()) {
2116 (*out)["unknown"]++;
2117 } else {
2118 (*out)[p->second]++;
2119 }
2120 }
2121 }
2122 }
2123
2124 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2125 {
2126 map<string,int> by_val;
2127 count_metadata(field, &by_val);
2128 f->open_object_section(field.c_str());
2129 for (auto& p : by_val) {
2130 f->dump_int(p.first.c_str(), p.second);
2131 }
2132 f->close_section();
2133 }
2134
2135 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2136 {
2137 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2138 if (osdmap.is_up(osd)) {
2139 map<string,string> meta;
2140 load_metadata(osd, meta, nullptr);
2141 auto p = meta.find("ceph_version_short");
2142 if (p == meta.end()) continue;
2143 versions[p->second].push_back(string("osd.") + stringify(osd));
2144 }
2145 }
2146 }
2147
2148 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2149 {
2150 map<string, string> metadata;
2151 int r = load_metadata(osd, metadata, nullptr);
2152 if (r < 0)
2153 return r;
2154
2155 auto it = metadata.find("osd_objectstore");
2156 if (it == metadata.end())
2157 return -ENOENT;
2158 *type = it->second;
2159 return 0;
2160 }
2161
2162 void OSDMonitor::get_filestore_osd_list()
2163 {
2164 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2165 string objectstore_type;
2166 int r = get_osd_objectstore_type(osd, &objectstore_type);
2167 if (r == 0 && objectstore_type == "filestore") {
2168 filestore_osds.insert(osd);
2169 }
2170 }
2171 }
2172
2173 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2174 {
2175 if (g_conf()->mon_warn_on_filestore_osds &&
2176 filestore_osds.size() > 0) {
2177 ostringstream ss, deprecated_tip;
2178 list<string> detail;
2179 ss << filestore_osds.size()
2180 << " osd(s) "
2181 << (filestore_osds.size() == 1 ? "is" : "are")
2182 << " running Filestore";
2183 deprecated_tip << ss.str();
2184 ss << " [Deprecated]";
2185 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2186 filestore_osds.size());
2187 deprecated_tip << ", which has been deprecated and"
2188 << " not been optimized for QoS"
2189 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2190 detail.push_back(deprecated_tip.str());
2191 d.detail.swap(detail);
2192 }
2193 }
2194
2195 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2196 const pg_pool_t &pool,
2197 ostream *err)
2198 {
2199 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2200 // since filestore osds could always join the pool later
2201 set<int> checked_osds;
2202 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2203 vector<int> up, acting;
2204 pg_t pgid(ps, pool_id);
2205 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2206 for (int osd : up) {
2207 if (checked_osds.find(osd) != checked_osds.end())
2208 continue;
2209 string objectstore_type;
2210 int r = get_osd_objectstore_type(osd, &objectstore_type);
2211 // allow with missing metadata, e.g. due to an osd never booting yet
2212 if (r < 0 || objectstore_type == "bluestore") {
2213 checked_osds.insert(osd);
2214 continue;
2215 }
2216 *err << "osd." << osd << " uses " << objectstore_type;
2217 return false;
2218 }
2219 }
2220 return true;
2221 }
2222
2223 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2224 {
2225 map<string,string> m;
2226 if (int r = load_metadata(osd, m, err))
2227 return r;
2228 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2229 f->dump_string(p->first.c_str(), p->second);
2230 return 0;
2231 }
2232
2233 void OSDMonitor::print_nodes(Formatter *f)
2234 {
2235 // group OSDs by their hosts
2236 map<string, list<int> > osds; // hostname => osd
2237 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2238 map<string, string> m;
2239 if (load_metadata(osd, m, NULL)) {
2240 continue;
2241 }
2242 map<string, string>::iterator hostname = m.find("hostname");
2243 if (hostname == m.end()) {
2244 // not likely though
2245 continue;
2246 }
2247 osds[hostname->second].push_back(osd);
2248 }
2249
2250 dump_services(f, osds, "osd");
2251 }
2252
// After a commit, push the newest osdmap incremental to one randomly
// chosen up OSD so the new epoch can propagate through the cluster
// via peer-to-peer sharing.
void OSDMonitor::share_map_with_random_osd()
{
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon.get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2279
// Compute the highest osdmap version that may safely be trimmed from
// the store right now, or 0 when trimming must not happen (no quorum,
// PGs still creating, or the debug block option is set).
version_t OSDMonitor::get_trim_to() const
{
  // without a quorum we cannot reason about cluster-wide state
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    // PGs still being created may need arbitrarily old maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  // debug hook: an operator can freeze osdmap trimming entirely
  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	    << " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // never trim past the oldest epoch an OSD or PG may still need
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // operator override (debug/repair): force a specific trim point
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
	       << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs maps
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only worth trimming if the floor is above what is already trimmed
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2327
2328 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2329 {
2330 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2331 // also scan osd epochs
2332 // don't trim past the oldest reported osd epoch
2333 for (auto [osd, epoch] : osd_epochs) {
2334 if (epoch < floor) {
2335 floor = epoch;
2336 }
2337 }
2338 return floor;
2339 }
2340
2341 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2342 version_t first)
2343 {
2344 dout(10) << __func__ << " including full map for e " << first << dendl;
2345 bufferlist bl;
2346 get_version_full(first, bl);
2347 put_version_full(tx, first, bl);
2348
2349 if (has_osdmap_manifest &&
2350 first > osdmap_manifest.get_first_pinned()) {
2351 _prune_update_trimmed(tx, first);
2352 }
2353 }
2354
2355
2356 /* full osdmap prune
2357 *
2358 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2359 */
2360
// (Re)load the osdmap pruning manifest from the store into memory, or
// drop the cached in-memory copy if the store no longer has one.
void OSDMonitor::load_osdmap_manifest()
{
  bool store_has_manifest =
    mon.store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;
    }

    // the store copy vanished (e.g. pruning completed and the manifest
    // was erased); invalidate our cached state as well.
    dout(20) << __func__
	     << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
	   << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // the key exists but cannot be read: the store is corrupt and
    // there is no sane way to continue.
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
	   << osdmap_manifest.get_first_pinned()
	   << " .. "
	   << osdmap_manifest.get_last_pinned()
	   << ")"
	   << dendl;
}
2397
// Decide whether a full-map prune pass should run now, based on how
// many epochs we currently hold, the prune_min/prune_interval options,
// and how far a previous prune (if any) already got.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // highest epoch we could pin: keep min_osdmap_epochs untouched at the tail
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // an earlier prune already pinned everything up to the limit
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits before the limit
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2457
// Called (via encode_trim_extra) when the trim point has moved past
// the first pinned map: drop all pinned versions below `first` from
// the manifest (pinning `first` itself if needed), and erase the
// manifest entirely once nothing meaningful remains pinned so a
// future prune pass can start from scratch.
void OSDMonitor::_prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first)
{
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  // work on a copy; the in-memory manifest is refreshed on commit
  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase every pinned version strictly below `first`
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2491
// Seed a prune pass: choose the first version to pin -- either the
// first committed version (never pruned before, or prior state gone)
// or the last previously pinned version (resuming an earlier prune).
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2526
2527 bool OSDMonitor::_prune_sanitize_options() const
2528 {
2529 uint64_t prune_interval =
2530 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2531 uint64_t prune_min =
2532 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2533 uint64_t txsize =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2535
2536 bool r = true;
2537
2538 if (prune_interval == 0) {
2539 derr << __func__
2540 << " prune is enabled BUT prune interval is zero; abort."
2541 << dendl;
2542 r = false;
2543 } else if (prune_interval == 1) {
2544 derr << __func__
2545 << " prune interval is equal to one, which essentially means"
2546 " no pruning; abort."
2547 << dendl;
2548 r = false;
2549 }
2550 if (prune_min == 0) {
2551 derr << __func__
2552 << " prune is enabled BUT prune min is zero; abort."
2553 << dendl;
2554 r = false;
2555 }
2556 if (prune_interval > prune_min) {
2557 derr << __func__
2558 << " impossible to ascertain proper prune interval because"
2559 << " it is greater than the minimum prune epochs"
2560 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2561 << dendl;
2562 r = false;
2563 }
2564
2565 if (txsize < prune_interval - 1) {
2566 derr << __func__
2567 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2568 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2569 << "); abort." << dendl;
2570 r = false;
2571 }
2572 return r;
2573 }
2574
// True when the 'mon_osdmap_full_prune_enabled' option is set.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2578
// True when the monitors' required feature set includes OSDMAP_PRUNE,
// i.e. all monitors understand the pruned-map manifest.
bool OSDMonitor::is_prune_supported() const {
  return mon.get_required_mon_features().contains_any(
    ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2583
/** do_prune
 *
 * Remove ("prune") full osdmap versions between periodically pinned
 * epochs so the store does not grow unbounded, while keeping enough
 * pinned full maps to rebuild any epoch from a pinned full map plus
 * incrementals. The pinned set is persisted under 'osdmap_manifest'.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  // bail unless pruning is on, options are sane, and there is work to do
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // the pinned endpoints must still exist in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pinned versions
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval to prune
  ceph_assert(num_pruned > 0);

  // persist the updated pinned set alongside the erasures
  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2712
2713
2714 // -------------
2715
// Paxos read path: handle queries (and updates that may be damped)
// against the current committed osdmap without creating a proposal.
// Returns true when the message is fully handled here (replied to or
// dropped); false to route it on to prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // every message type routed to this service must be handled above
    ceph_abort();
    return true;
  }
}
2771
// Paxos write path: apply a message to the pending osdmap increment.
// Each handler returns true when a proposal should be triggered.
// Only message types that passed preprocess_query() reach here.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // every message type routed here must be handled above
    ceph_abort();
  }

  return false;
}
2823
2824 bool OSDMonitor::should_propose(double& delay)
2825 {
2826 dout(10) << "should_propose" << dendl;
2827
2828 // if full map, propose immediately! any subsequent changes will be clobbered.
2829 if (pending_inc.fullmap.length())
2830 return true;
2831
2832 // adjust osd weights?
2833 if (!osd_weight.empty() &&
2834 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2835 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2836 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2837 delay = 0.0;
2838 osd_weight.clear();
2839 return true;
2840 }
2841
2842 return PaxosService::should_propose(delay);
2843 }
2844
2845
2846
2847 // ---------------------------
2848 // READs
2849
// Answer an explicit MMonGetOSDMap request with as many full and
// incremental maps from the requested ranges as fit within both the
// osd_map_message_max (count) and osd_map_message_max_bytes budgets.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the peer's connection features when known; otherwise
  // fall back to the quorum's feature set
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first...
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // ...then incrementals, drawing on the same count/byte budget
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the peer the span we can serve so it knows what to re-request
  reply->oldest_map = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2886
2887
2888 // ---------------------------
2889 // UPDATEs
2890
2891 // failure --
2892
2893 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2894 // check permissions
2895 MonSession *session = op->get_session();
2896 if (!session)
2897 return true;
2898 if (!session->is_capable("osd", MON_CAP_X)) {
2899 dout(0) << "got MOSDFailure from entity with insufficient caps "
2900 << session->caps << dendl;
2901 return true;
2902 }
2903 if (fsid != mon.monmap->fsid) {
2904 dout(0) << "check_source: on fsid " << fsid
2905 << " != " << mon.monmap->fsid << dendl;
2906 return true;
2907 }
2908 return false;
2909 }
2910
2911
// Vet an osd failure report against the current map. Unauthorized,
// stale or duplicate reports are answered/dropped here (return true);
// genuinely new reports fall through to prepare_failure() (false).
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown or stale; send it a map so it catches up
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report targets an old instance of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down() half of this condition looks
  // unreachable -- a down target was already caught by the "weird?"
  // check above; confirm before simplifying.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  // fully handled here; make sure no reply is left pending
  mon.no_reply(op);
  return true;
}
2983
2984 class C_AckMarkedDown : public C_MonOp {
2985 OSDMonitor *osdmon;
2986 public:
2987 C_AckMarkedDown(
2988 OSDMonitor *osdmon,
2989 MonOpRequestRef op)
2990 : C_MonOp(op), osdmon(osdmon) {}
2991
2992 void _finish(int r) override {
2993 if (r == 0) {
2994 auto m = op->get_req<MOSDMarkMeDown>();
2995 osdmon->mon.send_reply(
2996 op,
2997 new MOSDMarkMeDown(
2998 m->fsid,
2999 m->target_osd,
3000 m->target_addrs,
3001 m->get_epoch(),
3002 false)); // ACK itself does not request an ack
3003 } else if (r == -EAGAIN) {
3004 osdmon->dispatch(op);
3005 } else {
3006 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
3007 }
3008 }
3009 ~C_AckMarkedDown() override {
3010 }
3011 };
3012
3013 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
3014 {
3015 op->mark_osdmon_event(__func__);
3016 auto m = op->get_req<MOSDMarkMeDown>();
3017 int from = m->target_osd;
3018
3019 // check permissions
3020 if (check_source(op, m->fsid))
3021 goto reply;
3022
3023 // first, verify the reporting host is valid
3024 if (!m->get_orig_source().is_osd())
3025 goto reply;
3026
3027 if (!osdmap.exists(from) ||
3028 osdmap.is_down(from) ||
3029 osdmap.get_addrs(from) != m->target_addrs) {
3030 dout(5) << "preprocess_mark_me_down from dead osd."
3031 << from << ", ignoring" << dendl;
3032 send_incremental(op, m->get_epoch()+1);
3033 goto reply;
3034 }
3035
3036 // no down might be set
3037 if (!can_mark_down(from))
3038 goto reply;
3039
3040 dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
3041 << " " << m->target_addrs << dendl;
3042 return false;
3043
3044 reply:
3045 if (m->request_ack) {
3046 Context *c(new C_AckMarkedDown(this, op));
3047 c->complete(0);
3048 }
3049 return true;
3050 }
3051
3052 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
3053 {
3054 op->mark_osdmon_event(__func__);
3055 auto m = op->get_req<MOSDMarkMeDown>();
3056 int target_osd = m->target_osd;
3057
3058 ceph_assert(osdmap.is_up(target_osd));
3059 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
3060
3061 mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
3062 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3063 if (m->down_and_dead) {
3064 if (!pending_inc.new_xinfo.count(target_osd)) {
3065 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3066 }
3067 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
3068 }
3069 if (m->request_ack)
3070 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
3071 return true;
3072 }
3073
3074 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
3075 {
3076 op->mark_osdmon_event(__func__);
3077 auto m = op->get_req<MOSDMarkMeDead>();
3078 int from = m->target_osd;
3079
3080 // check permissions
3081 if (check_source(op, m->fsid)) {
3082 mon.no_reply(op);
3083 return true;
3084 }
3085
3086 // first, verify the reporting host is valid
3087 if (!m->get_orig_source().is_osd()) {
3088 mon.no_reply(op);
3089 return true;
3090 }
3091
3092 if (!osdmap.exists(from) ||
3093 !osdmap.is_down(from)) {
3094 dout(5) << __func__ << " from nonexistent or up osd." << from
3095 << ", ignoring" << dendl;
3096 send_incremental(op, m->get_epoch()+1);
3097 mon.no_reply(op);
3098 return true;
3099 }
3100
3101 return false;
3102 }
3103
3104 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
3105 {
3106 op->mark_osdmon_event(__func__);
3107 auto m = op->get_req<MOSDMarkMeDead>();
3108 int target_osd = m->target_osd;
3109
3110 ceph_assert(osdmap.is_down(target_osd));
3111
3112 mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
3113 << m->get_epoch();
3114 if (!pending_inc.new_xinfo.count(target_osd)) {
3115 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3116 }
3117 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
3118 wait_for_finished_proposal(
3119 op,
3120 new LambdaContext(
3121 [op, this] (int r) {
3122 if (r >= 0) {
3123 mon.no_reply(op); // ignore on success
3124 }
3125 }
3126 ));
3127 return true;
3128 }
3129
3130 bool OSDMonitor::can_mark_down(int i)
3131 {
3132 if (osdmap.is_nodown(i)) {
3133 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3134 << "will not mark it down" << dendl;
3135 return false;
3136 }
3137
3138 int num_osds = osdmap.get_num_osds();
3139 if (num_osds == 0) {
3140 dout(5) << __func__ << " no osds" << dendl;
3141 return false;
3142 }
3143 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3144 float up_ratio = (float)up / (float)num_osds;
3145 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3146 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3147 << g_conf()->mon_osd_min_up_ratio
3148 << ", will not mark osd." << i << " down" << dendl;
3149 return false;
3150 }
3151 return true;
3152 }
3153
3154 bool OSDMonitor::can_mark_up(int i)
3155 {
3156 if (osdmap.is_noup(i)) {
3157 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3158 << "will not mark it up" << dendl;
3159 return false;
3160 }
3161
3162 return true;
3163 }
3164
3165 /**
3166 * @note the parameter @p i apparently only exists here so we can output the
3167 * osd's id on messages.
3168 */
3169 bool OSDMonitor::can_mark_out(int i)
3170 {
3171 if (osdmap.is_noout(i)) {
3172 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3173 << "will not mark it out" << dendl;
3174 return false;
3175 }
3176
3177 int num_osds = osdmap.get_num_osds();
3178 if (num_osds == 0) {
3179 dout(5) << __func__ << " no osds" << dendl;
3180 return false;
3181 }
3182 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3183 float in_ratio = (float)in / (float)num_osds;
3184 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3185 if (i >= 0)
3186 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3187 << g_conf()->mon_osd_min_in_ratio
3188 << ", will not mark osd." << i << " out" << dendl;
3189 else
3190 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osds out" << dendl;
3193 return false;
3194 }
3195
3196 return true;
3197 }
3198
3199 bool OSDMonitor::can_mark_in(int i)
3200 {
3201 if (osdmap.is_noin(i)) {
3202 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3203 << "will not mark it in" << dendl;
3204 return false;
3205 }
3206
3207 return true;
3208 }
3209
// Walk all outstanding failure records and queue a down-mark for each osd
// that now has sufficient evidence (per check_failure()).  Stale records
// are dropped along the way.
// Returns true if at least one osd was marked down in pending_inc, i.e. a
// proposal is warranted.
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      // down-mark queued; the record stays until process_failures() runs
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // the reports sat around too long without a verdict; forget them
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3231
3232 utime_t OSDMonitor::get_grace_time(utime_t now,
3233 int target_osd,
3234 failure_info_t& fi) const
3235 {
3236 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3237 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3238 return orig_grace;
3239 }
3240 utime_t grace = orig_grace;
3241 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3242 double decay_k = ::log(.5) / halflife;
3243
3244 // scale grace period based on historical probability of 'lagginess'
3245 // (false positive failures due to slowness).
3246 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3247 const utime_t failed_for = now - fi.get_failed_since();
3248 double decay = exp((double)failed_for * decay_k);
3249 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3250 << " failed_for " << failed_for << " decay " << decay << dendl;
3251 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3252 grace += my_grace;
3253
3254 // consider the peers reporting a failure a proxy for a potential
3255 // 'subcluster' over the overall cluster that is similarly
3256 // laggy. this is clearly not true in all cases, but will sometimes
3257 // help us localize the grace correction to a subset of the system
3258 // (say, a rack with a bad switch) that is unhappy.
3259 double peer_grace = 0;
3260 for (auto& [reporter, report] : fi.reporters) {
3261 if (osdmap.exists(reporter)) {
3262 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3263 utime_t elapsed = now - xi.down_stamp;
3264 double decay = exp((double)elapsed * decay_k);
3265 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3266 }
3267 }
3268 peer_grace /= (double)fi.reporters.size();
3269 grace += peer_grace;
3270 dout(10) << " osd." << target_osd << " has "
3271 << fi.reporters.size() << " reporters, "
3272 << grace << " grace (" << orig_grace << " + " << my_grace
3273 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3274 << dendl;
3275
3276 return grace;
3277 }
3278
3279 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3280 {
3281 // already pending failure?
3282 if (pending_inc.new_state.count(target_osd) &&
3283 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3284 dout(10) << " already pending failure" << dendl;
3285 return true;
3286 }
3287
3288 set<string> reporters_by_subtree;
3289 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3290 ceph_assert(fi.reporters.size());
3291 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3292 // get the parent bucket whose type matches with "reporter_subtree_level".
3293 // fall back to OSD if the level doesn't exist.
3294 if (osdmap.exists(p->first)) {
3295 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3296 if (auto iter = reporter_loc.find(reporter_subtree_level);
3297 iter == reporter_loc.end()) {
3298 reporters_by_subtree.insert("osd." + to_string(p->first));
3299 } else {
3300 reporters_by_subtree.insert(iter->second);
3301 }
3302 ++p;
3303 } else {
3304 fi.cancel_report(p->first);;
3305 p = fi.reporters.erase(p);
3306 }
3307 }
3308 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3309 return false;
3310 }
3311 const utime_t failed_for = now - fi.get_failed_since();
3312 const utime_t grace = get_grace_time(now, target_osd, fi);
3313 if (failed_for >= grace) {
3314 dout(1) << " we have enough reporters to mark osd." << target_osd
3315 << " down" << dendl;
3316 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3317
3318 mon.clog->info() << "osd." << target_osd << " failed ("
3319 << osdmap.crush->get_full_location_ordered_string(
3320 target_osd)
3321 << ") ("
3322 << (int)reporters_by_subtree.size()
3323 << " reporters from different "
3324 << reporter_subtree_level << " after "
3325 << failed_for << " >= grace " << grace << ")";
3326 return true;
3327 }
3328 return false;
3329 }
3330
3331 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3332 {
3333 // if it takes too long to either cancel the report to mark the osd down,
3334 // some reporters must have failed to cancel their reports. let's just
3335 // forget these reports.
3336 const utime_t failed_for = now - fi.get_failed_since();
3337 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3338 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3339 return failed_for >= (heartbeat_grace + heartbeat_stale);
3340 }
3341
3342 void OSDMonitor::force_failure(int target_osd, int by)
3343 {
3344 // already pending failure?
3345 if (pending_inc.new_state.count(target_osd) &&
3346 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3347 dout(10) << " already pending failure" << dendl;
3348 return;
3349 }
3350
3351 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3352 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3353 if (!pending_inc.new_xinfo.count(target_osd)) {
3354 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3355 }
3356 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3357
3358 mon.clog->info() << "osd." << target_osd << " failed ("
3359 << osdmap.crush->get_full_location_ordered_string(target_osd)
3360 << ") (connection refused reported by osd." << by << ")";
3361 return;
3362 }
3363
// Record (or cancel) an osd failure report in failure_info, possibly
// queueing a down-mark.  Returns true iff pending_inc changed and a
// proposal is needed.  Reporters are never replied to directly
// (mon.no_reply); they learn the outcome from subsequent map updates.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already verified these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time: the reporter tells us how long the target
    // has been unresponsive (failed_for), relative to message receipt
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: skip the grace-period machinery
      mon.clog->debug() << "osd." << m->get_target_osd()
			<< " reported immediately failed by "
			<< m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    // propose only if this report pushed the osd over the threshold
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
		      << " failure report canceled by "
		      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3423
3424 void OSDMonitor::process_failures()
3425 {
3426 map<int,failure_info_t>::iterator p = failure_info.begin();
3427 while (p != failure_info.end()) {
3428 if (osdmap.is_up(p->first)) {
3429 ++p;
3430 } else {
3431 dout(10) << "process_failures osd." << p->first << dendl;
3432 list<MonOpRequestRef> ls;
3433 p->second.take_report_messages(ls);
3434 failure_info.erase(p++);
3435
3436 while (!ls.empty()) {
3437 MonOpRequestRef o = ls.front();
3438 if (o) {
3439 o->mark_event(__func__);
3440 MOSDFailure *m = o->get_req<MOSDFailure>();
3441 send_latest(o, m->get_epoch());
3442 mon.no_reply(o);
3443 }
3444 ls.pop_front();
3445 }
3446 }
3447 }
3448 }
3449
3450 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3451 {
3452 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3453
3454 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3455 p != failure_info.end();
3456 ++p) {
3457 p->second.take_report_messages(ls);
3458 }
3459 failure_info.clear();
3460 }
3461
3462 int OSDMonitor::get_grace_interval_threshold()
3463 {
3464 int halflife = g_conf()->mon_osd_laggy_halflife;
3465 // Scale the halflife period (default: 1_hr) by
3466 // a factor (48) to calculate the threshold.
3467 int grace_threshold_factor = 48;
3468 return halflife * grace_threshold_factor;
3469 }
3470
3471 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3472 {
3473 int grace_interval_threshold_secs = get_grace_interval_threshold();
3474 if (last_failed_interval > grace_interval_threshold_secs) {
3475 dout(1) << " last_failed_interval " << last_failed_interval
3476 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3477 << dendl;
3478 return true;
3479 }
3480 return false;
3481 }
3482
3483 void OSDMonitor::set_default_laggy_params(int target_osd)
3484 {
3485 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3486 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3487 }
3488 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3489 xi.down_stamp = pending_inc.modified;
3490 xi.laggy_probability = 0.0;
3491 xi.laggy_interval = 0;
3492 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3493 }
3494
3495
3496 // boot --
3497
// Pre-screen an OSD boot message (MOSDBoot).
// Returns true if handled here (ignored, or answered as a duplicate /
// stale boot); false to let prepare_boot() actually mark the osd up.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because the osd lacks CEPH_FEATURE_SERVER_OCTOPUS";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
      osdmap.require_osd_release < ceph_release_t::nautilus) {
    mon.clog->info() << "disallowing boot of pacific+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < nautilus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < octopus";
    goto ignore;
  }

  // stretch mode requires every osd to support it
  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // same osd id but a different osd uuid: ignore (likely a replacement
  // device while the old entry still exists in the map)
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message sent before the osd's most recent up interval
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3603
// Actually admit a booting osd into the pending map.
// preprocess_boot() has already filtered dups, feature mismatches and
// noup; here we handle three cases: (1) still up from a previous
// incarnation -> mark down first and retry, (2) mark-up already pending
// -> just wait, (3) normal path -> record addrs/uuid/metadata/laggy
// stats and mark it up (and possibly in).
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state = committed state with pending flips applied
  // (new_state entries are XORed into the osd's state bits)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot after the down-mark commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from); // if any

    // carry the osd's superblock weight into the map (0 = leave as-is)
    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd? (it has no maps at all) -- mark the old interval lost too
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the laggy statistics that scale the failure grace period
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // not a restart after being marked down: decay the laggy estimates
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after a down-mark: fold the down interval into the
      // exponentially-weighted laggy interval/probability estimates
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight it had before being auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3758
3759 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3760 {
3761 op->mark_osdmon_event(__func__);
3762 auto m = op->get_req<MOSDBoot>();
3763 dout(7) << "_booted " << m->get_orig_source_inst()
3764 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3765
3766 if (logit) {
3767 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3768 << " boot";
3769 }
3770
3771 send_latest(op, m->sb.current_epoch+1);
3772 }
3773
3774
3775 // -------------
3776 // full
3777
// Pre-screen an MOSDFull message (an osd reporting its full /
// backfillfull / nearfull status).  Returns true if handled or ignored
// here; false to let prepare_full() record the state change.
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // drop messages from a stale instance of this osd id: either it is down
  // and the addrs match its last known instance, or it is up but with
  // different addrs
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // symbolic names of the osd's *current* state bits, for logging
  // NOTE(review): 'state' is computed from the map, not from m->state, so
  // the "want state" log below shows the old state -- confirm intended.
  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change?  just reply with the map the osd asked about.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3828
// Record a full/backfillfull/nearfull state change for an osd in the
// pending incremental.  new_state holds bits to be XORed into the osd's
// state on commit, so we compute a flip mask relative to the committed
// map, replacing any flips of these bits already pending.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask; // safety first

  // effective current state = committed state with pending flips applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending flips of these bits before recomputing them
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // flip whatever differs between the committed state and the wanted one
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  // reply with the map once the in-flight proposal commits
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3866
3867 // -------------
3868 // alive
3869
3870 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3871 {
3872 op->mark_osdmon_event(__func__);
3873 auto m = op->get_req<MOSDAlive>();
3874 int from = m->get_orig_source().num();
3875
3876 // check permissions, ignore if failed
3877 MonSession *session = op->get_session();
3878 if (!session)
3879 goto ignore;
3880 if (!session->is_capable("osd", MON_CAP_X)) {
3881 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3882 << session->caps << dendl;
3883 goto ignore;
3884 }
3885
3886 if (!osdmap.is_up(from) ||
3887 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3888 dout(7) << "preprocess_alive ignoring alive message from down "
3889 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3890 << dendl;
3891 goto ignore;
3892 }
3893
3894 if (osdmap.get_up_thru(from) >= m->want) {
3895 // yup.
3896 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3897 _reply_map(op, m->version);
3898 return true;
3899 }
3900
3901 dout(10) << "preprocess_alive want up_thru " << m->want
3902 << " from " << m->get_orig_source_inst() << dendl;
3903 return false;
3904
3905 ignore:
3906 return true;
3907 }
3908
// Leader-side handler for MOSDAlive: stage an up_thru bump for the osd in
// the pending incremental (via update_up_thru) and reply with the updated
// map once the proposal commits.
3909 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3910 {
3911 op->mark_osdmon_event(__func__);
3912 auto m = op->get_req<MOSDAlive>();
3913 int from = m->get_orig_source().num();
3914
// Intentionally disabled clog noise for alive messages (kept for reference).
3915 if (0) { // we probably don't care much about these
3916 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3917 }
3918
3919 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3920 << " from " << m->get_orig_source_inst() << dendl;
3921
3922 update_up_thru(from, m->version); // set to the latest map the OSD has
3923 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3924 return true;
3925 }
3926
3927 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3928 {
3929 op->mark_osdmon_event(__func__);
3930 dout(7) << "_reply_map " << e
3931 << " from " << op->get_req()->get_orig_source_inst()
3932 << dendl;
3933 send_latest(op, e);
3934 }
3935
3936 // pg_created
// Read-only path for MOSDPGCreated ("pg X has been created").  Only checks
// caps; the actual bookkeeping must happen on the leader, so valid messages
// always return false to be forwarded.  No reply is ever sent.
3937 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3938 {
3939 op->mark_osdmon_event(__func__);
3940 auto m = op->get_req<MOSDPGCreated>();
3941 dout(10) << __func__ << " " << *m << dendl;
3942 auto session = op->get_session();
// This message type gets no direct reply regardless of outcome.
3943 mon.no_reply(op);
3944 if (!session) {
3945 dout(10) << __func__ << ": no monitor session!" << dendl;
3946 return true;
3947 }
3948 if (!session->is_capable("osd", MON_CAP_X)) {
3949 derr << __func__ << " received from entity "
3950 << "with insufficient privileges " << session->caps << dendl;
3951 return true;
3952 }
3953 // always forward the "created!" to the leader
3954 return false;
3955 }
3956
// Leader-side handler for MOSDPGCreated: queue the pgid so it can be
// recorded as created (pending_created_pgs is presumably consumed when the
// next map is encoded -- confirm against the rest of this file).
3957 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3958 {
3959 op->mark_osdmon_event(__func__);
3960 auto m = op->get_req<MOSDPGCreated>();
3961 dout(10) << __func__ << " " << *m << dendl;
3962 auto src = m->get_orig_source();
3963 auto from = src.num();
// Only accept from an osd that the current map shows as up at the same
// address; anything else is from a stale interval.
3964 if (!src.is_osd() ||
3965 !mon.osdmon()->osdmap.is_up(from) ||
3966 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3967 m->get_orig_source_addrs())) {
3968 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
// false: nothing staged, no proposal needed for this message.
3969 return false;
3970 }
3971 pending_created_pgs.push_back(m->pgid);
// true: a change was staged and should be committed.
3972 return true;
3973 }
3974
// Read-only validation for MOSDPGReadyToMerge.  Drops messages that fail
// cap checks or that no longer match the committed pool's pg_num state;
// returns false to forward plausible requests to the prepare path.
3975 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3976 {
3977 op->mark_osdmon_event(__func__);
3978 auto m = op->get_req<MOSDPGReadyToMerge>();
3979 dout(10) << __func__ << " " << *m << dendl;
3980 const pg_pool_t *pi;
3981 auto session = op->get_session();
3982 if (!session) {
3983 dout(10) << __func__ << ": no monitor session!" << dendl;
3984 goto ignore;
3985 }
3986 if (!session->is_capable("osd", MON_CAP_X)) {
3987 derr << __func__ << " received from entity "
3988 << "with insufficient privileges " << session->caps << dendl;
3989 goto ignore;
3990 }
3991 pi = osdmap.get_pg_pool(m->pgid.pool());
3992 if (!pi) {
3993 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3994 goto ignore;
3995 }
// The merge source must be exactly the last pg in the pool: pg_num must
// equal ps+1.  The next two checks reject both "already merged" and
// "not the last pg" cases.
3996 if (pi->get_pg_num() <= m->pgid.ps()) {
3997 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3998 goto ignore;
3999 }
4000 if (pi->get_pg_num() != m->pgid.ps() + 1) {
4001 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
4002 goto ignore;
4003 }
// pg_num_pending must already point at (or below) the merge source.
4004 if (pi->get_pg_num_pending() > m->pgid.ps()) {
4005 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
4006 goto ignore;
4007 }
// Looks valid; forward to prepare_pg_ready_to_merge().
4008 return false;
4009
4010 ignore:
4011 mon.no_reply(op);
4012 return true;
4013 }
4014
// Leader-side handler for MOSDPGReadyToMerge: either complete the pg merge
// (dec_pg_num) or back it off, working on a copy of the pool that includes
// any already-pending changes.
4015 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
4016 {
4017 op->mark_osdmon_event(__func__);
4018 auto m = op->get_req<MOSDPGReadyToMerge>();
4019 dout(10) << __func__ << " " << *m << dendl;
4020 pg_pool_t p;
// Start from the pending pool state if one exists, else the committed one,
// so we do not clobber other staged changes.
4021 if (pending_inc.new_pools.count(m->pgid.pool()))
4022 p = pending_inc.new_pools[m->pgid.pool()];
4023 else
4024 p = *osdmap.get_pg_pool(m->pgid.pool());
// Re-validate against the (possibly newer) pending state; if pg_num moved
// underneath us, retry the message after the current proposal lands.
4025 if (p.get_pg_num() != m->pgid.ps() + 1 ||
4026 p.get_pg_num_pending() > m->pgid.ps()) {
4027 dout(10) << __func__
4028 << " race with concurrent pg_num[_pending] update, will retry"
4029 << dendl;
4030 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
4031 return true;
4032 }
4033
4034 if (m->ready) {
// Commit the merge: shrink pg_num and record the epochs the osd reported.
4035 p.dec_pg_num(m->pgid,
4036 pending_inc.epoch,
4037 m->source_version,
4038 m->target_version,
4039 m->last_epoch_started,
4040 m->last_epoch_clean);
4041 p.last_change = pending_inc.epoch;
4042 } else {
4043 // back off the merge attempt!
4044 p.set_pg_num_pending(p.get_pg_num());
4045 }
4046
4047 // force pre-nautilus clients to resend their ops, since they
4048 // don't understand pg_num_pending changes form a new interval
4049 p.last_force_op_resend_prenautilus = pending_inc.epoch;
4050
4051 pending_inc.new_pools[m->pgid.pool()] = p;
4052
// Test hook: with configurable probability, immediately bounce pg_num back
// up via a self-injected "osd pool set" command to exercise merge races.
4053 auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
4054 if (m->ready &&
4055 prob > 0 &&
4056 prob > (double)(rand() % 1000)/1000.0) {
4057 derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
4058 auto n = new MMonCommand(mon.monmap->get_fsid());
4059 n->set_connection(m->get_connection());
4060 n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4061 osdmap.get_pool_name(m->pgid.pool()) +
4062 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4063 stringify(m->pgid.ps() + 1) + "\"}" };
4064 MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
4065 nop->set_type_service();
4066 wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
4067 } else {
4068 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
4069 }
4070 return true;
4071 }
4072
4073
4074 // -------------
4075 // pg_temp changes
4076
// Read-only path for MOSDPGTemp.  Filters out mappings that are stale or
// no-ops; only returns false (forward to prepare_pgtemp) when at least one
// entry would actually change pg_temp/primary_temp state.
4077 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
4078 {
4079 auto m = op->get_req<MOSDPGTemp>();
4080 dout(10) << "preprocess_pgtemp " << *m << dendl;
4081 mempool::osdmap::vector<int> empty;
4082 int from = m->get_orig_source().num();
4083 size_t ignore_cnt = 0;
4084
4085 // check caps
4086 MonSession *session = op->get_session();
4087 if (!session)
4088 goto ignore;
4089 if (!session->is_capable("osd", MON_CAP_X)) {
4090 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4091 << session->caps << dendl;
4092 goto ignore;
4093 }
4094
// Sender must be up at the same address in the committed map.
4095 if (!osdmap.is_up(from) ||
4096 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
4097 dout(7) << "ignoring pgtemp message from down "
4098 << m->get_orig_source() << " " << m->get_orig_source_addrs()
4099 << dendl;
4100 goto ignore;
4101 }
4102
// Forced requests bypass all the filtering below.
4103 if (m->forced) {
4104 return false;
4105 }
4106
4107 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4108 dout(20) << " " << p->first
4109 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
4110 << " -> " << p->second << dendl;
4111
4112 // does the pool exist?
4113 if (!osdmap.have_pg_pool(p->first.pool())) {
4114 /*
4115 * 1. If the osdmap does not have the pool, it means the pool has been
4116 * removed in-between the osd sending this message and us handling it.
4117 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4118 * not exist in the pending either, as the osds would not send a
4119 * message about a pool they know nothing about (yet).
4120 * 3. However, if the pool does exist in the pending, then it must be a
4121 * new pool, and not relevant to this message (see 1).
4122 */
4123 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4124 << ": pool has been removed" << dendl;
4125 ignore_cnt++;
4126 continue;
4127 }
4128
4129 int acting_primary = -1;
4130 osdmap.pg_to_up_acting_osds(
4131 p->first, nullptr, nullptr, nullptr, &acting_primary);
4132 if (acting_primary != from) {
4133 /* If the source isn't the primary based on the current osdmap, we know
4134 * that the interval changed and that we can discard this message.
4135 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4136 * which of two pg temp mappings on the same pg is more recent.
4137 */
4138 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4139 << ": primary has changed" << dendl;
4140 ignore_cnt++;
4141 continue;
4142 }
4143
4144 // removal?
4145 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
4146 osdmap.primary_temp->count(p->first)))
4147 return false;
4148 // change?
4149 // NOTE: we assume that this will clear pg_primary, so consider
4150 // an existing pg_primary field to imply a change
4151 if (p->second.size() &&
4152 (osdmap.pg_temp->count(p->first) == 0 ||
4153 osdmap.pg_temp->get(p->first) != p->second ||
4154 osdmap.primary_temp->count(p->first)))
4155 return false;
4156 }
4157
4158 // should we ignore all the pgs?
4159 if (ignore_cnt == m->pg_temp.size())
4160 goto ignore;
4161
// Nothing would change; reply with the current map instead of proposing.
4162 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
4163 _reply_map(op, m->map_epoch);
4164 return true;
4165
4166 ignore:
4167 mon.no_reply(op);
4168 return true;
4169 }
4170
4171 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4172 {
4173 epoch_t old_up_thru = osdmap.get_up_thru(from);
4174 auto ut = pending_inc.new_up_thru.find(from);
4175 if (ut != pending_inc.new_up_thru.end()) {
4176 old_up_thru = ut->second;
4177 }
4178 if (up_thru > old_up_thru) {
4179 // set up_thru too, so the osd doesn't have to ask again
4180 pending_inc.new_up_thru[from] = up_thru;
4181 }
4182 }
4183
// Leader-side handler for MOSDPGTemp: stage the requested pg_temp mappings
// in the pending incremental (skipping pools that are gone or pending
// removal), clear any primary_temp for those pgs, and bump up_thru.
4184 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4185 {
4186 op->mark_osdmon_event(__func__);
4187 auto m = op->get_req<MOSDPGTemp>();
4188 int from = m->get_orig_source().num();
4189 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4190 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4191 uint64_t pool = p->first.pool();
4192 if (pending_inc.old_pools.count(pool)) {
4193 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4194 << ": pool pending removal" << dendl;
4195 continue;
4196 }
4197 if (!osdmap.have_pg_pool(pool)) {
4198 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4199 << ": pool has been removed" << dendl;
4200 continue;
4201 }
4202 pending_inc.new_pg_temp[p->first] =
4203 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4204
4205 // unconditionally clear pg_primary (until this message can encode
4206 // a change for that, too.. at which point we need to also fix
4207 // preprocess_pg_temp)
4208 if (osdmap.primary_temp->count(p->first) ||
4209 pending_inc.new_primary_temp.count(p->first))
4210 pending_inc.new_primary_temp[p->first] = -1;
4211 }
4212
4213 // set up_thru too, so the osd doesn't have to ask again
4214 update_up_thru(from, m->map_epoch);
4215
4216 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4217 return true;
4218 }
4219
4220
4221 // ---
4222
// Read-only path for MRemoveSnaps.  Returns false (forward to prepare) if
// any listed snap still needs to be removed; otherwise acks octopus+ peers
// and swallows the message.
4223 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
4224 {
4225 op->mark_osdmon_event(__func__);
4226 auto m = op->get_req<MRemoveSnaps>();
4227 dout(7) << "preprocess_remove_snaps " << *m << dendl;
4228
4229 // check privilege, ignore if failed
4230 MonSession *session = op->get_session();
4231 mon.no_reply(op);
4232 if (!session)
4233 goto ignore;
4234 if (!session->caps.is_capable(
4235 cct,
4236 session->entity_name,
4237 "osd", "osd pool rmsnap", {}, true, true, false,
4238 session->get_peer_socket_addr())) {
4239 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4240 << session->caps << dendl;
4241 goto ignore;
4242 }
4243
// Scan for any snap that is not yet recorded as removed; one is enough to
// require a proposal.
4244 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
4245 q != m->snaps.end();
4246 ++q) {
4247 if (!osdmap.have_pg_pool(q->first)) {
4248 dout(10) << " ignoring removed_snaps " << q->second
4249 << " on non-existent pool " << q->first << dendl;
4250 continue;
4251 }
4252 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
4253 for (vector<snapid_t>::iterator p = q->second.begin();
4254 p != q->second.end();
4255 ++p) {
// A snap above the pool's snap_seq is also treated as needing work.
4256 if (*p > pi->get_snap_seq() ||
4257 !_is_removed_snap(q->first, *p)) {
4258 return false;
4259 }
4260 }
4261 }
4262
// Everything is already removed; octopus+ senders expect an explicit ack.
4263 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4264 auto reply = make_message<MRemoveSnaps>();
4265 reply->snaps = m->snaps;
4266 mon.send_reply(op, reply.detach());
4267 }
4268
4269 ignore:
4270 return true;
4271 }
4272
// Leader-side handler for MRemoveSnaps: queue each not-yet-removed snap in
// the pending incremental (and, pre-octopus, in the pool's removed_snaps
// interval set), bumping snap_seq/snap_epoch as needed.
4273 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
4274 {
4275 op->mark_osdmon_event(__func__);
4276 auto m = op->get_req<MRemoveSnaps>();
4277 dout(7) << "prepare_remove_snaps " << *m << dendl;
4278
4279 for (auto& [pool, snaps] : m->snaps) {
4280 if (!osdmap.have_pg_pool(pool)) {
4281 dout(10) << " ignoring removed_snaps " << snaps
4282 << " on non-existent pool " << pool << dendl;
4283 continue;
4284 }
4285
4286 pg_pool_t& pi = osdmap.pools[pool];
4287 for (auto s : snaps) {
// Skip snaps already removed in the committed map, in a pending pool
// update, or already queued in new_removed_snaps for this proposal.
4288 if (!_is_removed_snap(pool, s) &&
4289 (!pending_inc.new_pools.count(pool) ||
4290 !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
4291 (!pending_inc.new_removed_snaps.count(pool) ||
4292 !pending_inc.new_removed_snaps[pool].contains(s))) {
4293 pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
// Pre-octopus maps track removed snaps directly in the pool object.
4294 if (osdmap.require_osd_release < ceph_release_t::octopus) {
4295 newpi->removed_snaps.insert(s);
4296 dout(10) << " pool " << pool << " removed_snaps added " << s
4297 << " (now " << newpi->removed_snaps << ")" << dendl;
4298 }
4299 newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
// Keep snap_seq at least as high as any removed snap id.
4300 if (s > newpi->get_snap_seq()) {
4301 dout(10) << " pool " << pool << " snap_seq "
4302 << newpi->get_snap_seq() << " -> " << s << dendl;
4303 newpi->set_snap_seq(s);
4304 }
4305 newpi->set_snap_epoch(pending_inc.epoch);
4306 dout(10) << " added pool " << pool << " snap " << s
4307 << " to removed_snaps queue" << dendl;
4308 pending_inc.new_removed_snaps[pool].insert(s);
4309 }
4310 }
4311 }
4312
// Octopus+ senders get an explicit ack once the proposal commits.
4313 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4314 auto reply = make_message<MRemoveSnaps>();
4315 reply->snaps = m->snaps;
4316 wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
4317 }
4318
4319 return true;
4320 }
4321
// Answer MMonGetPurgedSnaps: walk the OSD_SNAP kv prefix collecting
// per-epoch purged-snap interval sets in (m->start, m->last], capped at a
// rough message-size limit, and reply with the collected range.
4322 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4323 {
4324 op->mark_osdmon_event(__func__);
4325 auto m = op->get_req<MMonGetPurgedSnaps>();
4326 dout(7) << __func__ << " " << *m << dendl;
4327
4328 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4329
4330 string k = make_purged_snap_epoch_key(m->start);
4331 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
4332 it->upper_bound(k);
// 'epoch' doubles as the loop cursor and the reply's end-of-range marker;
// it stays m->last if no entries are found.
4333 unsigned long epoch = m->last;
4334 while (it->valid()) {
// Stop at the first key outside the purged_epoch_ namespace.
4335 if (it->key().find("purged_epoch_") != 0) {
4336 break;
4337 }
4338 string k = it->key();
4339 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4340 if (n != 1) {
4341 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4342 } else if (epoch > m->last) {
4343 break;
4344 } else {
4345 bufferlist bl = it->value();
4346 auto p = bl.cbegin();
4347 auto &v = r[epoch];
4348 try {
4349 ceph::decode(v, p);
4350 } catch (ceph::buffer::error& e) {
4351 derr << __func__ << " unable to parse value for key '" << it->key()
4352 << "': \n";
4353 bl.hexdump(*_dout);
4354 *_dout << dendl;
4355 }
// Rough size estimate for this entry (reusing n as the accumulator).
4356 n += 4 + v.size() * 16;
4357 }
4358 if (n > 1048576) {
4359 // impose a semi-arbitrary limit to message size
4360 break;
4361 }
4362 it->next();
4363 }
4364
4365 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4366 reply->purged_snaps.swap(r);
4367 mon.send_reply(op, reply.detach());
4368
4369 return true;
4370 }
4371
4372 // osd beacon
// Read-only path for MOSDBeacon: only verifies session and caps; all valid
// beacons are forwarded to the leader (see comment below).  Never replied to.
4373 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4374 {
4375 op->mark_osdmon_event(__func__);
4376 // check caps
4377 auto session = op->get_session();
4378 mon.no_reply(op);
4379 if (!session) {
4380 dout(10) << __func__ << " no monitor session!" << dendl;
4381 return true;
4382 }
4383 if (!session->is_capable("osd", MON_CAP_X)) {
4384 derr << __func__ << " received from entity "
4385 << "with insufficient privileges " << session->caps << dendl;
4386 return true;
4387 }
4388 // Always forward the beacon to the leader, even if they are the same as
4389 // the old one. The leader will mark as down osds that haven't sent
4390 // beacon for a few minutes.
4391 return false;
4392 }
4393
// Leader-side handler for MOSDBeacon: record liveness (last_osd_report),
// the osd's map epoch, and per-pg last-epoch-clean data.  Returns true
// (propose) only when the beacon advances last_purged_snaps_scrub in xinfo;
// plain liveness updates need no map change.
4394 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
4395 {
4396 op->mark_osdmon_event(__func__);
4397 const auto beacon = op->get_req<MOSDBeacon>();
4398 const auto src = beacon->get_orig_source();
4399 dout(10) << __func__ << " " << *beacon
4400 << " from " << src << dendl;
4401 int from = src.num();
4402
4403 if (!src.is_osd() ||
4404 !osdmap.is_up(from) ||
4405 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
4406 if (src.is_osd() && !osdmap.is_up(from)) {
4407 // share some new maps with this guy in case it may not be
4408 // aware of its own deadness...
4409 send_latest(op, beacon->version+1);
4410 }
4411 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
4412 return false;
4413 }
4414
// Liveness bookkeeping (not part of the osdmap itself).
4415 last_osd_report[from].first = ceph_clock_now();
4416 last_osd_report[from].second = beacon->osd_beacon_report_interval;
4417 osd_epochs[from] = beacon->version;
4418
4419 for (const auto& pg : beacon->pgs) {
4420 if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
4421 unsigned pg_num = pool->get_pg_num();
4422 last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
4423 }
4424 }
4425
// Only a forward progress of last_purged_snaps_scrub requires staging an
// xinfo change and thus a proposal.
4426 if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
4427 beacon->last_purged_snaps_scrub) {
4428 if (pending_inc.new_xinfo.count(from) == 0) {
4429 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
4430 }
4431 pending_inc.new_xinfo[from].last_purged_snaps_scrub =
4432 beacon->last_purged_snaps_scrub;
4433 return true;
4434 } else {
4435 return false;
4436 }
4437 }
4438
4439 // ---------------
4440 // map helpers
4441
4442 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4443 {
4444 op->mark_osdmon_event(__func__);
4445 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4446 << " start " << start << dendl;
4447 if (start == 0)
4448 send_full(op);
4449 else
4450 send_incremental(op, start);
4451 }
4452
4453
4454 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4455 {
4456 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4457 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4458 r->oldest_map = get_first_committed();
4459 r->newest_map = osdmap.get_epoch();
4460 return r;
4461 }
4462
// Build an MOSDMap carrying incremental maps for epochs [from, to] (walked
// newest-first).  Falls back to a full map for any epoch whose incremental
// is missing; aborts if neither form exists.  Caller owns the result.
4463 MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
4464 {
4465 dout(10) << "build_incremental [" << from << ".." << to << "] with features "
4466 << std::hex << features << std::dec << dendl;
4467 MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
4468 m->oldest_map = get_first_committed();
4469 m->newest_map = osdmap.get_epoch();
4470
4471 for (epoch_t e = to; e >= from && e > 0; e--) {
4472 bufferlist bl;
4473 int err = get_version(e, features, bl);
4474 if (err == 0) {
4475 ceph_assert(bl.length());
4476 // if (get_version(e, bl) > 0) {
4477 dout(20) << "build_incremental inc " << e << " "
4478 << bl.length() << " bytes" << dendl;
4479 m->incremental_maps[e] = bl;
4480 } else {
// Incremental not stored for this epoch: send the full map instead.
4481 ceph_assert(err == -ENOENT);
4482 ceph_assert(!bl.length());
4483 get_version_full(e, features, bl);
4484 if (bl.length() > 0) {
4485 //else if (get_version("full", e, bl) > 0) {
4486 dout(20) << "build_incremental full " << e << " "
4487 << bl.length() << " bytes" << dendl;
4488 m->maps[e] = bl;
4489 } else {
4490 ceph_abort(); // we should have all maps.
4491 }
4492 }
4493 }
4494 return m;
4495 }
4496
4497 void OSDMonitor::send_full(MonOpRequestRef op)
4498 {
4499 op->mark_osdmon_event(__func__);
4500 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4501 mon.send_reply(op, build_latest_full(op->get_session()->con_features));
4502 }
4503
// Send incremental maps starting at 'first' in reply to 'op'.  If the
// session is proxied through another monitor, route a hint back so that
// mon does the sending; otherwise send directly.
4504 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4505 {
4506 op->mark_osdmon_event(__func__);
4507
4508 MonSession *s = op->get_session();
4509 ceph_assert(s);
4510
4511 if (s->proxy_con) {
4512 // oh, we can tell the other mon to do it
4513 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4514 << first << dendl;
4515 MRoute *r = new MRoute(s->proxy_tid, NULL);
4516 r->send_osdmap_first = first;
4517 s->proxy_con->send_message(r);
4518 op->mark_event("reply: send routed send_osdmap_first reply");
4519 } else {
4520 // do it ourselves
4521 send_incremental(first, s, false, op);
4522 }
4523 }
4524
// Core map-sending routine.  Sends maps [first .. current] to 'session',
// starting with a full map if 'first' predates the oldest committed epoch.
// If 'req' is set, exactly one message is sent as a routed reply and we
// return early; otherwise messages go straight to the session's connection,
// stopping after one batch when 'onetime' is set.
4525 void OSDMonitor::send_incremental(epoch_t first,
4526 MonSession *session,
4527 bool onetime,
4528 MonOpRequestRef req)
4529 {
4530 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
4531 << " to " << session->name << dendl;
4532
4533 // get feature of the peer
4534 // use quorum_con_features, if it's an anonymous connection.
4535 uint64_t features = session->con_features ? session->con_features :
4536 mon.get_quorum_con_features();
4537
// Skip epochs the session is already known to have.
4538 if (first <= session->osd_epoch) {
4539 dout(10) << __func__ << " " << session->name << " should already have epoch "
4540 << session->osd_epoch << dendl;
4541 first = session->osd_epoch + 1;
4542 }
4543
// Requested range starts before our oldest committed map: seed the peer
// with a full map at the oldest epoch, then continue incrementally.
4544 if (first < get_first_committed()) {
4545 MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
4546 m->oldest_map = get_first_committed();
4547 m->newest_map = osdmap.get_epoch();
4548
4549 first = get_first_committed();
4550 bufferlist bl;
4551 int err = get_version_full(first, features, bl);
4552 ceph_assert(err == 0);
4553 ceph_assert(bl.length());
4554 dout(20) << "send_incremental starting with base full "
4555 << first << " " << bl.length() << " bytes" << dendl;
4556 m->maps[first] = bl;
4557
4558 if (req) {
// Routed-reply mode only ever sends one message.
4559 mon.send_reply(req, m);
4560 session->osd_epoch = first;
4561 return;
4562 } else {
4563 session->con->send_message(m);
4564 session->osd_epoch = first;
4565 }
4566 first++;
4567 }
4568
// Send incrementals in batches of osd_map_message_max epochs.
4569 while (first <= osdmap.get_epoch()) {
4570 epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
4571 osdmap.get_epoch());
4572 MOSDMap *m = build_incremental(first, last, features);
4573
4574 if (req) {
4575 // send some maps. it may not be all of them, but it will get them
4576 // started.
4577 mon.send_reply(req, m);
4578 } else {
4579 session->con->send_message(m);
4580 first = last + 1;
4581 }
4582 session->osd_epoch = last;
// Stop after one batch for onetime subscriptions and routed replies.
4583 if (onetime || req)
4584 break;
4585 }
4586 }
4587
// Fetch the incremental map for 'ver', encoded for the current quorum's
// connection features.
4588 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4589 {
4590 return get_version(ver, mon.get_quorum_con_features(), bl);
4591 }
4592
// Re-encode an incremental map blob in place for a (possibly smaller)
// feature set.  Any embedded full map or crush map is decoded and
// re-encoded with the same reduced features.
4593 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4594 {
4595 OSDMap::Incremental inc;
4596 auto q = bl.cbegin();
4597 inc.decode(q);
4598 // always encode with subset of osdmap's canonical features
4599 uint64_t f = features & inc.encode_features;
4600 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4601 << dendl;
4602 bl.clear();
4603 if (inc.fullmap.length()) {
4604 // embedded full map?
4605 OSDMap m;
4606 m.decode(inc.fullmap);
4607 inc.fullmap.clear();
4608 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4609 }
4610 if (inc.crush.length()) {
4611 // embedded crush map
4612 CrushWrapper c;
4613 auto p = inc.crush.cbegin();
4614 c.decode(p);
4615 inc.crush.clear();
4616 c.encode(inc.crush, f);
4617 }
4618 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4619 }
4620
4621 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4622 {
4623 OSDMap m;
4624 auto q = bl.cbegin();
4625 m.decode(q);
4626 // always encode with subset of osdmap's canonical features
4627 uint64_t f = features & m.get_encoding_features();
4628 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4629 << dendl;
4630 bl.clear();
4631 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4632 }
4633
// Fetch the incremental map for 'ver' encoded for 'features', consulting
// and populating a cache keyed by (epoch, significant feature bits).
4634 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4635 {
4636 uint64_t significant_features = OSDMap::get_significant_features(features);
4637 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4638 return 0;
4639 }
4640 int ret = PaxosService::get_version(ver, bl);
4641 if (ret < 0) {
4642 return ret;
4643 }
4644 // NOTE: this check is imprecise; the OSDMap encoding features may
4645 // be a subset of the latest mon quorum features, but worst case we
4646 // reencode once and then cache the (identical) result under both
4647 // feature masks.
4648 if (significant_features !=
4649 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4650 reencode_incremental_map(bl, features);
4651 }
4652 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4653 return 0;
4654 }
4655
4656 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4657 {
4658 bufferlist inc_bl;
4659 int err = get_version(ver, inc_bl);
4660 ceph_assert(err == 0);
4661 ceph_assert(inc_bl.length());
4662
4663 auto p = inc_bl.cbegin();
4664 inc.decode(p);
4665 dout(10) << __func__ << " "
4666 << " epoch " << inc.epoch
4667 << " inc_crc " << inc.inc_crc
4668 << " full_crc " << inc.full_crc
4669 << " encode_features " << inc.encode_features << dendl;
4670 return 0;
4671 }
4672
// Reconstruct the full osdmap for epoch 'ver' when the stored full map has
// been trimmed: start from the closest pinned (or cached) full map at or
// below 'ver' and replay incrementals up to 'ver', optionally cross-checking
// CRCs when mon_debug_extra_checks is set.  Encodes the result into 'bl'.
4673 int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
4674 {
4675 dout(10) << __func__ << " ver " << ver << dendl;
4676
4677 version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
4678 if (closest_pinned == 0) {
4679 return -ENOENT;
4680 }
4681 if (closest_pinned > ver) {
4682 dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
4683 }
4684 ceph_assert(closest_pinned <= ver);
4685
4686 dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
4687
4688 // get osdmap incremental maps and apply on top of this one.
4689 bufferlist osdm_bl;
4690 bool has_cached_osdmap = false;
// Prefer a cached full map between the pin and 'ver' to shorten the replay.
4691 for (version_t v = ver-1; v >= closest_pinned; --v) {
4692 if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
4693 &osdm_bl)) {
4694 dout(10) << __func__ << " found map in cache ver " << v << dendl;
4695 closest_pinned = v;
4696 has_cached_osdmap = true;
4697 break;
4698 }
4699 }
4700
4701 if (!has_cached_osdmap) {
4702 int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
4703 if (err != 0) {
4704 derr << __func__ << " closest pinned map ver " << closest_pinned
4705 << " not available! error: " << cpp_strerror(err) << dendl;
4706 }
4707 ceph_assert(err == 0);
4708 }
4709
4710 ceph_assert(osdm_bl.length());
4711
4712 OSDMap osdm;
4713 osdm.decode(osdm_bl);
4714
4715 dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
4716 << " e" << osdm.epoch
4717 << " crc " << osdm.get_crc()
4718 << " -- applying incremental maps." << dendl;
4719
// Remember the last incremental's encode_features so the final encode
// matches how the maps were originally written.
4720 uint64_t encode_features = 0;
4721 for (version_t v = closest_pinned + 1; v <= ver; ++v) {
4722 dout(20) << __func__ << " applying inc epoch " << v << dendl;
4723
4724 OSDMap::Incremental inc;
4725 int err = get_inc(v, inc);
4726 ceph_assert(err == 0);
4727
4728 encode_features = inc.encode_features;
4729
4730 err = osdm.apply_incremental(inc);
4731 ceph_assert(err == 0);
4732
4733 // this block performs paranoid checks on map retrieval
4734 if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
4735 inc.full_crc != 0) {
4736
4737 uint64_t f = encode_features;
4738 if (!f) {
4739 f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
4740 }
4741
4742 // encode osdmap to force calculating crcs
4743 bufferlist tbl;
4744 osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
4745 // decode osdmap to compare crcs with what's expected by incremental
4746 OSDMap tosdm;
4747 tosdm.decode(tbl);
4748
4749 if (tosdm.get_crc() != inc.full_crc) {
4750 derr << __func__
4751 << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
4752 << ", expected " << inc.full_crc << ")" << dendl;
4753 ceph_abort_msg("osdmap crc mismatch");
4754 }
4755 }
4756
4757 // note: we cannot add the recently computed map to the cache, as is,
4758 // because we have not encoded the map into a bl.
4759 }
4760
4761 if (!encode_features) {
4762 dout(10) << __func__
4763 << " last incremental map didn't have features;"
4764 << " defaulting to quorum's or all" << dendl;
4765 encode_features =
4766 (mon.quorum_con_features ? mon.quorum_con_features : -1);
4767 }
4768 osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
4769
4770 return 0;
4771 }
4772
// Fetch the full map for 'ver', encoded for the current quorum's
// connection features.
4773 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4774 {
4775 return get_version_full(ver, mon.get_quorum_con_features(), bl);
4776 }
4777
// Fetch the full map for 'ver' encoded for 'features', consulting and
// populating the full-map cache; if the stored full map has been trimmed,
// rebuild it from a pinned map plus incrementals.
4778 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4779 bufferlist& bl)
4780 {
4781 uint64_t significant_features = OSDMap::get_significant_features(features);
4782 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4783 return 0;
4784 }
4785 int ret = PaxosService::get_version_full(ver, bl);
4786 if (ret == -ENOENT) {
4787 // build map?
4788 ret = get_full_from_pinned_map(ver, bl);
4789 }
4790 if (ret < 0) {
4791 return ret;
4792 }
4793 // NOTE: this check is imprecise; the OSDMap encoding features may
4794 // be a subset of the latest mon quorum features, but worst case we
4795 // reencode once and then cache the (identical) result under both
4796 // feature masks.
4797 if (significant_features !=
4798 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4799 reencode_full_map(bl, features);
4800 }
4801 full_osd_cache.add_bytes({ver, significant_features}, bl);
4802 return 0;
4803 }
4804
4805 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4806 {
4807 dout(10) << "blocklist " << av << " until " << until << dendl;
4808 for (auto a : av.v) {
4809 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4810 a.set_type(entity_addr_t::TYPE_ANY);
4811 } else {
4812 a.set_type(entity_addr_t::TYPE_LEGACY);
4813 }
4814 pending_inc.new_blocklist[a] = until;
4815 }
4816 return pending_inc.epoch;
4817 }
4818
4819 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4820 {
4821 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4822 a.set_type(entity_addr_t::TYPE_ANY);
4823 } else {
4824 a.set_type(entity_addr_t::TYPE_LEGACY);
4825 }
4826 dout(10) << "blocklist " << a << " until " << until << dendl;
4827 pending_inc.new_blocklist[a] = until;
4828 return pending_inc.epoch;
4829 }
4830
4831
// Walk all "osdmap" subscriptions and satisfy each one that is behind.
4832 void OSDMonitor::check_osdmap_subs()
4833 {
4834 dout(10) << __func__ << dendl;
4835 if (!osdmap.get_epoch()) {
4836 return;
4837 }
4838 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4839 if (osdmap_subs == mon.session_map.subs.end()) {
4840 return;
4841 }
4842 auto p = osdmap_subs->second->begin();
4843 while (!p.end()) {
// Advance the iterator before the call: check_osdmap_sub() may remove a
// onetime sub from the session map, invalidating its position.
4844 auto sub = *p;
4845 ++p;
4846 check_osdmap_sub(sub);
4847 }
4848 }
4849
// Satisfy one "osdmap" subscription: send incrementals from sub->next (or
// the latest full map when next == 0), then either drop a onetime sub or
// advance its cursor past the current epoch.
4850 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4851 {
4852 dout(10) << __func__ << " " << sub << " next " << sub->next
4853 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4854 if (sub->next <= osdmap.get_epoch()) {
4855 if (sub->next >= 1)
4856 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4857 else
4858 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4859 if (sub->onetime)
4860 mon.session_map.remove_sub(sub);
4861 else
4862 sub->next = osdmap.get_epoch() + 1;
4863 }
4864 }
4865
// Walk all "osd_pg_creates" subscriptions and push pending pg-create
// messages to each subscribed (up) osd.  No-op until at least one osd is up.
4866 void OSDMonitor::check_pg_creates_subs()
4867 {
4868 if (!osdmap.get_num_up_osds()) {
4869 return;
4870 }
4871 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4872 mon.with_session_map([this](const MonSessionMap& session_map) {
4873 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4874 if (pg_creates_subs == session_map.subs.end()) {
4875 return;
4876 }
4877 for (auto sub : *pg_creates_subs->second) {
4878 check_pg_creates_sub(sub);
4879 }
4880 });
4881 }
4882
// Satisfy one "osd_pg_creates" subscription by sending pg creates to the
// subscribed osd, advancing the sub's cursor to what was sent.
4883 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4884 {
4885 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4886 ceph_assert(sub->type == "osd_pg_creates");
4887 // only send these if the OSD is up. we will check_subs() when they do
4888 // come up so they will get the creates then.
4889 if (sub->session->name.is_osd() &&
4890 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4891 sub->next = send_pg_creates(sub->session->name.num(),
4892 sub->session->con.get(),
4893 sub->next);
4894 }
4895 }
4896
4897 void OSDMonitor::do_application_enable(int64_t pool_id,
4898 const std::string &app_name,
4899 const std::string &app_key,
4900 const std::string &app_value,
4901 bool force)
4902 {
4903 ceph_assert(paxos.is_plugged() && is_writeable());
4904
4905 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4906 << dendl;
4907
4908 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
4909
4910 auto pp = osdmap.get_pg_pool(pool_id);
4911 ceph_assert(pp != nullptr);
4912
4913 pg_pool_t p = *pp;
4914 if (pending_inc.new_pools.count(pool_id)) {
4915 p = pending_inc.new_pools[pool_id];
4916 }
4917
4918 if (app_key.empty()) {
4919 p.application_metadata.insert({app_name, {}});
4920 } else {
4921 if (force) {
4922 p.application_metadata[app_name][app_key] = app_value;
4923 } else {
4924 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4925 }
4926 }
4927 p.last_change = pending_inc.epoch;
4928 pending_inc.new_pools[pool_id] = p;
4929 }
4930
4931 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4932 pool_opts_t::key_t opt,
4933 pool_opts_t::value_t val)
4934 {
4935 dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
4936 << " val: " << val << dendl;
4937 auto p = pending_inc.new_pools.try_emplace(
4938 pool_id, *osdmap.get_pg_pool(pool_id));
4939 p.first->second.opts.set(opt, val);
4940 }
4941
4942 unsigned OSDMonitor::scan_for_creating_pgs(
4943 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4944 const mempool::osdmap::set<int64_t>& removed_pools,
4945 utime_t modified,
4946 creating_pgs_t* creating_pgs) const
4947 {
4948 unsigned queued = 0;
4949 for (auto& p : pools) {
4950 int64_t poolid = p.first;
4951 if (creating_pgs->created_pools.count(poolid)) {
4952 dout(10) << __func__ << " already created " << poolid << dendl;
4953 continue;
4954 }
4955 const pg_pool_t& pool = p.second;
4956 int ruleno = pool.get_crush_rule();
4957 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4958 continue;
4959
4960 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4961 const auto created = pool.get_last_change();
4962 if (last_scan_epoch && created <= last_scan_epoch) {
4963 dout(10) << __func__ << " no change in pool " << poolid
4964 << " " << pool << dendl;
4965 continue;
4966 }
4967 if (removed_pools.count(poolid)) {
4968 dout(10) << __func__ << " pool is being removed: " << poolid
4969 << " " << pool << dendl;
4970 continue;
4971 }
4972 dout(10) << __func__ << " queueing pool create for " << poolid
4973 << " " << pool << dendl;
4974 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4975 created, modified);
4976 queued++;
4977 }
4978 return queued;
4979 }
4980
// Rebuild creating_pgs_by_osd_epoch (acting primary -> epoch -> pgs to
// create) from the current creating_pgs set and the latest osdmap
// mapping job.  Tracks, per pg, the epoch the create message should be
// tagged with: the original create epoch, or the mapping epoch when the
// pg is new here or its acting primary has changed.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default: keep the epoch the pg was originally queued with
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch it was recorded under
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  // atomically swap in the freshly built index and remember which
  // mapping epoch it corresponds to
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5028
// Send pg-create messages for the given OSD covering epochs >= next.
// Uses the legacy MOSDPGCreate for pre-nautilus clusters and
// MOSDPGCreate2 otherwise.  Returns the epoch the subscriber is now
// current through (last batched epoch + 1), or `next` unchanged when
// there was nothing to send or the index is stale.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // pre-nautilus OSDs only understand the old message format
  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  // walk the epochs the subscriber has not seen yet, batching all the
  // creates into a single message
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// lazily allocate the message on first use
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.create_epoch, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				     create->second.create_stamp));
	// pass along pg history/past_intervals when we have them (e.g.
	// creates that resulted from a pg split)
	if (create->second.history.epoch_created) {
	  dout(20) << __func__ << " " << pg << " " << create->second.history
		   << " " << create->second.past_intervals << dendl;
	  m->pg_extra.emplace(pg, make_pair(create->second.history,
					    create->second.past_intervals));
	}
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  // at most one of the two messages was built; ownership passes to con
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5100
5101 // TICK
5102
5103
// Periodic maintenance.  On every monitor: refresh the osdmap manifest
// and rebalance the priority cache.  On the leader only: time out silent
// OSDs, mark failed OSDs down, auto-mark long-down OSDs out, expire
// blocklist entries, prune purged snaps, refresh pool status — and
// propose a new map epoch if any of that produced pending changes.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which only the leader may do
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance before the body possibly erases the current entry
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// normal path: out after the (possibly laggy-adjusted) grace
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  // grace not yet expired: keep the timer running
	  continue;
      }

      // osd no longer down+in (or was just marked out): drop its timer
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }
  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring range_blocklist item " << p->first
	       << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_range_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5278
5279 void OSDMonitor::_set_new_cache_sizes()
5280 {
5281 uint64_t cache_size = 0;
5282 int64_t inc_alloc = 0;
5283 int64_t full_alloc = 0;
5284 int64_t kv_alloc = 0;
5285
5286 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5287 cache_size = pcm->get_tuned_mem();
5288 inc_alloc = inc_cache->get_committed_size();
5289 full_alloc = full_cache->get_committed_size();
5290 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5291 }
5292
5293 inc_osd_cache.set_bytes(inc_alloc);
5294 full_osd_cache.set_bytes(full_alloc);
5295
5296 dout(1) << __func__ << " cache_size:" << cache_size
5297 << " inc_alloc: " << inc_alloc
5298 << " full_alloc: " << full_alloc
5299 << " kv_alloc: " << kv_alloc
5300 << dendl;
5301 }
5302
// Mark down any up OSD that has not sent a beacon within its timeout.
// last_osd_report maps osd id -> (time of last beacon, the osd's beacon
// interval).  Returns true if any OSD was newly marked down (caller
// should propose).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    // only up OSDs are expected to beacon
    if (!osdmap.is_up(i))
      continue;
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
	mon.clog->info() << "osd." << i << " marked down after no beacon for "
			 << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second.first
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): new_state appears to be applied as an xor mask, so
	// setting CEPH_OSD_UP here flips the up bit off (marks the osd
	// down) — confirm against OSDMap::Incremental semantics.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5346
5347 static void dump_cpu_list(Formatter *f, const char *name,
5348 const string& strlist)
5349 {
5350 cpu_set_t cpu_set;
5351 size_t cpu_set_size;
5352 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5353 return;
5354 }
5355 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5356 f->open_array_section(name);
5357 for (auto cpu : cpus) {
5358 f->dump_int("cpu", cpu);
5359 }
5360 f->close_section();
5361 }
5362
// Dump the monitor's osdmap-related state for `mon status`-style output:
// the full osdmap, per-osd metadata, clean-epoch bookkeeping, committed
// version range, the crush map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // one entry per existing osd; metadata errors are ignored here (NULL err)
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // the manifest only exists once pruning has produced one
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5411
namespace {
  // All the per-pool attributes that "osd pool get" can report.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // Return the members of `first` that do not appear in `second`
  // (plain set difference over the ordered choice sets).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(choice);
      }
    }
    return result;
  }
}
5446
5447
5448 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5449 {
5450 op->mark_osdmon_event(__func__);
5451 auto m = op->get_req<MMonCommand>();
5452 int r = 0;
5453 bufferlist rdata;
5454 stringstream ss, ds;
5455
5456 cmdmap_t cmdmap;
5457 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5458 string rs = ss.str();
5459 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5460 return true;
5461 }
5462
5463 MonSession *session = op->get_session();
5464 if (!session) {
5465 derr << __func__ << " no session" << dendl;
5466 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5467 return true;
5468 }
5469
5470 string prefix;
5471 cmd_getval(cmdmap, "prefix", prefix);
5472
5473 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5474 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5475
5476 if (prefix == "osd stat") {
5477 if (f) {
5478 f->open_object_section("osdmap");
5479 osdmap.print_summary(f.get(), ds, "", true);
5480 f->close_section();
5481 f->flush(rdata);
5482 } else {
5483 osdmap.print_summary(nullptr, ds, "", true);
5484 rdata.append(ds);
5485 }
5486 }
5487 else if (prefix == "osd dump" ||
5488 prefix == "osd tree" ||
5489 prefix == "osd tree-from" ||
5490 prefix == "osd ls" ||
5491 prefix == "osd getmap" ||
5492 prefix == "osd getcrushmap" ||
5493 prefix == "osd ls-tree" ||
5494 prefix == "osd info") {
5495
5496 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5497 bufferlist osdmap_bl;
5498 int err = get_version_full(epoch, osdmap_bl);
5499 if (err == -ENOENT) {
5500 r = -ENOENT;
5501 ss << "there is no map for epoch " << epoch;
5502 goto reply;
5503 }
5504 ceph_assert(err == 0);
5505 ceph_assert(osdmap_bl.length());
5506
5507 OSDMap *p;
5508 if (epoch == osdmap.get_epoch()) {
5509 p = &osdmap;
5510 } else {
5511 p = new OSDMap;
5512 p->decode(osdmap_bl);
5513 }
5514
5515 auto sg = make_scope_guard([&] {
5516 if (p != &osdmap) {
5517 delete p;
5518 }
5519 });
5520
5521 if (prefix == "osd dump") {
5522 stringstream ds;
5523 if (f) {
5524 f->open_object_section("osdmap");
5525 p->dump(f.get());
5526 f->close_section();
5527 f->flush(ds);
5528 } else {
5529 p->print(ds);
5530 }
5531 rdata.append(ds);
5532 if (!f)
5533 ds << " ";
5534 } else if (prefix == "osd ls") {
5535 if (f) {
5536 f->open_array_section("osds");
5537 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5538 if (osdmap.exists(i)) {
5539 f->dump_int("osd", i);
5540 }
5541 }
5542 f->close_section();
5543 f->flush(ds);
5544 } else {
5545 bool first = true;
5546 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5547 if (osdmap.exists(i)) {
5548 if (!first)
5549 ds << "\n";
5550 first = false;
5551 ds << i;
5552 }
5553 }
5554 }
5555 rdata.append(ds);
5556 } else if (prefix == "osd info") {
5557 int64_t osd_id;
5558 bool do_single_osd = true;
5559 if (!cmd_getval(cmdmap, "id", osd_id)) {
5560 do_single_osd = false;
5561 }
5562
5563 if (do_single_osd && !osdmap.exists(osd_id)) {
5564 ss << "osd." << osd_id << " does not exist";
5565 r = -EINVAL;
5566 goto reply;
5567 }
5568
5569 if (f) {
5570 if (do_single_osd) {
5571 osdmap.dump_osd(osd_id, f.get());
5572 } else {
5573 osdmap.dump_osds(f.get());
5574 }
5575 f->flush(ds);
5576 } else {
5577 if (do_single_osd) {
5578 osdmap.print_osd(osd_id, ds);
5579 } else {
5580 osdmap.print_osds(ds);
5581 }
5582 }
5583 rdata.append(ds);
5584 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5585 string bucket;
5586 if (prefix == "osd tree-from") {
5587 cmd_getval(cmdmap, "bucket", bucket);
5588 if (!osdmap.crush->name_exists(bucket)) {
5589 ss << "bucket '" << bucket << "' does not exist";
5590 r = -ENOENT;
5591 goto reply;
5592 }
5593 int id = osdmap.crush->get_item_id(bucket);
5594 if (id >= 0) {
5595 ss << "\"" << bucket << "\" is not a bucket";
5596 r = -EINVAL;
5597 goto reply;
5598 }
5599 }
5600
5601 vector<string> states;
5602 cmd_getval(cmdmap, "states", states);
5603 unsigned filter = 0;
5604 for (auto& s : states) {
5605 if (s == "up") {
5606 filter |= OSDMap::DUMP_UP;
5607 } else if (s == "down") {
5608 filter |= OSDMap::DUMP_DOWN;
5609 } else if (s == "in") {
5610 filter |= OSDMap::DUMP_IN;
5611 } else if (s == "out") {
5612 filter |= OSDMap::DUMP_OUT;
5613 } else if (s == "destroyed") {
5614 filter |= OSDMap::DUMP_DESTROYED;
5615 } else {
5616 ss << "unrecognized state '" << s << "'";
5617 r = -EINVAL;
5618 goto reply;
5619 }
5620 }
5621 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5622 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5623 ss << "cannot specify both 'in' and 'out'";
5624 r = -EINVAL;
5625 goto reply;
5626 }
5627 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5628 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5629 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5630 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5631 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5632 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5633 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5634 r = -EINVAL;
5635 goto reply;
5636 }
5637 if (f) {
5638 f->open_object_section("tree");
5639 p->print_tree(f.get(), NULL, filter, bucket);
5640 f->close_section();
5641 f->flush(ds);
5642 } else {
5643 p->print_tree(NULL, &ds, filter, bucket);
5644 }
5645 rdata.append(ds);
5646 } else if (prefix == "osd getmap") {
5647 rdata.append(osdmap_bl);
5648 ss << "got osdmap epoch " << p->get_epoch();
5649 } else if (prefix == "osd getcrushmap") {
5650 p->crush->encode(rdata, mon.get_quorum_con_features());
5651 ss << p->get_crush_version();
5652 } else if (prefix == "osd ls-tree") {
5653 string bucket_name;
5654 cmd_getval(cmdmap, "name", bucket_name);
5655 set<int> osds;
5656 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5657 if (r == -ENOENT) {
5658 ss << "\"" << bucket_name << "\" does not exist";
5659 goto reply;
5660 } else if (r < 0) {
5661 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5662 goto reply;
5663 }
5664
5665 if (f) {
5666 f->open_array_section("osds");
5667 for (auto &i : osds) {
5668 if (osdmap.exists(i)) {
5669 f->dump_int("osd", i);
5670 }
5671 }
5672 f->close_section();
5673 f->flush(ds);
5674 } else {
5675 bool first = true;
5676 for (auto &i : osds) {
5677 if (osdmap.exists(i)) {
5678 if (!first)
5679 ds << "\n";
5680 first = false;
5681 ds << i;
5682 }
5683 }
5684 }
5685
5686 rdata.append(ds);
5687 }
5688 } else if (prefix == "osd getmaxosd") {
5689 if (f) {
5690 f->open_object_section("getmaxosd");
5691 f->dump_unsigned("epoch", osdmap.get_epoch());
5692 f->dump_int("max_osd", osdmap.get_max_osd());
5693 f->close_section();
5694 f->flush(rdata);
5695 } else {
5696 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5697 rdata.append(ds);
5698 }
5699 } else if (prefix == "osd utilization") {
5700 string out;
5701 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5702 if (f)
5703 f->flush(rdata);
5704 else
5705 rdata.append(out);
5706 r = 0;
5707 goto reply;
5708 } else if (prefix == "osd find") {
5709 int64_t osd;
5710 if (!cmd_getval(cmdmap, "id", osd)) {
5711 ss << "unable to parse osd id value '"
5712 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5713 r = -EINVAL;
5714 goto reply;
5715 }
5716 if (!osdmap.exists(osd)) {
5717 ss << "osd." << osd << " does not exist";
5718 r = -ENOENT;
5719 goto reply;
5720 }
5721 string format;
5722 cmd_getval(cmdmap, "format", format);
5723 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5724 f->open_object_section("osd_location");
5725 f->dump_int("osd", osd);
5726 f->dump_object("addrs", osdmap.get_addrs(osd));
5727 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5728
5729 // try to identify host, pod/container name, etc.
5730 map<string,string> m;
5731 load_metadata(osd, m, nullptr);
5732 if (auto p = m.find("hostname"); p != m.end()) {
5733 f->dump_string("host", p->second);
5734 }
5735 for (auto& k : {
5736 "pod_name", "pod_namespace", // set by rook
5737 "container_name" // set by cephadm, ceph-ansible
5738 }) {
5739 if (auto p = m.find(k); p != m.end()) {
5740 f->dump_string(k, p->second);
5741 }
5742 }
5743
5744 // crush is helpful too
5745 f->open_object_section("crush_location");
5746 map<string,string> loc = osdmap.crush->get_full_location(osd);
5747 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5748 f->dump_string(p->first.c_str(), p->second);
5749 f->close_section();
5750 f->close_section();
5751 f->flush(rdata);
5752 } else if (prefix == "osd metadata") {
5753 int64_t osd = -1;
5754 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5755 !cmd_getval(cmdmap, "id", osd)) {
5756 ss << "unable to parse osd id value '"
5757 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5758 r = -EINVAL;
5759 goto reply;
5760 }
5761 if (osd >= 0 && !osdmap.exists(osd)) {
5762 ss << "osd." << osd << " does not exist";
5763 r = -ENOENT;
5764 goto reply;
5765 }
5766 string format;
5767 cmd_getval(cmdmap, "format", format);
5768 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5769 if (osd >= 0) {
5770 f->open_object_section("osd_metadata");
5771 f->dump_unsigned("id", osd);
5772 r = dump_osd_metadata(osd, f.get(), &ss);
5773 if (r < 0)
5774 goto reply;
5775 f->close_section();
5776 } else {
5777 r = 0;
5778 f->open_array_section("osd_metadata");
5779 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5780 if (osdmap.exists(i)) {
5781 f->open_object_section("osd");
5782 f->dump_unsigned("id", i);
5783 r = dump_osd_metadata(i, f.get(), NULL);
5784 if (r == -EINVAL || r == -ENOENT) {
5785 // Drop error, continue to get other daemons' metadata
5786 dout(4) << "No metadata for osd." << i << dendl;
5787 r = 0;
5788 } else if (r < 0) {
5789 // Unexpected error
5790 goto reply;
5791 }
5792 f->close_section();
5793 }
5794 }
5795 f->close_section();
5796 }
5797 f->flush(rdata);
5798 } else if (prefix == "osd versions") {
5799 if (!f)
5800 f.reset(Formatter::create("json-pretty"));
5801 count_metadata("ceph_version", f.get());
5802 f->flush(rdata);
5803 r = 0;
5804 } else if (prefix == "osd count-metadata") {
5805 if (!f)
5806 f.reset(Formatter::create("json-pretty"));
5807 string field;
5808 cmd_getval(cmdmap, "property", field);
5809 count_metadata(field, f.get());
5810 f->flush(rdata);
5811 r = 0;
5812 } else if (prefix == "osd numa-status") {
5813 TextTable tbl;
5814 if (f) {
5815 f->open_array_section("osds");
5816 } else {
5817 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5818 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5819 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5820 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5821 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5822 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5823 }
5824 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5825 if (osdmap.exists(i)) {
5826 map<string,string> m;
5827 ostringstream err;
5828 if (load_metadata(i, m, &err) < 0) {
5829 continue;
5830 }
5831 string host;
5832 auto p = m.find("hostname");
5833 if (p != m.end()) {
5834 host = p->second;
5835 }
5836 if (f) {
5837 f->open_object_section("osd");
5838 f->dump_int("osd", i);
5839 f->dump_string("host", host);
5840 for (auto n : { "network_numa_node", "objectstore_numa_node",
5841 "numa_node" }) {
5842 p = m.find(n);
5843 if (p != m.end()) {
5844 f->dump_int(n, atoi(p->second.c_str()));
5845 }
5846 }
5847 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5848 p = m.find(n);
5849 if (p != m.end()) {
5850 list<string> ls = get_str_list(p->second, ",");
5851 f->open_array_section(n);
5852 for (auto node : ls) {
5853 f->dump_int("node", atoi(node.c_str()));
5854 }
5855 f->close_section();
5856 }
5857 }
5858 for (auto n : { "numa_node_cpus" }) {
5859 p = m.find(n);
5860 if (p != m.end()) {
5861 dump_cpu_list(f.get(), n, p->second);
5862 }
5863 }
5864 f->close_section();
5865 } else {
5866 tbl << i;
5867 tbl << host;
5868 p = m.find("network_numa_nodes");
5869 if (p != m.end()) {
5870 tbl << p->second;
5871 } else {
5872 tbl << "-";
5873 }
5874 p = m.find("objectstore_numa_nodes");
5875 if (p != m.end()) {
5876 tbl << p->second;
5877 } else {
5878 tbl << "-";
5879 }
5880 p = m.find("numa_node");
5881 auto q = m.find("numa_node_cpus");
5882 if (p != m.end() && q != m.end()) {
5883 tbl << p->second;
5884 tbl << q->second;
5885 } else {
5886 tbl << "-";
5887 tbl << "-";
5888 }
5889 tbl << TextTable::endrow;
5890 }
5891 }
5892 }
5893 if (f) {
5894 f->close_section();
5895 f->flush(rdata);
5896 } else {
5897 rdata.append(stringify(tbl));
5898 }
5899 } else if (prefix == "osd map") {
5900 string poolstr, objstr, namespacestr;
5901 cmd_getval(cmdmap, "pool", poolstr);
5902 cmd_getval(cmdmap, "object", objstr);
5903 cmd_getval(cmdmap, "nspace", namespacestr);
5904
5905 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5906 if (pool < 0) {
5907 ss << "pool " << poolstr << " does not exist";
5908 r = -ENOENT;
5909 goto reply;
5910 }
5911 object_locator_t oloc(pool, namespacestr);
5912 object_t oid(objstr);
5913 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5914 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5915 vector<int> up, acting;
5916 int up_p, acting_p;
5917 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5918
5919 string fullobjname;
5920 if (!namespacestr.empty())
5921 fullobjname = namespacestr + string("/") + oid.name;
5922 else
5923 fullobjname = oid.name;
5924 if (f) {
5925 f->open_object_section("osd_map");
5926 f->dump_unsigned("epoch", osdmap.get_epoch());
5927 f->dump_string("pool", poolstr);
5928 f->dump_int("pool_id", pool);
5929 f->dump_stream("objname") << fullobjname;
5930 f->dump_stream("raw_pgid") << pgid;
5931 f->dump_stream("pgid") << mpgid;
5932 f->open_array_section("up");
5933 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5934 f->dump_int("osd", *p);
5935 f->close_section();
5936 f->dump_int("up_primary", up_p);
5937 f->open_array_section("acting");
5938 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5939 f->dump_int("osd", *p);
5940 f->close_section();
5941 f->dump_int("acting_primary", acting_p);
5942 f->close_section(); // osd_map
5943 f->flush(rdata);
5944 } else {
5945 ds << "osdmap e" << osdmap.get_epoch()
5946 << " pool '" << poolstr << "' (" << pool << ")"
5947 << " object '" << fullobjname << "' ->"
5948 << " pg " << pgid << " (" << mpgid << ")"
5949 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5950 << pg_vector_string(acting) << ", p" << acting_p << ")";
5951 rdata.append(ds);
5952 }
5953
5954 } else if (prefix == "pg map") {
5955 pg_t pgid;
5956 string pgidstr;
5957 cmd_getval(cmdmap, "pgid", pgidstr);
5958 if (!pgid.parse(pgidstr.c_str())) {
5959 ss << "invalid pgid '" << pgidstr << "'";
5960 r = -EINVAL;
5961 goto reply;
5962 }
5963 vector<int> up, acting;
5964 if (!osdmap.have_pg_pool(pgid.pool())) {
5965 ss << "pg '" << pgidstr << "' does not exist";
5966 r = -ENOENT;
5967 goto reply;
5968 }
5969 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5970 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5971 if (f) {
5972 f->open_object_section("pg_map");
5973 f->dump_unsigned("epoch", osdmap.get_epoch());
5974 f->dump_stream("raw_pgid") << pgid;
5975 f->dump_stream("pgid") << mpgid;
5976 f->open_array_section("up");
5977 for (auto osd : up) {
5978 f->dump_int("up_osd", osd);
5979 }
5980 f->close_section();
5981 f->open_array_section("acting");
5982 for (auto osd : acting) {
5983 f->dump_int("acting_osd", osd);
5984 }
5985 f->close_section();
5986 f->close_section();
5987 f->flush(rdata);
5988 } else {
5989 ds << "osdmap e" << osdmap.get_epoch()
5990 << " pg " << pgid << " (" << mpgid << ")"
5991 << " -> up " << up << " acting " << acting;
5992 rdata.append(ds);
5993 }
5994 goto reply;
5995
5996 } else if (prefix == "osd lspools") {
5997 if (f)
5998 f->open_array_section("pools");
5999 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
6000 p != osdmap.pools.end();
6001 ++p) {
6002 if (f) {
6003 f->open_object_section("pool");
6004 f->dump_int("poolnum", p->first);
6005 f->dump_string("poolname", osdmap.pool_name[p->first]);
6006 f->close_section();
6007 } else {
6008 ds << p->first << ' ' << osdmap.pool_name[p->first];
6009 if (next(p) != osdmap.pools.end()) {
6010 ds << '\n';
6011 }
6012 }
6013 }
6014 if (f) {
6015 f->close_section();
6016 f->flush(ds);
6017 }
6018 rdata.append(ds);
6019 } else if (prefix == "osd blocklist ls" ||
6020 prefix == "osd blacklist ls") {
6021 if (f)
6022 f->open_array_section("blocklist");
6023
6024 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6025 p != osdmap.blocklist.end();
6026 ++p) {
6027 if (f) {
6028 f->open_object_section("entry");
6029 f->dump_string("addr", p->first.get_legacy_str());
6030 f->dump_stream("until") << p->second;
6031 f->close_section();
6032 } else {
6033 stringstream ss;
6034 string s;
6035 ss << p->first << " " << p->second;
6036 getline(ss, s);
6037 s += "\n";
6038 rdata.append(s);
6039 }
6040 }
6041 if (f) {
6042 f->close_section();
6043 f->flush(rdata);
6044 }
6045 if (f)
6046 f->open_array_section("range_blocklist");
6047
6048 for (auto p = osdmap.range_blocklist.begin();
6049 p != osdmap.range_blocklist.end();
6050 ++p) {
6051 if (f) {
6052 f->open_object_section("entry");
6053 f->dump_string("range", p->first.get_legacy_str());
6054 f->dump_stream("until") << p->second;
6055 f->close_section();
6056 } else {
6057 stringstream ss;
6058 string s;
6059 ss << p->first << " " << p->second;
6060 getline(ss, s);
6061 s += "\n";
6062 rdata.append(s);
6063 }
6064 }
6065 if (f) {
6066 f->close_section();
6067 f->flush(rdata);
6068 }
6069 ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
6070
6071 } else if (prefix == "osd pool ls") {
6072 string detail;
6073 cmd_getval(cmdmap, "detail", detail);
6074 if (!f && detail == "detail") {
6075 ostringstream ss;
6076 osdmap.print_pools(ss);
6077 rdata.append(ss.str());
6078 } else {
6079 if (f)
6080 f->open_array_section("pools");
6081 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
6082 it != osdmap.get_pools().end();
6083 ++it) {
6084 if (f) {
6085 if (detail == "detail") {
6086 f->open_object_section("pool");
6087 f->dump_int("pool_id", it->first);
6088 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6089 it->second.dump(f.get());
6090 f->close_section();
6091 } else {
6092 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6093 }
6094 } else {
6095 rdata.append(osdmap.get_pool_name(it->first) + "\n");
6096 }
6097 }
6098 if (f) {
6099 f->close_section();
6100 f->flush(rdata);
6101 }
6102 }
6103
6104 } else if (prefix == "osd crush get-tunable") {
6105 string tunable;
6106 cmd_getval(cmdmap, "tunable", tunable);
6107 ostringstream rss;
6108 if (f)
6109 f->open_object_section("tunable");
6110 if (tunable == "straw_calc_version") {
6111 if (f)
6112 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6113 else
6114 rss << osdmap.crush->get_straw_calc_version() << "\n";
6115 } else {
6116 r = -EINVAL;
6117 goto reply;
6118 }
6119 if (f) {
6120 f->close_section();
6121 f->flush(rdata);
6122 } else {
6123 rdata.append(rss.str());
6124 }
6125 r = 0;
6126
6127 } else if (prefix == "osd pool get") {
6128 string poolstr;
6129 cmd_getval(cmdmap, "pool", poolstr);
6130 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6131 if (pool < 0) {
6132 ss << "unrecognized pool '" << poolstr << "'";
6133 r = -ENOENT;
6134 goto reply;
6135 }
6136
6137 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6138 string var;
6139 cmd_getval(cmdmap, "var", var);
6140
6141 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6142 const choices_map_t ALL_CHOICES = {
6143 {"size", SIZE},
6144 {"min_size", MIN_SIZE},
6145 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6146 {"crush_rule", CRUSH_RULE},
6147 {"hashpspool", HASHPSPOOL},
6148 {"eio", POOL_EIO},
6149 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6150 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6151 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6152 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6153 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6154 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6155 {"use_gmt_hitset", USE_GMT_HITSET},
6156 {"target_max_objects", TARGET_MAX_OBJECTS},
6157 {"target_max_bytes", TARGET_MAX_BYTES},
6158 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6159 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6160 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6161 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6162 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6163 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6164 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6165 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6166 {"fast_read", FAST_READ},
6167 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6168 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6169 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6170 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6171 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6172 {"recovery_priority", RECOVERY_PRIORITY},
6173 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6174 {"scrub_priority", SCRUB_PRIORITY},
6175 {"compression_mode", COMPRESSION_MODE},
6176 {"compression_algorithm", COMPRESSION_ALGORITHM},
6177 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6178 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6179 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6180 {"csum_type", CSUM_TYPE},
6181 {"csum_max_block", CSUM_MAX_BLOCK},
6182 {"csum_min_block", CSUM_MIN_BLOCK},
6183 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6184 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6185 {"pg_num_min", PG_NUM_MIN},
6186 {"pg_num_max", PG_NUM_MAX},
6187 {"target_size_bytes", TARGET_SIZE_BYTES},
6188 {"target_size_ratio", TARGET_SIZE_RATIO},
6189 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6190 {"dedup_tier", DEDUP_TIER},
6191 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6192 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
6193 {"bulk", BULK}
6194 };
6195
6196 typedef std::set<osd_pool_get_choices> choices_set_t;
6197
6198 const choices_set_t ONLY_TIER_CHOICES = {
6199 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6200 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6201 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6202 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6203 MIN_READ_RECENCY_FOR_PROMOTE,
6204 MIN_WRITE_RECENCY_FOR_PROMOTE,
6205 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6206 };
6207 const choices_set_t ONLY_ERASURE_CHOICES = {
6208 EC_OVERWRITES, ERASURE_CODE_PROFILE
6209 };
6210
6211 choices_set_t selected_choices;
6212 if (var == "all") {
6213 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6214 it != ALL_CHOICES.end(); ++it) {
6215 selected_choices.insert(it->second);
6216 }
6217
6218 if(!p->is_tier()) {
6219 selected_choices = subtract_second_from_first(selected_choices,
6220 ONLY_TIER_CHOICES);
6221 }
6222
6223 if(!p->is_erasure()) {
6224 selected_choices = subtract_second_from_first(selected_choices,
6225 ONLY_ERASURE_CHOICES);
6226 }
6227 } else /* var != "all" */ {
6228 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6229 if (found == ALL_CHOICES.end()) {
6230 ss << "pool '" << poolstr
6231 << "': invalid variable: '" << var << "'";
6232 r = -EINVAL;
6233 goto reply;
6234 }
6235
6236 osd_pool_get_choices selected = found->second;
6237
6238 if (!p->is_tier() &&
6239 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6240 ss << "pool '" << poolstr
6241 << "' is not a tier pool: variable not applicable";
6242 r = -EACCES;
6243 goto reply;
6244 }
6245
6246 if (!p->is_erasure() &&
6247 ONLY_ERASURE_CHOICES.find(selected)
6248 != ONLY_ERASURE_CHOICES.end()) {
6249 ss << "pool '" << poolstr
6250 << "' is not a erasure pool: variable not applicable";
6251 r = -EACCES;
6252 goto reply;
6253 }
6254
6255 if (pool_opts_t::is_opt_name(var) &&
6256 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6257 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6258 r = -ENOENT;
6259 goto reply;
6260 }
6261
6262 selected_choices.insert(selected);
6263 }
6264
6265 if (f) {
6266 f->open_object_section("pool");
6267 f->dump_string("pool", poolstr);
6268 f->dump_int("pool_id", pool);
6269 for(choices_set_t::const_iterator it = selected_choices.begin();
6270 it != selected_choices.end(); ++it) {
6271 choices_map_t::const_iterator i;
6272 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6273 if (i->second == *it) {
6274 break;
6275 }
6276 }
6277 ceph_assert(i != ALL_CHOICES.end());
6278 switch(*it) {
6279 case PG_NUM:
6280 f->dump_int("pg_num", p->get_pg_num());
6281 break;
6282 case PGP_NUM:
6283 f->dump_int("pgp_num", p->get_pgp_num());
6284 break;
6285 case SIZE:
6286 f->dump_int("size", p->get_size());
6287 break;
6288 case MIN_SIZE:
6289 f->dump_int("min_size", p->get_min_size());
6290 break;
6291 case CRUSH_RULE:
6292 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6293 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6294 p->get_crush_rule()));
6295 } else {
6296 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6297 }
6298 break;
6299 case EC_OVERWRITES:
6300 f->dump_bool("allow_ec_overwrites",
6301 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6302 break;
6303 case PG_AUTOSCALE_MODE:
6304 f->dump_string("pg_autoscale_mode",
6305 pg_pool_t::get_pg_autoscale_mode_name(
6306 p->pg_autoscale_mode));
6307 break;
6308 case HASHPSPOOL:
6309 case POOL_EIO:
6310 case NODELETE:
6311 case BULK:
6312 case NOPGCHANGE:
6313 case NOSIZECHANGE:
6314 case WRITE_FADVISE_DONTNEED:
6315 case NOSCRUB:
6316 case NODEEP_SCRUB:
6317 f->dump_bool(i->first.c_str(),
6318 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6319 break;
6320 case HIT_SET_PERIOD:
6321 f->dump_int("hit_set_period", p->hit_set_period);
6322 break;
6323 case HIT_SET_COUNT:
6324 f->dump_int("hit_set_count", p->hit_set_count);
6325 break;
6326 case HIT_SET_TYPE:
6327 f->dump_string("hit_set_type",
6328 HitSet::get_type_name(p->hit_set_params.get_type()));
6329 break;
6330 case HIT_SET_FPP:
6331 {
6332 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6333 BloomHitSet::Params *bloomp =
6334 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6335 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6336 } else if(var != "all") {
6337 f->close_section();
6338 ss << "hit set is not of type Bloom; " <<
6339 "invalid to get a false positive rate!";
6340 r = -EINVAL;
6341 goto reply;
6342 }
6343 }
6344 break;
6345 case USE_GMT_HITSET:
6346 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6347 break;
6348 case TARGET_MAX_OBJECTS:
6349 f->dump_unsigned("target_max_objects", p->target_max_objects);
6350 break;
6351 case TARGET_MAX_BYTES:
6352 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6353 break;
6354 case CACHE_TARGET_DIRTY_RATIO:
6355 f->dump_unsigned("cache_target_dirty_ratio_micro",
6356 p->cache_target_dirty_ratio_micro);
6357 f->dump_float("cache_target_dirty_ratio",
6358 ((float)p->cache_target_dirty_ratio_micro/1000000));
6359 break;
6360 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6361 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6362 p->cache_target_dirty_high_ratio_micro);
6363 f->dump_float("cache_target_dirty_high_ratio",
6364 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6365 break;
6366 case CACHE_TARGET_FULL_RATIO:
6367 f->dump_unsigned("cache_target_full_ratio_micro",
6368 p->cache_target_full_ratio_micro);
6369 f->dump_float("cache_target_full_ratio",
6370 ((float)p->cache_target_full_ratio_micro/1000000));
6371 break;
6372 case CACHE_MIN_FLUSH_AGE:
6373 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6374 break;
6375 case CACHE_MIN_EVICT_AGE:
6376 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6377 break;
6378 case ERASURE_CODE_PROFILE:
6379 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6380 break;
6381 case MIN_READ_RECENCY_FOR_PROMOTE:
6382 f->dump_int("min_read_recency_for_promote",
6383 p->min_read_recency_for_promote);
6384 break;
6385 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6386 f->dump_int("min_write_recency_for_promote",
6387 p->min_write_recency_for_promote);
6388 break;
6389 case FAST_READ:
6390 f->dump_int("fast_read", p->fast_read);
6391 break;
6392 case HIT_SET_GRADE_DECAY_RATE:
6393 f->dump_int("hit_set_grade_decay_rate",
6394 p->hit_set_grade_decay_rate);
6395 break;
6396 case HIT_SET_SEARCH_LAST_N:
6397 f->dump_int("hit_set_search_last_n",
6398 p->hit_set_search_last_n);
6399 break;
6400 case SCRUB_MIN_INTERVAL:
6401 case SCRUB_MAX_INTERVAL:
6402 case DEEP_SCRUB_INTERVAL:
6403 case RECOVERY_PRIORITY:
6404 case RECOVERY_OP_PRIORITY:
6405 case SCRUB_PRIORITY:
6406 case COMPRESSION_MODE:
6407 case COMPRESSION_ALGORITHM:
6408 case COMPRESSION_REQUIRED_RATIO:
6409 case COMPRESSION_MAX_BLOB_SIZE:
6410 case COMPRESSION_MIN_BLOB_SIZE:
6411 case CSUM_TYPE:
6412 case CSUM_MAX_BLOCK:
6413 case CSUM_MIN_BLOCK:
6414 case FINGERPRINT_ALGORITHM:
6415 case PG_NUM_MIN:
6416 case PG_NUM_MAX:
6417 case TARGET_SIZE_BYTES:
6418 case TARGET_SIZE_RATIO:
6419 case PG_AUTOSCALE_BIAS:
6420 case DEDUP_TIER:
6421 case DEDUP_CHUNK_ALGORITHM:
6422 case DEDUP_CDC_CHUNK_SIZE:
6423 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6424 if (p->opts.is_set(key)) {
6425 if(*it == CSUM_TYPE) {
6426 int64_t val;
6427 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6428 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6429 } else {
6430 p->opts.dump(i->first, f.get());
6431 }
6432 }
6433 break;
6434 }
6435 }
6436 f->close_section();
6437 f->flush(rdata);
6438 } else /* !f */ {
6439 for(choices_set_t::const_iterator it = selected_choices.begin();
6440 it != selected_choices.end(); ++it) {
6441 choices_map_t::const_iterator i;
6442 switch(*it) {
6443 case PG_NUM:
6444 ss << "pg_num: " << p->get_pg_num() << "\n";
6445 break;
6446 case PGP_NUM:
6447 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6448 break;
6449 case SIZE:
6450 ss << "size: " << p->get_size() << "\n";
6451 break;
6452 case MIN_SIZE:
6453 ss << "min_size: " << p->get_min_size() << "\n";
6454 break;
6455 case CRUSH_RULE:
6456 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6457 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6458 p->get_crush_rule()) << "\n";
6459 } else {
6460 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6461 }
6462 break;
6463 case PG_AUTOSCALE_MODE:
6464 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6465 p->pg_autoscale_mode) <<"\n";
6466 break;
6467 case HIT_SET_PERIOD:
6468 ss << "hit_set_period: " << p->hit_set_period << "\n";
6469 break;
6470 case HIT_SET_COUNT:
6471 ss << "hit_set_count: " << p->hit_set_count << "\n";
6472 break;
6473 case HIT_SET_TYPE:
6474 ss << "hit_set_type: " <<
6475 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6476 break;
6477 case HIT_SET_FPP:
6478 {
6479 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6480 BloomHitSet::Params *bloomp =
6481 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6482 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6483 } else if(var != "all") {
6484 ss << "hit set is not of type Bloom; " <<
6485 "invalid to get a false positive rate!";
6486 r = -EINVAL;
6487 goto reply;
6488 }
6489 }
6490 break;
6491 case USE_GMT_HITSET:
6492 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6493 break;
6494 case TARGET_MAX_OBJECTS:
6495 ss << "target_max_objects: " << p->target_max_objects << "\n";
6496 break;
6497 case TARGET_MAX_BYTES:
6498 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6499 break;
6500 case CACHE_TARGET_DIRTY_RATIO:
6501 ss << "cache_target_dirty_ratio: "
6502 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6503 break;
6504 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6505 ss << "cache_target_dirty_high_ratio: "
6506 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6507 break;
6508 case CACHE_TARGET_FULL_RATIO:
6509 ss << "cache_target_full_ratio: "
6510 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6511 break;
6512 case CACHE_MIN_FLUSH_AGE:
6513 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6514 break;
6515 case CACHE_MIN_EVICT_AGE:
6516 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6517 break;
6518 case ERASURE_CODE_PROFILE:
6519 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6520 break;
6521 case MIN_READ_RECENCY_FOR_PROMOTE:
6522 ss << "min_read_recency_for_promote: " <<
6523 p->min_read_recency_for_promote << "\n";
6524 break;
6525 case HIT_SET_GRADE_DECAY_RATE:
6526 ss << "hit_set_grade_decay_rate: " <<
6527 p->hit_set_grade_decay_rate << "\n";
6528 break;
6529 case HIT_SET_SEARCH_LAST_N:
6530 ss << "hit_set_search_last_n: " <<
6531 p->hit_set_search_last_n << "\n";
6532 break;
6533 case EC_OVERWRITES:
6534 ss << "allow_ec_overwrites: " <<
6535 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6536 "\n";
6537 break;
6538 case HASHPSPOOL:
6539 case POOL_EIO:
6540 case NODELETE:
6541 case BULK:
6542 case NOPGCHANGE:
6543 case NOSIZECHANGE:
6544 case WRITE_FADVISE_DONTNEED:
6545 case NOSCRUB:
6546 case NODEEP_SCRUB:
6547 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6548 if (i->second == *it)
6549 break;
6550 }
6551 ceph_assert(i != ALL_CHOICES.end());
6552 ss << i->first << ": " <<
6553 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6554 "true" : "false") << "\n";
6555 break;
6556 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6557 ss << "min_write_recency_for_promote: " <<
6558 p->min_write_recency_for_promote << "\n";
6559 break;
6560 case FAST_READ:
6561 ss << "fast_read: " << p->fast_read << "\n";
6562 break;
6563 case SCRUB_MIN_INTERVAL:
6564 case SCRUB_MAX_INTERVAL:
6565 case DEEP_SCRUB_INTERVAL:
6566 case RECOVERY_PRIORITY:
6567 case RECOVERY_OP_PRIORITY:
6568 case SCRUB_PRIORITY:
6569 case COMPRESSION_MODE:
6570 case COMPRESSION_ALGORITHM:
6571 case COMPRESSION_REQUIRED_RATIO:
6572 case COMPRESSION_MAX_BLOB_SIZE:
6573 case COMPRESSION_MIN_BLOB_SIZE:
6574 case CSUM_TYPE:
6575 case CSUM_MAX_BLOCK:
6576 case CSUM_MIN_BLOCK:
6577 case FINGERPRINT_ALGORITHM:
6578 case PG_NUM_MIN:
6579 case PG_NUM_MAX:
6580 case TARGET_SIZE_BYTES:
6581 case TARGET_SIZE_RATIO:
6582 case PG_AUTOSCALE_BIAS:
6583 case DEDUP_TIER:
6584 case DEDUP_CHUNK_ALGORITHM:
6585 case DEDUP_CDC_CHUNK_SIZE:
6586 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6587 if (i->second == *it)
6588 break;
6589 }
6590 ceph_assert(i != ALL_CHOICES.end());
6591 {
6592 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6593 if (p->opts.is_set(key)) {
6594 if(key == pool_opts_t::CSUM_TYPE) {
6595 int64_t val;
6596 p->opts.get(key, &val);
6597 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6598 } else {
6599 ss << i->first << ": " << p->opts.get(key) << "\n";
6600 }
6601 }
6602 }
6603 break;
6604 }
6605 rdata.append(ss.str());
6606 ss.str("");
6607 }
6608 }
6609 r = 0;
6610 } else if (prefix == "osd pool get-quota") {
6611 string pool_name;
6612 cmd_getval(cmdmap, "pool", pool_name);
6613
6614 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6615 if (poolid < 0) {
6616 ceph_assert(poolid == -ENOENT);
6617 ss << "unrecognized pool '" << pool_name << "'";
6618 r = -ENOENT;
6619 goto reply;
6620 }
6621 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6622 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
6623 if (!pstat) {
6624 ss << "no stats for pool '" << pool_name << "'";
6625 r = -ENOENT;
6626 goto reply;
6627 }
6628 const object_stat_sum_t& sum = pstat->stats.sum;
6629 if (f) {
6630 f->open_object_section("pool_quotas");
6631 f->dump_string("pool_name", pool_name);
6632 f->dump_unsigned("pool_id", poolid);
6633 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6634 f->dump_int("current_num_objects", sum.num_objects);
6635 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6636 f->dump_int("current_num_bytes", sum.num_bytes);
6637 f->close_section();
6638 f->flush(rdata);
6639 } else {
6640 stringstream rs;
6641 rs << "quotas for pool '" << pool_name << "':\n"
6642 << " max objects: ";
6643 if (p->quota_max_objects == 0)
6644 rs << "N/A";
6645 else {
6646 rs << si_u_t(p->quota_max_objects) << " objects";
6647 rs << " (current num objects: " << sum.num_objects << " objects)";
6648 }
6649 rs << "\n"
6650 << " max bytes : ";
6651 if (p->quota_max_bytes == 0)
6652 rs << "N/A";
6653 else {
6654 rs << byte_u_t(p->quota_max_bytes);
6655 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6656 }
6657 rdata.append(rs.str());
6658 }
6659 rdata.append("\n");
6660 r = 0;
6661 } else if (prefix == "osd crush rule list" ||
6662 prefix == "osd crush rule ls") {
6663 if (f) {
6664 f->open_array_section("rules");
6665 osdmap.crush->list_rules(f.get());
6666 f->close_section();
6667 f->flush(rdata);
6668 } else {
6669 ostringstream ss;
6670 osdmap.crush->list_rules(&ss);
6671 rdata.append(ss.str());
6672 }
6673 } else if (prefix == "osd crush rule ls-by-class") {
6674 string class_name;
6675 cmd_getval(cmdmap, "class", class_name);
6676 if (class_name.empty()) {
6677 ss << "no class specified";
6678 r = -EINVAL;
6679 goto reply;
6680 }
6681 set<int> rules;
6682 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6683 if (r < 0) {
6684 ss << "failed to get rules by class '" << class_name << "'";
6685 goto reply;
6686 }
6687 if (f) {
6688 f->open_array_section("rules");
6689 for (auto &rule: rules) {
6690 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6691 }
6692 f->close_section();
6693 f->flush(rdata);
6694 } else {
6695 ostringstream rs;
6696 for (auto &rule: rules) {
6697 rs << osdmap.crush->get_rule_name(rule) << "\n";
6698 }
6699 rdata.append(rs.str());
6700 }
6701 } else if (prefix == "osd crush rule dump") {
6702 string name;
6703 cmd_getval(cmdmap, "name", name);
6704 string format;
6705 cmd_getval(cmdmap, "format", format);
6706 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6707 if (name == "") {
6708 f->open_array_section("rules");
6709 osdmap.crush->dump_rules(f.get());
6710 f->close_section();
6711 } else {
6712 int ruleno = osdmap.crush->get_rule_id(name);
6713 if (ruleno < 0) {
6714 ss << "unknown crush rule '" << name << "'";
6715 r = ruleno;
6716 goto reply;
6717 }
6718 osdmap.crush->dump_rule(ruleno, f.get());
6719 }
6720 ostringstream rs;
6721 f->flush(rs);
6722 rs << "\n";
6723 rdata.append(rs.str());
6724 } else if (prefix == "osd crush dump") {
6725 string format;
6726 cmd_getval(cmdmap, "format", format);
6727 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6728 f->open_object_section("crush_map");
6729 osdmap.crush->dump(f.get());
6730 f->close_section();
6731 ostringstream rs;
6732 f->flush(rs);
6733 rs << "\n";
6734 rdata.append(rs.str());
6735 } else if (prefix == "osd crush show-tunables") {
6736 string format;
6737 cmd_getval(cmdmap, "format", format);
6738 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6739 f->open_object_section("crush_map_tunables");
6740 osdmap.crush->dump_tunables(f.get());
6741 f->close_section();
6742 ostringstream rs;
6743 f->flush(rs);
6744 rs << "\n";
6745 rdata.append(rs.str());
6746 } else if (prefix == "osd crush tree") {
6747 bool show_shadow = false;
6748 if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
6749 std::string shadow;
6750 if (cmd_getval(cmdmap, "shadow", shadow) &&
6751 shadow == "--show-shadow") {
6752 show_shadow = true;
6753 }
6754 }
6755 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6756 if (f) {
6757 f->open_object_section("crush_tree");
6758 osdmap.crush->dump_tree(nullptr,
6759 f.get(),
6760 osdmap.get_pool_names(),
6761 show_shadow);
6762 f->close_section();
6763 f->flush(rdata);
6764 } else {
6765 ostringstream ss;
6766 osdmap.crush->dump_tree(&ss,
6767 nullptr,
6768 osdmap.get_pool_names(),
6769 show_shadow);
6770 rdata.append(ss.str());
6771 }
6772 } else if (prefix == "osd crush ls") {
6773 string name;
6774 if (!cmd_getval(cmdmap, "node", name)) {
6775 ss << "no node specified";
6776 r = -EINVAL;
6777 goto reply;
6778 }
6779 if (!osdmap.crush->name_exists(name)) {
6780 ss << "node '" << name << "' does not exist";
6781 r = -ENOENT;
6782 goto reply;
6783 }
6784 int id = osdmap.crush->get_item_id(name);
6785 list<int> result;
6786 if (id >= 0) {
6787 result.push_back(id);
6788 } else {
6789 int num = osdmap.crush->get_bucket_size(id);
6790 for (int i = 0; i < num; ++i) {
6791 result.push_back(osdmap.crush->get_bucket_item(id, i));
6792 }
6793 }
6794 if (f) {
6795 f->open_array_section("items");
6796 for (auto i : result) {
6797 f->dump_string("item", osdmap.crush->get_item_name(i));
6798 }
6799 f->close_section();
6800 f->flush(rdata);
6801 } else {
6802 ostringstream ss;
6803 for (auto i : result) {
6804 ss << osdmap.crush->get_item_name(i) << "\n";
6805 }
6806 rdata.append(ss.str());
6807 }
6808 r = 0;
6809 } else if (prefix == "osd crush class ls") {
6810 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6811 f->open_array_section("crush_classes");
6812 for (auto i : osdmap.crush->class_name)
6813 f->dump_string("class", i.second);
6814 f->close_section();
6815 f->flush(rdata);
6816 } else if (prefix == "osd crush class ls-osd") {
6817 string name;
6818 cmd_getval(cmdmap, "class", name);
6819 set<int> osds;
6820 osdmap.crush->get_devices_by_class(name, &osds);
6821 if (f) {
6822 f->open_array_section("osds");
6823 for (auto &osd: osds)
6824 f->dump_int("osd", osd);
6825 f->close_section();
6826 f->flush(rdata);
6827 } else {
6828 bool first = true;
6829 for (auto &osd : osds) {
6830 if (!first)
6831 ds << "\n";
6832 first = false;
6833 ds << osd;
6834 }
6835 rdata.append(ds);
6836 }
6837 } else if (prefix == "osd crush get-device-class") {
6838 vector<string> idvec;
6839 cmd_getval(cmdmap, "ids", idvec);
6840 map<int, string> class_by_osd;
6841 for (auto& id : idvec) {
6842 ostringstream ts;
6843 long osd = parse_osd_id(id.c_str(), &ts);
6844 if (osd < 0) {
6845 ss << "unable to parse osd id:'" << id << "'";
6846 r = -EINVAL;
6847 goto reply;
6848 }
6849 auto device_class = osdmap.crush->get_item_class(osd);
6850 if (device_class)
6851 class_by_osd[osd] = device_class;
6852 else
6853 class_by_osd[osd] = ""; // no class
6854 }
6855 if (f) {
6856 f->open_array_section("osd_device_classes");
6857 for (auto& i : class_by_osd) {
6858 f->open_object_section("osd_device_class");
6859 f->dump_int("osd", i.first);
6860 f->dump_string("device_class", i.second);
6861 f->close_section();
6862 }
6863 f->close_section();
6864 f->flush(rdata);
6865 } else {
6866 if (class_by_osd.size() == 1) {
6867 // for single input, make a clean output
6868 ds << class_by_osd.begin()->second;
6869 } else {
6870 // note that we do not group osds by class here
6871 for (auto it = class_by_osd.begin();
6872 it != class_by_osd.end();
6873 it++) {
6874 ds << "osd." << it->first << ' ' << it->second;
6875 if (next(it) != class_by_osd.end())
6876 ds << '\n';
6877 }
6878 }
6879 rdata.append(ds);
6880 }
6881 } else if (prefix == "osd erasure-code-profile ls") {
6882 const auto &profiles = osdmap.get_erasure_code_profiles();
6883 if (f)
6884 f->open_array_section("erasure-code-profiles");
6885 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6886 if (f)
6887 f->dump_string("profile", i->first.c_str());
6888 else
6889 rdata.append(i->first + "\n");
6890 }
6891 if (f) {
6892 f->close_section();
6893 ostringstream rs;
6894 f->flush(rs);
6895 rs << "\n";
6896 rdata.append(rs.str());
6897 }
6898 } else if (prefix == "osd crush weight-set ls") {
6899 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6900 if (f) {
6901 f->open_array_section("weight_sets");
6902 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6903 f->dump_string("pool", "(compat)");
6904 }
6905 for (auto& i : osdmap.crush->choose_args) {
6906 if (i.first >= 0) {
6907 f->dump_string("pool", osdmap.get_pool_name(i.first));
6908 }
6909 }
6910 f->close_section();
6911 f->flush(rdata);
6912 } else {
6913 ostringstream rs;
6914 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6915 rs << "(compat)\n";
6916 }
6917 for (auto& i : osdmap.crush->choose_args) {
6918 if (i.first >= 0) {
6919 rs << osdmap.get_pool_name(i.first) << "\n";
6920 }
6921 }
6922 rdata.append(rs.str());
6923 }
6924 } else if (prefix == "osd crush weight-set dump") {
6925 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6926 "json-pretty"));
6927 osdmap.crush->dump_choose_args(f.get());
6928 f->flush(rdata);
6929 } else if (prefix == "osd erasure-code-profile get") {
6930 string name;
6931 cmd_getval(cmdmap, "name", name);
6932 if (!osdmap.has_erasure_code_profile(name)) {
6933 ss << "unknown erasure code profile '" << name << "'";
6934 r = -ENOENT;
6935 goto reply;
6936 }
6937 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6938 if (f)
6939 f->open_object_section("profile");
6940 for (map<string,string>::const_iterator i = profile.begin();
6941 i != profile.end();
6942 ++i) {
6943 if (f)
6944 f->dump_string(i->first.c_str(), i->second.c_str());
6945 else
6946 rdata.append(i->first + "=" + i->second + "\n");
6947 }
6948 if (f) {
6949 f->close_section();
6950 ostringstream rs;
6951 f->flush(rs);
6952 rs << "\n";
6953 rdata.append(rs.str());
6954 }
6955 } else if (prefix == "osd pool application get") {
6956 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6957 "json-pretty"));
6958 string pool_name;
6959 cmd_getval(cmdmap, "pool", pool_name);
6960 string app;
6961 cmd_getval(cmdmap, "app", app);
6962 string key;
6963 cmd_getval(cmdmap, "key", key);
6964
6965 if (pool_name.empty()) {
6966 // all
6967 f->open_object_section("pools");
6968 for (const auto &pool : osdmap.pools) {
6969 std::string name("<unknown>");
6970 const auto &pni = osdmap.pool_name.find(pool.first);
6971 if (pni != osdmap.pool_name.end())
6972 name = pni->second;
6973 f->open_object_section(name.c_str());
6974 for (auto &app_pair : pool.second.application_metadata) {
6975 f->open_object_section(app_pair.first.c_str());
6976 for (auto &kv_pair : app_pair.second) {
6977 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6978 }
6979 f->close_section();
6980 }
6981 f->close_section(); // name
6982 }
6983 f->close_section(); // pools
6984 f->flush(rdata);
6985 } else {
6986 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6987 if (pool < 0) {
6988 ss << "unrecognized pool '" << pool_name << "'";
6989 r = -ENOENT;
6990 goto reply;
6991 }
6992 auto p = osdmap.get_pg_pool(pool);
6993 // filter by pool
6994 if (app.empty()) {
6995 f->open_object_section(pool_name.c_str());
6996 for (auto &app_pair : p->application_metadata) {
6997 f->open_object_section(app_pair.first.c_str());
6998 for (auto &kv_pair : app_pair.second) {
6999 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7000 }
7001 f->close_section(); // application
7002 }
7003 f->close_section(); // pool_name
7004 f->flush(rdata);
7005 goto reply;
7006 }
7007
7008 auto app_it = p->application_metadata.find(app);
7009 if (app_it == p->application_metadata.end()) {
7010 ss << "pool '" << pool_name << "' has no application '" << app << "'";
7011 r = -ENOENT;
7012 goto reply;
7013 }
7014 // filter by pool + app
7015 if (key.empty()) {
7016 f->open_object_section(app_it->first.c_str());
7017 for (auto &kv_pair : app_it->second) {
7018 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7019 }
7020 f->close_section(); // application
7021 f->flush(rdata);
7022 goto reply;
7023 }
7024 // filter by pool + app + key
7025 auto key_it = app_it->second.find(key);
7026 if (key_it == app_it->second.end()) {
7027 ss << "application '" << app << "' on pool '" << pool_name
7028 << "' does not have key '" << key << "'";
7029 r = -ENOENT;
7030 goto reply;
7031 }
7032 ss << key_it->second << "\n";
7033 rdata.append(ss.str());
7034 ss.str("");
7035 }
7036 } else if (prefix == "osd get-require-min-compat-client") {
7037 ss << osdmap.require_min_compat_client << std::endl;
7038 rdata.append(ss.str());
7039 ss.str("");
7040 goto reply;
7041 } else if (prefix == "osd pool application enable" ||
7042 prefix == "osd pool application disable" ||
7043 prefix == "osd pool application set" ||
7044 prefix == "osd pool application rm") {
7045 bool changed = false;
7046 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7047 if (r != 0) {
7048 // Error, reply.
7049 goto reply;
7050 } else if (changed) {
7051 // Valid mutation, proceed to prepare phase
7052 return false;
7053 } else {
7054 // Idempotent case, reply
7055 goto reply;
7056 }
7057 } else {
7058 // try prepare update
7059 return false;
7060 }
7061
7062 reply:
7063 string rs;
7064 getline(ss, rs);
7065 mon.reply_command(op, r, rs, rdata, get_last_committed());
7066 return true;
7067 }
7068
7069 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7070 {
7071 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7072 osdmap.get_pg_pool(pool_id));
7073 ceph_assert(pool);
7074 pool->set_flag(flags);
7075 }
7076
7077 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7078 {
7079 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7080 osdmap.get_pg_pool(pool_id));
7081 ceph_assert(pool);
7082 pool->unset_flag(flags);
7083 }
7084
7085 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7086 {
7087 char k[80];
7088 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7089 return k;
7090 }
7091
7092 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7093 {
7094 char k[80];
7095 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7096 (unsigned long long)pool, (unsigned long long)snap);
7097 return k;
7098 }
7099
// Build the key and encode the value for a purged-snap record covering
// the interval [snap, snap+num).
//
// The value encodes (begin, end, epoch).  The key embeds the *last* snap
// id of the interval (snap+num-1) so that a forward-only lower_bound()
// scan from any snap id lands on the record that could contain it
// (see lookup_purged_snap()).
//
// @param pool   pool id the interval belongs to
// @param snap   first snap id in the purged interval
// @param num    number of snap ids in the interval
// @param epoch  epoch at which the purge was recorded
// @param v      [out] encoded value
// @return the kv key to store v under
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
7111
7112
// Look up the purged-snap record (if any) whose interval contains
// (pool, snap).
//
// Records are keyed by the *last* snap id of their interval (see
// make_purged_snap_key_value), so a lower_bound() on the key for `snap`
// lands on the only record that could contain it.
//
// @param pool   pool id to search
// @param snap   snap id to search for
// @param begin  [out] start of the containing purged interval
// @param end    [out] one past the last snap of the containing interval
// @return 0 if a containing interval was found, -ENOENT otherwise
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // ran off the end of the namespace entirely
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on some other record type in the OSD_SNAP_PREFIX namespace
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // lower_bound crossed into the next pool's records
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value encodes (begin, end[, epoch]); we only need the interval here
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // nearest record's interval does not actually cover `snap`
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7162
// Record the interval [start, end) as purged for `pool` in the
// OSD_SNAP_PREFIX kv namespace, coalescing with any adjacent
// already-purged intervals so the store keeps one record per maximal
// purged run.  Four cases, depending on whether an existing record
// abuts the new interval on the left (ends at start) and/or on the
// right (begins at end).
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // neighbor on the left: interval containing start-1, if any
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // neighbor on the right: interval containing end, if any
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends an earlier interval to the right; the key changes (it is
    // derived from the interval's last snap), so erase + re-put
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends a later interval to the left; the key (last snap) is
    // unchanged, so simply overwrite the record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // isolated interval: create a fresh record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7218
// Walk the mgr-reported purged snaps per pool and queue (in
// pending_inc.new_purged_snaps) the ranges that can now be pruned from
// the osdmap's removed_snaps_queue, budgeted by
// mon_max_snap_prune_per_epoch.
//
// @return true if anything was queued for pruning this epoch
bool OSDMonitor::try_prune_purged_snaps()
{
  // the mgr stat digest is the source of the purged-snap reports
  if (!mon.mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  // 0 means "no limit"; substitute a large finite cap
  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	// budget reached for this epoch
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  // nonzero prune count => we mutated pending_inc
  return !!actually_pruned;
}
7297
// Compare each pool's usage (from the mgr stat digest) against its
// quota_max_bytes / quota_max_objects and toggle the pool's
// FULL_QUOTA/FULL flags in pending_inc accordingly.
//
// @return true if any pool flag was changed (pending_inc was touched)
bool OSDMonitor::update_pools_status()
{
  // usage numbers come from the mgr; nothing to do until they're readable
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    // pools with no stats yet are skipped (no basis for a decision)
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a quota of 0 means "unlimited" for that dimension
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently flagged full-by-quota: clear the flags only once usage
      // has dropped back under quota
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently flagged: only act if the pool just went over quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7358
7359 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7360 {
7361 op->mark_osdmon_event(__func__);
7362 auto m = op->get_req<MPoolOp>();
7363 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7364 MonSession *session = op->get_session();
7365 if (!session)
7366 return -EPERM;
7367 string erasure_code_profile;
7368 stringstream ss;
7369 string rule_name;
7370 bool bulk = false;
7371 int ret = 0;
7372 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7373 0, 0, 0, 0, 0, 0, 0.0,
7374 erasure_code_profile,
7375 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7376 &ss);
7377
7378 if (ret < 0) {
7379 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7380 }
7381 return ret;
7382 }
7383
7384 int OSDMonitor::crush_rename_bucket(const string& srcname,
7385 const string& dstname,
7386 ostream *ss)
7387 {
7388 int ret;
7389 //
7390 // Avoid creating a pending crush if it does not already exists and
7391 // the rename would fail.
7392 //
7393 if (!_have_pending_crush()) {
7394 ret = _get_stable_crush().can_rename_bucket(srcname,
7395 dstname,
7396 ss);
7397 if (ret)
7398 return ret;
7399 }
7400
7401 CrushWrapper newcrush = _get_pending_crush();
7402
7403 ret = newcrush.rename_bucket(srcname,
7404 dstname,
7405 ss);
7406 if (ret)
7407 return ret;
7408
7409 pending_inc.crush.clear();
7410 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7411 *ss << "renamed bucket " << srcname << " into " << dstname;
7412 return 0;
7413 }
7414
7415 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7416 {
7417 string replacement = "";
7418
7419 if (plugin == "jerasure_generic" ||
7420 plugin == "jerasure_sse3" ||
7421 plugin == "jerasure_sse4" ||
7422 plugin == "jerasure_neon") {
7423 replacement = "jerasure";
7424 } else if (plugin == "shec_generic" ||
7425 plugin == "shec_sse3" ||
7426 plugin == "shec_sse4" ||
7427 plugin == "shec_neon") {
7428 replacement = "shec";
7429 }
7430
7431 if (replacement != "") {
7432 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7433 << plugin << " that has been deprecated. Please use "
7434 << replacement << " instead." << dendl;
7435 }
7436 }
7437
7438 int OSDMonitor::normalize_profile(const string& profilename,
7439 ErasureCodeProfile &profile,
7440 bool force,
7441 ostream *ss)
7442 {
7443 ErasureCodeInterfaceRef erasure_code;
7444 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7445 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7446 check_legacy_ec_plugin(plugin->second, profilename);
7447 int err = instance.factory(plugin->second,
7448 g_conf().get_val<std::string>("erasure_code_dir"),
7449 profile, &erasure_code, ss);
7450 if (err) {
7451 return err;
7452 }
7453
7454 err = erasure_code->init(profile, ss);
7455 if (err) {
7456 return err;
7457 }
7458
7459 auto it = profile.find("stripe_unit");
7460 if (it != profile.end()) {
7461 string err_str;
7462 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7463 if (!err_str.empty()) {
7464 *ss << "could not parse stripe_unit '" << it->second
7465 << "': " << err_str << std::endl;
7466 return -EINVAL;
7467 }
7468 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7469 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7470 if (chunk_size != stripe_unit) {
7471 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7472 << "alignment. Would be padded to " << chunk_size
7473 << std::endl;
7474 return -EINVAL;
7475 }
7476 if ((stripe_unit % 4096) != 0 && !force) {
7477 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7478 << "use --force to override this check" << std::endl;
7479 return -EINVAL;
7480 }
7481 }
7482 return 0;
7483 }
7484
7485 int OSDMonitor::crush_rule_create_erasure(const string &name,
7486 const string &profile,
7487 int *rule,
7488 ostream *ss)
7489 {
7490 int ruleid = osdmap.crush->get_rule_id(name);
7491 if (ruleid != -ENOENT) {
7492 *rule = ruleid;
7493 return -EEXIST;
7494 }
7495
7496 CrushWrapper newcrush = _get_pending_crush();
7497
7498 ruleid = newcrush.get_rule_id(name);
7499 if (ruleid != -ENOENT) {
7500 *rule = ruleid;
7501 return -EALREADY;
7502 } else {
7503 ErasureCodeInterfaceRef erasure_code;
7504 int err = get_erasure_code(profile, &erasure_code, ss);
7505 if (err) {
7506 *ss << "failed to load plugin using profile " << profile << std::endl;
7507 return err;
7508 }
7509
7510 err = erasure_code->create_rule(name, newcrush, ss);
7511 erasure_code.reset();
7512 if (err < 0)
7513 return err;
7514 *rule = err;
7515 pending_inc.crush.clear();
7516 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7517 return 0;
7518 }
7519 }
7520
7521 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7522 ErasureCodeInterfaceRef *erasure_code,
7523 ostream *ss) const
7524 {
7525 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7526 return -EAGAIN;
7527 ErasureCodeProfile profile =
7528 osdmap.get_erasure_code_profile(erasure_code_profile);
7529 ErasureCodeProfile::const_iterator plugin =
7530 profile.find("plugin");
7531 if (plugin == profile.end()) {
7532 *ss << "cannot determine the erasure code plugin"
7533 << " because there is no 'plugin' entry in the erasure_code_profile "
7534 << profile << std::endl;
7535 return -EINVAL;
7536 }
7537 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7538 auto& instance = ErasureCodePluginRegistry::instance();
7539 return instance.factory(plugin->second,
7540 g_conf().get_val<std::string>("erasure_code_dir"),
7541 profile, erasure_code, ss);
7542 }
7543
7544 int OSDMonitor::check_cluster_features(uint64_t features,
7545 stringstream &ss)
7546 {
7547 stringstream unsupported_ss;
7548 int unsupported_count = 0;
7549 if ((mon.get_quorum_con_features() & features) != features) {
7550 unsupported_ss << "the monitor cluster";
7551 ++unsupported_count;
7552 }
7553
7554 set<int32_t> up_osds;
7555 osdmap.get_up_osds(up_osds);
7556 for (set<int32_t>::iterator it = up_osds.begin();
7557 it != up_osds.end(); ++it) {
7558 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7559 if ((xi.features & features) != features) {
7560 if (unsupported_count > 0)
7561 unsupported_ss << ", ";
7562 unsupported_ss << "osd." << *it;
7563 unsupported_count ++;
7564 }
7565 }
7566
7567 if (unsupported_count > 0) {
7568 ss << "features " << features << " unsupported by: "
7569 << unsupported_ss.str();
7570 return -ENOTSUP;
7571 }
7572
7573 // check pending osd state, too!
7574 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7575 pending_inc.new_xinfo.begin();
7576 p != pending_inc.new_xinfo.end(); ++p) {
7577 const osd_xinfo_t &xi = p->second;
7578 if ((xi.features & features) != features) {
7579 dout(10) << __func__ << " pending osd." << p->first
7580 << " features are insufficient; retry" << dendl;
7581 return -EAGAIN;
7582 }
7583 }
7584
7585 return 0;
7586 }
7587
7588 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7589 stringstream& ss)
7590 {
7591 OSDMap::Incremental new_pending = pending_inc;
7592 encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
7593 OSDMap newmap;
7594 newmap.deepish_copy_from(osdmap);
7595 newmap.apply_incremental(new_pending);
7596
7597 // client compat
7598 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7599 auto mv = newmap.get_min_compat_client();
7600 if (mv > newmap.require_min_compat_client) {
7601 ss << "new crush map requires client version " << mv
7602 << " but require_min_compat_client is "
7603 << newmap.require_min_compat_client;
7604 return false;
7605 }
7606 }
7607
7608 // osd compat
7609 uint64_t features =
7610 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7611 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7612 stringstream features_ss;
7613 int r = check_cluster_features(features, features_ss);
7614 if (r) {
7615 ss << "Could not change CRUSH: " << features_ss.str();
7616 return false;
7617 }
7618
7619 return true;
7620 }
7621
7622 bool OSDMonitor::erasure_code_profile_in_use(
7623 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7624 const string &profile,
7625 ostream *ss)
7626 {
7627 bool found = false;
7628 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7629 p != pools.end();
7630 ++p) {
7631 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7632 *ss << osdmap.pool_name[p->first] << " ";
7633 found = true;
7634 }
7635 }
7636 if (found) {
7637 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7638 }
7639 return found;
7640 }
7641
7642 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7643 map<string,string> *erasure_code_profile_map,
7644 ostream *ss)
7645 {
7646 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7647 get_json_str_map,
7648 *ss,
7649 erasure_code_profile_map,
7650 true);
7651 if (r)
7652 return r;
7653 ceph_assert((*erasure_code_profile_map).count("plugin"));
7654 string default_plugin = (*erasure_code_profile_map)["plugin"];
7655 map<string,string> user_map;
7656 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7657 i != erasure_code_profile.end();
7658 ++i) {
7659 size_t equal = i->find('=');
7660 if (equal == string::npos) {
7661 user_map[*i] = string();
7662 (*erasure_code_profile_map)[*i] = string();
7663 } else {
7664 const string key = i->substr(0, equal);
7665 equal++;
7666 const string value = i->substr(equal);
7667 if (key.find("ruleset-") == 0) {
7668 *ss << "property '" << key << "' is no longer supported; try "
7669 << "'crush-" << key.substr(8) << "' instead";
7670 return -EINVAL;
7671 }
7672 user_map[key] = value;
7673 (*erasure_code_profile_map)[key] = value;
7674 }
7675 }
7676
7677 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7678 (*erasure_code_profile_map) = user_map;
7679
7680 return 0;
7681 }
7682
7683 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7684 const string &erasure_code_profile,
7685 uint8_t repl_size,
7686 unsigned *size, unsigned *min_size,
7687 ostream *ss)
7688 {
7689 int err = 0;
7690 bool set_min_size = false;
7691 switch (pool_type) {
7692 case pg_pool_t::TYPE_REPLICATED:
7693 if (osdmap.stretch_mode_enabled) {
7694 if (repl_size == 0)
7695 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7696 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7697 *ss << "prepare_pool_size: we are in stretch mode but size "
7698 << repl_size << " does not match!";
7699 return -EINVAL;
7700 }
7701 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7702 set_min_size = true;
7703 }
7704 if (repl_size == 0) {
7705 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7706 }
7707 *size = repl_size;
7708 if (!set_min_size)
7709 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7710 break;
7711 case pg_pool_t::TYPE_ERASURE:
7712 {
7713 if (osdmap.stretch_mode_enabled) {
7714 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7715 return -EINVAL;
7716 }
7717 ErasureCodeInterfaceRef erasure_code;
7718 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7719 if (err == 0) {
7720 *size = erasure_code->get_chunk_count();
7721 *min_size =
7722 erasure_code->get_data_chunk_count() +
7723 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7724 assert(*min_size <= *size);
7725 assert(*min_size >= erasure_code->get_data_chunk_count());
7726 }
7727 }
7728 break;
7729 default:
7730 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7731 err = -EINVAL;
7732 break;
7733 }
7734 return err;
7735 }
7736
// Compute the stripe width for a new pool.  Replicated pools have no
// stripe width (left untouched).  For erasure pools it is
// data_chunks * chunk_size, where the stripe unit comes from the
// profile's "stripe_unit" entry if present, else from the
// osd_pool_erasure_code_stripe_unit option.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile name (erasure pools only)
// @param stripe_width          [out] computed stripe width (erasure only)
// @param ss                    [out] error messages
// @return 0 on success, -EINVAL for unknown pool types, or the error
//         from get_erasure_code()
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      // profile override of the configured default stripe unit;
      // normalize_profile() already validated it, hence the assert
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	ceph_assert(err_str.empty());
      }
      // let the plugin round the per-chunk size to its alignment
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7775
7776 int OSDMonitor::get_replicated_stretch_crush_rule()
7777 {
7778 /* we don't write down the stretch rule anywhere, so
7779 * we have to guess it. How? Look at all the pools
7780 * and count up how many times a given rule is used
7781 * on stretch pools and then return the one with
7782 * the most users!
7783 */
7784 map<int,int> rule_counts;
7785 for (const auto& pooli : osdmap.pools) {
7786 const pg_pool_t& p = pooli.second;
7787 if (p.is_replicated() && p.is_stretch_pool()) {
7788 if (!rule_counts.count(p.crush_rule)) {
7789 rule_counts[p.crush_rule] = 1;
7790 } else {
7791 ++rule_counts[p.crush_rule];
7792 }
7793 }
7794 }
7795
7796 if (rule_counts.empty()) {
7797 return -ENOENT;
7798 }
7799
7800 int most_used_count = 0;
7801 int most_used_rule = -1;
7802 for (auto i : rule_counts) {
7803 if (i.second > most_used_count) {
7804 most_used_rule = i.first;
7805 most_used_count = i.second;
7806 }
7807 }
7808 ceph_assert(most_used_count > 0);
7809 ceph_assert(most_used_rule >= 0);
7810 return most_used_rule;
7811 }
7812
// Resolve the crush rule for a new pool.
//
// If *crush_rule is already >= 0 it is only validated for existence.
// Otherwise: replicated pools fall back to the named rule, the stretch
// rule (in stretch mode), or the configured default; erasure pools
// create (or find) a rule derived from the erasure-code profile.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile name (erasure pools only)
// @param rule_name             requested rule name, "" = default
// @param crush_rule            [in,out] rule id; <0 on input = resolve it
// @param ss                    [out] error messages
// @return 0 on success; -ENOENT/-EINVAL on failure; -EAGAIN when a newly
//         staged erasure rule must be proposed before the caller retries
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // stretch mode: infer the rule from existing stretch pools
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// remap create_erasure's result: a rule that exists in the
	// committed map (-EEXIST) is usable now (0); a rule just staged
	// (0) or pending (-EALREADY) requires a retry after the
	// proposal commits (-EAGAIN)
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // caller supplied an explicit rule id; just validate it
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7877
7878 int OSDMonitor::get_crush_rule(const string &rule_name,
7879 int *crush_rule,
7880 ostream *ss)
7881 {
7882 int ret;
7883 ret = osdmap.crush->get_rule_id(rule_name);
7884 if (ret != -ENOENT) {
7885 // found it, use it
7886 *crush_rule = ret;
7887 } else {
7888 CrushWrapper newcrush = _get_pending_crush();
7889
7890 ret = newcrush.get_rule_id(rule_name);
7891 if (ret != -ENOENT) {
7892 // found it, wait for it to be proposed
7893 dout(20) << __func__ << ": rule " << rule_name
7894 << " try again" << dendl;
7895 return -EAGAIN;
7896 } else {
7897 // Cannot find it , return error
7898 *ss << "specified rule " << rule_name << " doesn't exist";
7899 return ret;
7900 }
7901 }
7902 return 0;
7903 }
7904
// Check whether creating a pool (pool < 0) or changing an existing
// pool's pg_num would push the cluster over
// mon_max_pg_per_osd * num_osds total PG instances.
//
// When the PG mapping is current we count actual acting PGs on the OSDs
// reachable from the rule's crush roots; otherwise we fall back to
// summing every pool's pg_num_target * size.
//
// @param pool        pool id being changed, or <0 for a new pool
// @param pg_num      proposed pg_num
// @param size        proposed pool size (replica/chunk count)
// @param crush_rule  crush rule the pool (will) use
// @param ss          [out] error message on rejection
// @return 0 if acceptable, -ERANGE if the projected total exceeds the cap
int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, int crush_rule, ostream *ss)
{
  auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
  uint64_t projected = 0;
  unsigned osd_num = 0;
  // assume min cluster size 3
  auto num_osds = std::max(osdmap.get_num_in_osds(), 3u);
  if (pool < 0) {
    // a new pool
    projected += pg_num * size;
  }
  if (mapping.get_epoch() >= osdmap.get_epoch()) {
    // mapping is up to date: count real acting PGs on the OSDs under the
    // rule's crush take roots
    set<int> roots;
    CrushWrapper newcrush = _get_pending_crush();
    newcrush.find_takes_by_rule(crush_rule, &roots);
    int max_osd = osdmap.get_max_osd();
    for (auto root : roots) {
      const char *rootname = newcrush.get_item_name(root);
      set<int> osd_ids;
      newcrush.get_leaves(rootname, &osd_ids);
      unsigned out_osd = 0;
      for (auto id : osd_ids) {
	if (id > max_osd) {
	  // crush leaf beyond the osdmap's max osd id; not a usable OSD
	  out_osd++;
	  continue;
	}
	projected += mapping.get_osd_acting_pgs(id).size();
      }
      osd_num += osd_ids.size() - out_osd;
    }
    if (pool >= 0) {
      // update an existing pool's pg num
      const auto& pg_info = osdmap.get_pools().at(pool);
      // the loop above already counted this pool's current PGs, so add
      // the proposed count and subtract the current target to get the
      // net projection
      projected += pg_num * size;
      projected -= pg_info.get_pg_num_target() * pg_info.get_size();
    }
    num_osds = std::max(osd_num, 3u); // assume min cluster size 3
  } else {
    // use pg_num target for evaluating the projected pg num
    for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
      if (pool_id == pool) {
	projected += pg_num * size;
      } else {
	projected += pool_info.get_pg_num_target() * pool_info.get_size();
      }
    }
  }
  auto max_pgs = max_pgs_per_osd * num_osds;
  if (projected > max_pgs) {
    if (pool >= 0) {
      *ss << "pool id " << pool;
    }
    *ss << " pg_num " << pg_num << " size " << size
	<< " would mean " << projected
	<< " total pgs, which exceeds max " << max_pgs
	<< " (mon_max_pg_per_osd " << max_pgs_per_osd
	<< " * num_in_osds " << num_osds << ")";
    return -ERANGE;
  }
  return 0;
}
7968
7969 /**
7970 * @param name The name of the new pool
7971 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
7973 * @param pg_num The pg_num to use. If set to 0, will use the system default
7974 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7975 * @param pg_num_min min pg_num
7976 * @param pg_num_max max pg_num
7977 * @param repl_size Replication factor, or 0 for default
7978 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7979 * @param pool_type TYPE_ERASURE, or TYPE_REP
7980 * @param expected_num_objects expected number of objects on the pool
7981 * @param fast_read fast read type.
7982 * @param ss human readable error message, if any.
7983 *
7984 * @return 0 on success, negative errno on failure.
7985 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 bool bulk,
				 ostream *ss)
{
  // A pool must have a name.
  if (name.length() == 0)
    return -EINVAL;
  // Fill in a default pg_num: 1 when the autoscaler is "on" (it will
  // grow the pool itself), otherwise osd_pool_default_pg_num.
  if (pg_num == 0) {
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
	return mode == "on" ? 1 : pg_num;
      };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  // Default pgp_num from config, falling back to pg_num when the
  // configured default is itself 0.
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  // Sanity-check pg_num / pgp_num against the configured ceiling.
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read only makes sense for erasure-coded pools.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // Resolve (or create) the crush rule for this pool type.
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Derive size/min_size from repl_size or the EC profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Optional smoke test: verify the rule can actually map `size`
  // replicas for a sample of inputs before committing to it.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    // fork so a crush bug cannot take down the monitor; bounded by the
    // mon lease so we do not stall the quorum
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
	     << duration << dendl;
  }
  // Reject the pool if it would push the cluster past
  // mon_max_pg_per_osd (-1 == "a new pool").
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  // The rule's type (replicated vs erasure) must match the pool type.
  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Translate the tri-state fast_read request into a concrete bool
  // (only meaningful for EC pools).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // If a pool of this name is already pending creation, treat the
  // request as an idempotent success.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate the next pool id and a fresh pg_pool_t in the pending
  // incremental.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  // Seed flags from config defaults, then apply the per-flag defaults.
  pi->flags = g_conf()->osd_pool_default_flags;
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // CREATING is cleared once the initial PGs exist.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // Inherit stretch-mode peering constraints from the osdmap, halving
  // size/min_size when the cluster is in degraded stretch mode.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Autoscale mode: start from the configured default (OFF if the
  // default is unparseable); the caller's explicit mode overrides it
  // further below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Actual initial pg_num is capped by mon_osd_max_initial_pgs; the
  // full request is recorded as the target for the mgr to grow into.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  // PG_NUM_MIN/PG_NUM_MAX opts require sufficiently new OSDs.
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  // Caller-supplied autoscale mode wins over the configured default.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (ratios stored in micro-units).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  // Record the name; this makes the creation visible to the dup check
  // above for any retried request.
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8223
8224 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8225 {
8226 op->mark_osdmon_event(__func__);
8227 ostringstream ss;
8228 if (pending_inc.new_flags < 0)
8229 pending_inc.new_flags = osdmap.get_flags();
8230 pending_inc.new_flags |= flag;
8231 ss << OSDMap::get_flag_string(flag) << " is set";
8232 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8233 get_last_committed() + 1));
8234 return true;
8235 }
8236
8237 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8238 {
8239 op->mark_osdmon_event(__func__);
8240 ostringstream ss;
8241 if (pending_inc.new_flags < 0)
8242 pending_inc.new_flags = osdmap.get_flags();
8243 pending_inc.new_flags &= ~flag;
8244 ss << OSDMap::get_flag_string(flag) << " is unset";
8245 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8246 get_last_committed() + 1));
8247 return true;
8248 }
8249
8250 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8251 stringstream& ss)
8252 {
8253 string poolstr;
8254 cmd_getval(cmdmap, "pool", poolstr);
8255 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8256 if (pool < 0) {
8257 ss << "unrecognized pool '" << poolstr << "'";
8258 return -ENOENT;
8259 }
8260 string var;
8261 cmd_getval(cmdmap, "var", var);
8262
8263 pg_pool_t p = *osdmap.get_pg_pool(pool);
8264 if (pending_inc.new_pools.count(pool))
8265 p = pending_inc.new_pools[pool];
8266
8267 // accept val as a json string in the normal case (current
8268 // generation monitor). parse out int or float values from the
8269 // string as needed. however, if it is not a string, try to pull
8270 // out an int, in case an older monitor with an older json schema is
8271 // forwarding a request.
8272 string val;
8273 string interr, floaterr;
8274 int64_t n = 0;
8275 double f = 0;
8276 int64_t uf = 0; // micro-f
8277 cmd_getval(cmdmap, "val", val);
8278
8279 auto si_options = {
8280 "target_max_objects"
8281 };
8282 auto iec_options = {
8283 "target_max_bytes",
8284 "target_size_bytes",
8285 "compression_max_blob_size",
8286 "compression_min_blob_size",
8287 "csum_max_block",
8288 "csum_min_block",
8289 };
8290 if (count(begin(si_options), end(si_options), var)) {
8291 n = strict_si_cast<int64_t>(val, &interr);
8292 } else if (count(begin(iec_options), end(iec_options), var)) {
8293 n = strict_iec_cast<int64_t>(val, &interr);
8294 } else {
8295 // parse string as both int and float; different fields use different types.
8296 n = strict_strtoll(val.c_str(), 10, &interr);
8297 f = strict_strtod(val.c_str(), &floaterr);
8298 uf = llrintl(f * (double)1000000.0);
8299 }
8300
8301 if (!p.is_tier() &&
8302 (var == "hit_set_type" || var == "hit_set_period" ||
8303 var == "hit_set_count" || var == "hit_set_fpp" ||
8304 var == "target_max_objects" || var == "target_max_bytes" ||
8305 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8306 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8307 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8308 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8309 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8310 return -EACCES;
8311 }
8312
8313 if (var == "size") {
8314 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8315 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8316 return -EPERM;
8317 }
8318 if (p.type == pg_pool_t::TYPE_ERASURE) {
8319 ss << "can not change the size of an erasure-coded pool";
8320 return -ENOTSUP;
8321 }
8322 if (interr.length()) {
8323 ss << "error parsing integer value '" << val << "': " << interr;
8324 return -EINVAL;
8325 }
8326 if (n <= 0 || n > 10) {
8327 ss << "pool size must be between 1 and 10";
8328 return -EINVAL;
8329 }
8330 if (n == 1) {
8331 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8332 ss << "configuring pool size as 1 is disabled by default.";
8333 return -EPERM;
8334 }
8335 bool sure = false;
8336 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8337 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8338 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8339 "pass the flag --yes-i-really-mean-it.";
8340 return -EPERM;
8341 }
8342 }
8343 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8344 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8345 return -EINVAL;
8346 }
8347 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8348 if (r < 0) {
8349 return r;
8350 }
8351 p.size = n;
8352 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8353 } else if (var == "min_size") {
8354 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8355 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8356 return -EPERM;
8357 }
8358 if (interr.length()) {
8359 ss << "error parsing integer value '" << val << "': " << interr;
8360 return -EINVAL;
8361 }
8362
8363 if (p.type != pg_pool_t::TYPE_ERASURE) {
8364 if (n < 1 || n > p.size) {
8365 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8366 return -EINVAL;
8367 }
8368 } else {
8369 ErasureCodeInterfaceRef erasure_code;
8370 int k;
8371 stringstream tmp;
8372 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8373 if (err == 0) {
8374 k = erasure_code->get_data_chunk_count();
8375 } else {
8376 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8377 return err;
8378 }
8379
8380 if (n < k || n > p.size) {
8381 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8382 return -EINVAL;
8383 }
8384 }
8385 p.min_size = n;
8386 } else if (var == "pg_num_actual") {
8387 if (interr.length()) {
8388 ss << "error parsing integer value '" << val << "': " << interr;
8389 return -EINVAL;
8390 }
8391 if (n == (int)p.get_pg_num()) {
8392 return 0;
8393 }
8394 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8395 ss << "'pg_num' must be greater than 0 and less than or equal to "
8396 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8397 << " (you may adjust 'mon max pool pg num' for higher values)";
8398 return -ERANGE;
8399 }
8400 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8401 ss << "cannot adjust pg_num while initial PGs are being created";
8402 return -EBUSY;
8403 }
8404 if (n > (int)p.get_pg_num()) {
8405 if (p.get_pg_num() != p.get_pg_num_pending()) {
8406 // force pre-nautilus clients to resend their ops, since they
8407 // don't understand pg_num_pending changes form a new interval
8408 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8409 }
8410 p.set_pg_num(n);
8411 } else {
8412 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8413 ss << "nautilus OSDs are required to adjust pg_num_pending";
8414 return -EPERM;
8415 }
8416 if (n < (int)p.get_pgp_num()) {
8417 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8418 return -EINVAL;
8419 }
8420 if (n < (int)p.get_pg_num() - 1) {
8421 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8422 << ") - 1; only single pg decrease is currently supported";
8423 return -EINVAL;
8424 }
8425 p.set_pg_num_pending(n);
8426 // force pre-nautilus clients to resend their ops, since they
8427 // don't understand pg_num_pending changes form a new interval
8428 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8429 }
8430 // force pre-luminous clients to resend their ops, since they
8431 // don't understand that split PGs now form a new interval.
8432 p.last_force_op_resend_preluminous = pending_inc.epoch;
8433 } else if (var == "pg_num") {
8434 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8435 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8436 return -EPERM;
8437 }
8438 if (interr.length()) {
8439 ss << "error parsing integer value '" << val << "': " << interr;
8440 return -EINVAL;
8441 }
8442 if (n == (int)p.get_pg_num_target()) {
8443 return 0;
8444 }
8445 if (n <= 0 || static_cast<uint64_t>(n) >
8446 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8447 ss << "'pg_num' must be greater than 0 and less than or equal to "
8448 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8449 << " (you may adjust 'mon max pool pg num' for higher values)";
8450 return -ERANGE;
8451 }
8452 if (n > (int)p.get_pg_num_target()) {
8453 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8454 if (r) {
8455 return r;
8456 }
8457 bool force = false;
8458 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8459 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8460 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8461 return -EPERM;
8462 }
8463 } else {
8464 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8465 ss << "nautilus OSDs are required to decrease pg_num";
8466 return -EPERM;
8467 }
8468 }
8469 int64_t pg_min = 0, pg_max = 0;
8470 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8471 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8472 if (pg_min && n < pg_min) {
8473 ss << "specified pg_num " << n
8474 << " < pg_num_min " << pg_min;
8475 return -EINVAL;
8476 }
8477 if (pg_max && n > pg_max) {
8478 ss << "specified pg_num " << n
8479 << " < pg_num_max " << pg_max;
8480 return -EINVAL;
8481 }
8482 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8483 // pre-nautilus osdmap format; increase pg_num directly
8484 assert(n > (int)p.get_pg_num());
8485 // force pre-nautilus clients to resend their ops, since they
8486 // don't understand pg_num_target changes form a new interval
8487 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8488 // force pre-luminous clients to resend their ops, since they
8489 // don't understand that split PGs now form a new interval.
8490 p.last_force_op_resend_preluminous = pending_inc.epoch;
8491 p.set_pg_num(n);
8492 } else {
8493 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8494 // make pgp_num track pg_num if it already matches. if it is set
8495 // differently, leave it different and let the user control it
8496 // manually.
8497 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8498 p.set_pgp_num_target(n);
8499 }
8500 p.set_pg_num_target(n);
8501 }
8502 } else if (var == "pgp_num_actual") {
8503 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8504 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8505 return -EPERM;
8506 }
8507 if (interr.length()) {
8508 ss << "error parsing integer value '" << val << "': " << interr;
8509 return -EINVAL;
8510 }
8511 if (n <= 0) {
8512 ss << "specified pgp_num must > 0, but you set to " << n;
8513 return -EINVAL;
8514 }
8515 if (n > (int)p.get_pg_num()) {
8516 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8517 return -EINVAL;
8518 }
8519 if (n > (int)p.get_pg_num_pending()) {
8520 ss << "specified pgp_num " << n
8521 << " > pg_num_pending " << p.get_pg_num_pending();
8522 return -EINVAL;
8523 }
8524 p.set_pgp_num(n);
8525 } else if (var == "pgp_num") {
8526 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8527 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8528 return -EPERM;
8529 }
8530 if (interr.length()) {
8531 ss << "error parsing integer value '" << val << "': " << interr;
8532 return -EINVAL;
8533 }
8534 if (n <= 0) {
8535 ss << "specified pgp_num must > 0, but you set to " << n;
8536 return -EINVAL;
8537 }
8538 if (n > (int)p.get_pg_num_target()) {
8539 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8540 return -EINVAL;
8541 }
8542 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8543 // pre-nautilus osdmap format; increase pgp_num directly
8544 p.set_pgp_num(n);
8545 } else {
8546 p.set_pgp_num_target(n);
8547 }
8548 } else if (var == "pg_autoscale_mode") {
8549 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8550 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8551 ss << "specified invalid mode " << val;
8552 return -EINVAL;
8553 }
8554 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8555 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8556 return -EINVAL;
8557 }
8558 p.pg_autoscale_mode = m;
8559 } else if (var == "crush_rule") {
8560 int id = osdmap.crush->get_rule_id(val);
8561 if (id == -ENOENT) {
8562 ss << "crush rule " << val << " does not exist";
8563 return -ENOENT;
8564 }
8565 if (id < 0) {
8566 ss << cpp_strerror(id);
8567 return -ENOENT;
8568 }
8569 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8570 ss << "crush rule " << id << " type does not match pool";
8571 return -EINVAL;
8572 }
8573 p.crush_rule = id;
8574 } else if (var == "nodelete" || var == "nopgchange" ||
8575 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8576 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8577 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8578 // make sure we only compare against 'n' if we didn't receive a string
8579 if (val == "true" || (interr.empty() && n == 1)) {
8580 p.set_flag(flag);
8581 } else if (val == "false" || (interr.empty() && n == 0)) {
8582 p.unset_flag(flag);
8583 } else {
8584 ss << "expecting value 'true', 'false', '0', or '1'";
8585 return -EINVAL;
8586 }
8587 } else if (var == "eio") {
8588 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8589
8590 // make sure we only compare against 'n' if we didn't receive a string
8591 if (val == "true" || (interr.empty() && n == 1)) {
8592 p.set_flag(flag);
8593 } else if (val == "false" || (interr.empty() && n == 0)) {
8594 p.unset_flag(flag);
8595 } else {
8596 ss << "expecting value 'true', 'false', '0', or '1'";
8597 return -EINVAL;
8598 }
8599 } else if (var == "hashpspool") {
8600 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8601 bool force = false;
8602 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8603
8604 if (!force) {
8605 ss << "are you SURE? this will remap all placement groups in this pool,"
8606 " this triggers large data movement,"
8607 " pass --yes-i-really-mean-it if you really do.";
8608 return -EPERM;
8609 }
8610 // make sure we only compare against 'n' if we didn't receive a string
8611 if (val == "true" || (interr.empty() && n == 1)) {
8612 p.set_flag(flag);
8613 } else if (val == "false" || (interr.empty() && n == 0)) {
8614 p.unset_flag(flag);
8615 } else {
8616 ss << "expecting value 'true', 'false', '0', or '1'";
8617 return -EINVAL;
8618 }
8619 } else if (var == "hit_set_type") {
8620 if (val == "none")
8621 p.hit_set_params = HitSet::Params();
8622 else {
8623 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8624 if (err)
8625 return err;
8626 if (val == "bloom") {
8627 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8628 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8629 p.hit_set_params = HitSet::Params(bsp);
8630 } else if (val == "explicit_hash")
8631 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8632 else if (val == "explicit_object")
8633 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8634 else {
8635 ss << "unrecognized hit_set type '" << val << "'";
8636 return -EINVAL;
8637 }
8638 }
8639 } else if (var == "hit_set_period") {
8640 if (interr.length()) {
8641 ss << "error parsing integer value '" << val << "': " << interr;
8642 return -EINVAL;
8643 } else if (n < 0) {
8644 ss << "hit_set_period should be non-negative";
8645 return -EINVAL;
8646 }
8647 p.hit_set_period = n;
8648 } else if (var == "hit_set_count") {
8649 if (interr.length()) {
8650 ss << "error parsing integer value '" << val << "': " << interr;
8651 return -EINVAL;
8652 } else if (n < 0) {
8653 ss << "hit_set_count should be non-negative";
8654 return -EINVAL;
8655 }
8656 p.hit_set_count = n;
8657 } else if (var == "hit_set_fpp") {
8658 if (floaterr.length()) {
8659 ss << "error parsing floating point value '" << val << "': " << floaterr;
8660 return -EINVAL;
8661 } else if (f < 0 || f > 1.0) {
8662 ss << "hit_set_fpp should be in the range 0..1";
8663 return -EINVAL;
8664 }
8665 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8666 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8667 return -EINVAL;
8668 }
8669 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8670 bloomp->set_fpp(f);
8671 } else if (var == "use_gmt_hitset") {
8672 if (val == "true" || (interr.empty() && n == 1)) {
8673 p.use_gmt_hitset = true;
8674 } else {
8675 ss << "expecting value 'true' or '1'";
8676 return -EINVAL;
8677 }
8678 } else if (var == "allow_ec_overwrites") {
8679 if (!p.is_erasure()) {
8680 ss << "ec overwrites can only be enabled for an erasure coded pool";
8681 return -EINVAL;
8682 }
8683 stringstream err;
8684 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8685 !is_pool_currently_all_bluestore(pool, p, &err)) {
8686 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8687 return -EINVAL;
8688 }
8689 if (val == "true" || (interr.empty() && n == 1)) {
8690 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8691 } else if (val == "false" || (interr.empty() && n == 0)) {
8692 ss << "ec overwrites cannot be disabled once enabled";
8693 return -EINVAL;
8694 } else {
8695 ss << "expecting value 'true', 'false', '0', or '1'";
8696 return -EINVAL;
8697 }
8698 } else if (var == "target_max_objects") {
8699 if (interr.length()) {
8700 ss << "error parsing int '" << val << "': " << interr;
8701 return -EINVAL;
8702 }
8703 p.target_max_objects = n;
8704 } else if (var == "target_max_bytes") {
8705 if (interr.length()) {
8706 ss << "error parsing int '" << val << "': " << interr;
8707 return -EINVAL;
8708 }
8709 p.target_max_bytes = n;
8710 } else if (var == "cache_target_dirty_ratio") {
8711 if (floaterr.length()) {
8712 ss << "error parsing float '" << val << "': " << floaterr;
8713 return -EINVAL;
8714 }
8715 if (f < 0 || f > 1.0) {
8716 ss << "value must be in the range 0..1";
8717 return -ERANGE;
8718 }
8719 p.cache_target_dirty_ratio_micro = uf;
8720 } else if (var == "cache_target_dirty_high_ratio") {
8721 if (floaterr.length()) {
8722 ss << "error parsing float '" << val << "': " << floaterr;
8723 return -EINVAL;
8724 }
8725 if (f < 0 || f > 1.0) {
8726 ss << "value must be in the range 0..1";
8727 return -ERANGE;
8728 }
8729 p.cache_target_dirty_high_ratio_micro = uf;
8730 } else if (var == "cache_target_full_ratio") {
8731 if (floaterr.length()) {
8732 ss << "error parsing float '" << val << "': " << floaterr;
8733 return -EINVAL;
8734 }
8735 if (f < 0 || f > 1.0) {
8736 ss << "value must be in the range 0..1";
8737 return -ERANGE;
8738 }
8739 p.cache_target_full_ratio_micro = uf;
8740 } else if (var == "cache_min_flush_age") {
8741 if (interr.length()) {
8742 ss << "error parsing int '" << val << "': " << interr;
8743 return -EINVAL;
8744 }
8745 p.cache_min_flush_age = n;
8746 } else if (var == "cache_min_evict_age") {
8747 if (interr.length()) {
8748 ss << "error parsing int '" << val << "': " << interr;
8749 return -EINVAL;
8750 }
8751 p.cache_min_evict_age = n;
8752 } else if (var == "min_read_recency_for_promote") {
8753 if (interr.length()) {
8754 ss << "error parsing integer value '" << val << "': " << interr;
8755 return -EINVAL;
8756 }
8757 p.min_read_recency_for_promote = n;
8758 } else if (var == "hit_set_grade_decay_rate") {
8759 if (interr.length()) {
8760 ss << "error parsing integer value '" << val << "': " << interr;
8761 return -EINVAL;
8762 }
8763 if (n > 100 || n < 0) {
8764 ss << "value out of range,valid range is 0 - 100";
8765 return -EINVAL;
8766 }
8767 p.hit_set_grade_decay_rate = n;
8768 } else if (var == "hit_set_search_last_n") {
8769 if (interr.length()) {
8770 ss << "error parsing integer value '" << val << "': " << interr;
8771 return -EINVAL;
8772 }
8773 if (n > p.hit_set_count || n < 0) {
8774 ss << "value out of range,valid range is 0 - hit_set_count";
8775 return -EINVAL;
8776 }
8777 p.hit_set_search_last_n = n;
8778 } else if (var == "min_write_recency_for_promote") {
8779 if (interr.length()) {
8780 ss << "error parsing integer value '" << val << "': " << interr;
8781 return -EINVAL;
8782 }
8783 p.min_write_recency_for_promote = n;
8784 } else if (var == "fast_read") {
8785 if (p.is_replicated()) {
8786 ss << "fast read is not supported in replication pool";
8787 return -EINVAL;
8788 }
8789 if (val == "true" || (interr.empty() && n == 1)) {
8790 p.fast_read = true;
8791 } else if (val == "false" || (interr.empty() && n == 0)) {
8792 p.fast_read = false;
8793 } else {
8794 ss << "expecting value 'true', 'false', '0', or '1'";
8795 return -EINVAL;
8796 }
8797 } else if (pool_opts_t::is_opt_name(var)) {
8798 bool unset = val == "unset";
8799 if (var == "compression_mode") {
8800 if (!unset) {
8801 auto cmode = Compressor::get_comp_mode_type(val);
8802 if (!cmode) {
8803 ss << "unrecognized compression mode '" << val << "'";
8804 return -EINVAL;
8805 }
8806 }
8807 } else if (var == "compression_algorithm") {
8808 if (!unset) {
8809 auto alg = Compressor::get_comp_alg_type(val);
8810 if (!alg) {
8811 ss << "unrecognized compression_algorithm '" << val << "'";
8812 return -EINVAL;
8813 }
8814 }
8815 } else if (var == "compression_required_ratio") {
8816 if (floaterr.length()) {
8817 ss << "error parsing float value '" << val << "': " << floaterr;
8818 return -EINVAL;
8819 }
8820 if (f < 0 || f > 1) {
8821 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8822 return -EINVAL;
8823 }
8824 } else if (var == "csum_type") {
8825 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8826 if (t < 0 ) {
8827 ss << "unrecognized csum_type '" << val << "'";
8828 return -EINVAL;
8829 }
8830 //preserve csum_type numeric value
8831 n = t;
8832 interr.clear();
8833 } else if (var == "compression_max_blob_size" ||
8834 var == "compression_min_blob_size" ||
8835 var == "csum_max_block" ||
8836 var == "csum_min_block") {
8837 if (interr.length()) {
8838 ss << "error parsing int value '" << val << "': " << interr;
8839 return -EINVAL;
8840 }
8841 } else if (var == "fingerprint_algorithm") {
8842 if (!unset) {
8843 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8844 if (!alg) {
8845 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8846 return -EINVAL;
8847 }
8848 }
8849 } else if (var == "target_size_bytes") {
8850 if (interr.length()) {
8851 ss << "error parsing unit value '" << val << "': " << interr;
8852 return -EINVAL;
8853 }
8854 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8855 ss << "must set require_osd_release to nautilus or "
8856 << "later before setting target_size_bytes";
8857 return -EINVAL;
8858 }
8859 } else if (var == "target_size_ratio") {
8860 if (f < 0.0) {
8861 ss << "target_size_ratio cannot be negative";
8862 return -EINVAL;
8863 }
8864 } else if (var == "pg_num_min") {
8865 if (interr.length()) {
8866 ss << "error parsing int value '" << val << "': " << interr;
8867 return -EINVAL;
8868 }
8869 if (n > (int)p.get_pg_num_target()) {
8870 ss << "specified pg_num_min " << n
8871 << " > pg_num " << p.get_pg_num_target();
8872 return -EINVAL;
8873 }
8874 } else if (var == "pg_num_max") {
8875 if (interr.length()) {
8876 ss << "error parsing int value '" << val << "': " << interr;
8877 return -EINVAL;
8878 }
8879 if (n && n < (int)p.get_pg_num_target()) {
8880 ss << "specified pg_num_max " << n
8881 << " < pg_num " << p.get_pg_num_target();
8882 return -EINVAL;
8883 }
8884 } else if (var == "recovery_priority") {
8885 if (interr.length()) {
8886 ss << "error parsing int value '" << val << "': " << interr;
8887 return -EINVAL;
8888 }
8889 if (!g_conf()->debug_allow_any_pool_priority) {
8890 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8891 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8892 << " and " << OSD_POOL_PRIORITY_MAX;
8893 return -EINVAL;
8894 }
8895 }
8896 } else if (var == "pg_autoscale_bias") {
8897 if (f < 0.0 || f > 1000.0) {
8898 ss << "pg_autoscale_bias must be between 0 and 1000";
8899 return -EINVAL;
8900 }
8901 } else if (var == "dedup_tier") {
8902 if (interr.empty()) {
8903 ss << "expecting value 'pool name'";
8904 return -EINVAL;
8905 }
8906 // Current base tier in dedup does not support ec pool
8907 if (p.is_erasure()) {
8908 ss << "pool '" << poolstr
8909 << "' is an ec pool, which cannot be a base tier";
8910 return -ENOTSUP;
8911 }
8912 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8913 if (lowtierpool_id < 0) {
8914 ss << "unrecognized pool '" << val << "'";
8915 return -ENOENT;
8916 }
8917 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8918 ceph_assert(tp);
8919 n = lowtierpool_id;
8920 // The original input is string (pool name), but we convert it to int64_t.
8921 // So, clear interr
8922 interr.clear();
8923 } else if (var == "dedup_chunk_algorithm") {
8924 if (!unset) {
8925 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8926 if (!alg) {
8927 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8928 return -EINVAL;
8929 }
8930 }
8931 } else if (var == "dedup_cdc_chunk_size") {
8932 if (interr.length()) {
8933 ss << "error parsing int value '" << val << "': " << interr;
8934 return -EINVAL;
8935 }
8936 }
8937
8938 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8939 switch (desc.type) {
8940 case pool_opts_t::STR:
8941 if (unset) {
8942 p.opts.unset(desc.key);
8943 } else {
8944 p.opts.set(desc.key, static_cast<std::string>(val));
8945 }
8946 break;
8947 case pool_opts_t::INT:
8948 if (interr.length()) {
8949 ss << "error parsing integer value '" << val << "': " << interr;
8950 return -EINVAL;
8951 }
8952 if (n == 0) {
8953 p.opts.unset(desc.key);
8954 } else {
8955 p.opts.set(desc.key, static_cast<int64_t>(n));
8956 }
8957 break;
8958 case pool_opts_t::DOUBLE:
8959 if (floaterr.length()) {
8960 ss << "error parsing floating point value '" << val << "': " << floaterr;
8961 return -EINVAL;
8962 }
8963 if (f == 0) {
8964 p.opts.unset(desc.key);
8965 } else {
8966 p.opts.set(desc.key, static_cast<double>(f));
8967 }
8968 break;
8969 default:
8970 ceph_assert(!"unknown type");
8971 }
8972 } else {
8973 ss << "unrecognized variable '" << var << "'";
8974 return -EINVAL;
8975 }
8976 if (val != "unset") {
8977 ss << "set pool " << pool << " " << var << " to " << val;
8978 } else {
8979 ss << "unset pool " << pool << " " << var;
8980 }
8981 p.last_change = pending_inc.epoch;
8982 pending_inc.new_pools[pool] = p;
8983 return 0;
8984 }
8985
8986 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8987 const cmdmap_t& cmdmap,
8988 stringstream& ss)
8989 {
8990 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8991 }
8992
8993 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8994 const cmdmap_t& cmdmap,
8995 stringstream& ss,
8996 bool *modified)
8997 {
8998 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8999 }
9000
9001
9002 /**
9003 * Common logic for preprocess and prepare phases of pool application
9004 * tag commands. In preprocess mode we're only detecting invalid
9005 * commands, and determining whether it was a modification or a no-op.
9006 * In prepare mode we're actually updating the pending state.
9007 */
9008 int OSDMonitor::_command_pool_application(const string &prefix,
9009 const cmdmap_t& cmdmap,
9010 stringstream& ss,
9011 bool *modified,
9012 bool preparing)
9013 {
9014 string pool_name;
9015 cmd_getval(cmdmap, "pool", pool_name);
9016 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
9017 if (pool < 0) {
9018 ss << "unrecognized pool '" << pool_name << "'";
9019 return -ENOENT;
9020 }
9021
9022 pg_pool_t p = *osdmap.get_pg_pool(pool);
9023 if (preparing) {
9024 if (pending_inc.new_pools.count(pool)) {
9025 p = pending_inc.new_pools[pool];
9026 }
9027 }
9028
9029 string app;
9030 cmd_getval(cmdmap, "app", app);
9031 bool app_exists = (p.application_metadata.count(app) > 0);
9032
9033 string key;
9034 cmd_getval(cmdmap, "key", key);
9035 if (key == "all") {
9036 ss << "key cannot be 'all'";
9037 return -EINVAL;
9038 }
9039
9040 string value;
9041 cmd_getval(cmdmap, "value", value);
9042 if (value == "all") {
9043 ss << "value cannot be 'all'";
9044 return -EINVAL;
9045 }
9046
9047 if (boost::algorithm::ends_with(prefix, "enable")) {
9048 if (app.empty()) {
9049 ss << "application name must be provided";
9050 return -EINVAL;
9051 }
9052
9053 if (p.is_tier()) {
9054 ss << "application must be enabled on base tier";
9055 return -EINVAL;
9056 }
9057
9058 bool force = false;
9059 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9060
9061 if (!app_exists && !p.application_metadata.empty() && !force) {
9062 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9063 << "application; pass --yes-i-really-mean-it to proceed anyway";
9064 return -EPERM;
9065 }
9066
9067 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9068 ss << "too many enabled applications on pool '" << pool_name << "'; "
9069 << "max " << MAX_POOL_APPLICATIONS;
9070 return -EINVAL;
9071 }
9072
9073 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9074 ss << "application name '" << app << "' too long; max length "
9075 << MAX_POOL_APPLICATION_LENGTH;
9076 return -EINVAL;
9077 }
9078
9079 if (!app_exists) {
9080 p.application_metadata[app] = {};
9081 }
9082 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9083
9084 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9085 bool force = false;
9086 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9087
9088 if (!force) {
9089 ss << "Are you SURE? Disabling an application within a pool might result "
9090 << "in loss of application functionality; pass "
9091 << "--yes-i-really-mean-it to proceed anyway";
9092 return -EPERM;
9093 }
9094
9095 if (!app_exists) {
9096 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9097 << "'";
9098 return 0; // idempotent
9099 }
9100
9101 p.application_metadata.erase(app);
9102 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9103
9104 } else if (boost::algorithm::ends_with(prefix, "set")) {
9105 if (p.is_tier()) {
9106 ss << "application metadata must be set on base tier";
9107 return -EINVAL;
9108 }
9109
9110 if (!app_exists) {
9111 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9112 << "'";
9113 return -ENOENT;
9114 }
9115
9116 string key;
9117 cmd_getval(cmdmap, "key", key);
9118
9119 if (key.empty()) {
9120 ss << "key must be provided";
9121 return -EINVAL;
9122 }
9123
9124 auto &app_keys = p.application_metadata[app];
9125 if (app_keys.count(key) == 0 &&
9126 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9127 ss << "too many keys set for application '" << app << "' on pool '"
9128 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9129 return -EINVAL;
9130 }
9131
9132 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9133 ss << "key '" << app << "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH;
9135 return -EINVAL;
9136 }
9137
9138 string value;
9139 cmd_getval(cmdmap, "value", value);
9140 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9141 ss << "value '" << value << "' too long; max length "
9142 << MAX_POOL_APPLICATION_LENGTH;
9143 return -EINVAL;
9144 }
9145
9146 p.application_metadata[app][key] = value;
9147 ss << "set application '" << app << "' key '" << key << "' to '"
9148 << value << "' on pool '" << pool_name << "'";
9149 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9150 if (!app_exists) {
9151 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9152 << "'";
9153 return -ENOENT;
9154 }
9155
9156 string key;
9157 cmd_getval(cmdmap, "key", key);
9158 auto it = p.application_metadata[app].find(key);
9159 if (it == p.application_metadata[app].end()) {
9160 ss << "application '" << app << "' on pool '" << pool_name
9161 << "' does not have key '" << key << "'";
9162 return 0; // idempotent
9163 }
9164
9165 p.application_metadata[app].erase(it);
9166 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9167 << pool_name << "'";
9168 } else {
9169 ceph_abort();
9170 }
9171
9172 if (preparing) {
9173 p.last_change = pending_inc.epoch;
9174 pending_inc.new_pools[pool] = p;
9175 }
9176
9177 // Because we fell through this far, we didn't hit no-op cases,
9178 // so pool was definitely modified
9179 if (modified != nullptr) {
9180 *modified = true;
9181 }
9182
9183 return 0;
9184 }
9185
9186 int OSDMonitor::_prepare_command_osd_crush_remove(
9187 CrushWrapper &newcrush,
9188 int32_t id,
9189 int32_t ancestor,
9190 bool has_ancestor,
9191 bool unlink_only)
9192 {
9193 int err = 0;
9194
9195 if (has_ancestor) {
9196 err = newcrush.remove_item_under(cct, id, ancestor,
9197 unlink_only);
9198 } else {
9199 err = newcrush.remove_item(cct, id, unlink_only);
9200 }
9201 return err;
9202 }
9203
// Stage `newcrush` as the pending crush map: replace any previously staged
// encoding and re-encode with the feature bits the current quorum supports.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
}
9209
9210 int OSDMonitor::prepare_command_osd_crush_remove(
9211 CrushWrapper &newcrush,
9212 int32_t id,
9213 int32_t ancestor,
9214 bool has_ancestor,
9215 bool unlink_only)
9216 {
9217 int err = _prepare_command_osd_crush_remove(
9218 newcrush, id, ancestor,
9219 has_ancestor, unlink_only);
9220
9221 if (err < 0)
9222 return err;
9223
9224 ceph_assert(err == 0);
9225 do_osd_crush_remove(newcrush);
9226
9227 return 0;
9228 }
9229
9230 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9231 {
9232 if (osdmap.is_up(id)) {
9233 return -EBUSY;
9234 }
9235
9236 pending_inc.new_state[id] = osdmap.get_state(id);
9237 pending_inc.new_uuid[id] = uuid_d();
9238 pending_metadata_rm.insert(id);
9239 pending_metadata.erase(id);
9240
9241 return 0;
9242 }
9243
9244 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9245 {
9246 ceph_assert(existing_id);
9247 *existing_id = -1;
9248
9249 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9250 if (!osdmap.exists(i) &&
9251 pending_inc.new_up_client.count(i) == 0 &&
9252 (pending_inc.new_state.count(i) == 0 ||
9253 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9254 *existing_id = i;
9255 return -1;
9256 }
9257 }
9258
9259 if (pending_inc.new_max_osd < 0) {
9260 return osdmap.get_max_osd();
9261 }
9262 return pending_inc.new_max_osd;
9263 }
9264
/*
 * Record the creation of an osd in the pending incremental.
 *
 * Precondition: validation (validate_osd_create() or equivalent) has
 * already run; invariant violations here assert rather than return errors.
 *
 * @param id            requested id, or -1 to pick one
 * @param uuid          osd uuid; may be zero for legacy `osd create`
 * @param device_class  if non-empty, assign this crush device class
 * @param new_id        out: the id actually created or matched
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // replay: this uuid already maps to an osd; reuse its id
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reuse a free slot below the current max_osd
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // make sure the crush map has a device slot and a name for the new osd
    // before assigning the requested device class
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      // stage the modified crush map
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // grow max_osd if neither the committed map nor the pending value covers
  // the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9354
9355 int OSDMonitor::validate_osd_create(
9356 const int32_t id,
9357 const uuid_d& uuid,
9358 const bool check_osd_exists,
9359 int32_t* existing_id,
9360 stringstream& ss)
9361 {
9362
9363 dout(10) << __func__ << " id " << id << " uuid " << uuid
9364 << " check_osd_exists " << check_osd_exists << dendl;
9365
9366 ceph_assert(existing_id);
9367
9368 if (id < 0 && uuid.is_zero()) {
9369 // we have nothing to validate
9370 *existing_id = -1;
9371 return 0;
9372 } else if (uuid.is_zero()) {
9373 // we have an id but we will ignore it - because that's what
9374 // `osd create` does.
9375 return 0;
9376 }
9377
9378 /*
9379 * This function will be used to validate whether we are able to
9380 * create a new osd when the `uuid` is specified.
9381 *
9382 * It will be used by both `osd create` and `osd new`, as the checks
9383 * are basically the same when it pertains to osd id and uuid validation.
9384 * However, `osd create` presumes an `uuid` is optional, for legacy
9385 * reasons, while `osd new` requires the `uuid` to be provided. This
9386 * means that `osd create` will not be idempotent if an `uuid` is not
9387 * provided, but we will always guarantee the idempotency of `osd new`.
9388 */
9389
9390 ceph_assert(!uuid.is_zero());
9391 if (pending_inc.identify_osd(uuid) >= 0) {
9392 // osd is about to exist
9393 return -EAGAIN;
9394 }
9395
9396 int32_t i = osdmap.identify_osd(uuid);
9397 if (i >= 0) {
9398 // osd already exists
9399 if (id >= 0 && i != id) {
9400 ss << "uuid " << uuid << " already in use for different id " << i;
9401 return -EEXIST;
9402 }
9403 // return a positive errno to distinguish between a blocking error
9404 // and an error we consider to not be a problem (i.e., this would be
9405 // an idempotent operation).
9406 *existing_id = i;
9407 return EEXIST;
9408 }
9409 // i < 0
9410 if (id >= 0) {
9411 if (pending_inc.new_state.count(id)) {
9412 // osd is about to exist
9413 return -EAGAIN;
9414 }
9415 // we may not care if an osd exists if we are recreating a previously
9416 // destroyed osd.
9417 if (check_osd_exists && osdmap.exists(id)) {
9418 ss << "id " << id << " already in use and does not match uuid "
9419 << uuid;
9420 return -EINVAL;
9421 }
9422 }
9423 return 0;
9424 }
9425
9426 int OSDMonitor::prepare_command_osd_create(
9427 const int32_t id,
9428 const uuid_d& uuid,
9429 int32_t* existing_id,
9430 stringstream& ss)
9431 {
9432 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9433 ceph_assert(existing_id);
9434 if (osdmap.is_destroyed(id)) {
9435 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9436 "instead.";
9437 return -EINVAL;
9438 }
9439
9440 if (uuid.is_zero()) {
9441 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9442 }
9443
9444 return validate_osd_create(id, uuid, true, existing_id, ss);
9445 }
9446
/*
 * Handle `osd new`: create a brand-new osd, or recreate a previously
 * destroyed one, optionally registering cephx/lockbox secrets and a
 * dm-crypt key with the auth and config-key services.
 *
 * @param op      the monitor op (paxos must already be plugged by caller)
 * @param cmdmap  parsed command args (`uuid` required, `id` optional)
 * @param params  secrets/extras parsed from the command input buffer
 * @param ss      textual result (osd id, or error detail)
 * @param f       optional formatter for structured output
 * @return 0 on success, positive EEXIST for an idempotent no-op replay,
 *         negative errno on error
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // skip the "id already exists" check when recreating a destroyed osd
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // -1 means a free existing slot was found instead
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // cephx_secret is mandatory; lockbox secret and dm-crypt key must be
    // supplied together or not at all
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    // flip the DESTROYED bit back off (new_state entries toggle bits) and
    // restore the uuid; the osd slot itself is reused as-is
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9710
9711 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9712 {
9713 op->mark_osdmon_event(__func__);
9714 auto m = op->get_req<MMonCommand>();
9715 stringstream ss;
9716 cmdmap_t cmdmap;
9717 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9718 string rs = ss.str();
9719 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9720 return true;
9721 }
9722
9723 MonSession *session = op->get_session();
9724 if (!session) {
9725 derr << __func__ << " no session" << dendl;
9726 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9727 return true;
9728 }
9729
9730 return prepare_command_impl(op, cmdmap);
9731 }
9732
9733 static int parse_reweights(CephContext *cct,
9734 const cmdmap_t& cmdmap,
9735 const OSDMap& osdmap,
9736 map<int32_t, uint32_t>* weights)
9737 {
9738 string weights_str;
9739 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9740 return -EINVAL;
9741 }
9742 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9743 json_spirit::mValue json_value;
9744 if (!json_spirit::read(weights_str, json_value)) {
9745 return -EINVAL;
9746 }
9747 if (json_value.type() != json_spirit::obj_type) {
9748 return -EINVAL;
9749 }
9750 const auto obj = json_value.get_obj();
9751 try {
9752 for (auto& osd_weight : obj) {
9753 auto osd_id = std::stoi(osd_weight.first);
9754 if (!osdmap.exists(osd_id)) {
9755 return -ENOENT;
9756 }
9757 if (osd_weight.second.type() != json_spirit::str_type) {
9758 return -EINVAL;
9759 }
9760 auto weight = std::stoul(osd_weight.second.get_str());
9761 weights->insert({osd_id, weight});
9762 }
9763 } catch (const std::logic_error& e) {
9764 return -EINVAL;
9765 }
9766 return 0;
9767 }
9768
/*
 * Mark osd `id` as destroyed: strip its auth entities and config-key data
 * and set CEPH_OSD_DESTROYED in the pending incremental.
 *
 * Caller must have plugged paxos and is responsible for proposing -- this
 * may run as part of `osd purge`, and each service may only propose once.
 *
 * @return 0 on success (including an already-destroyed osd), -ENOENT when
 *         the osd does not exist at all, or a negative auth error.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // track which services have nothing left to remove, so we skip their
  // destructive updates below and stay idempotent
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already gone; nothing to do there
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // config-key entries already gone; nothing to do there
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9840
// Purge osd.<id>: remove it from crush, mark it destroyed, and remove
// it from the osdmap. Returns -ENOENT when the osd is already fully
// gone (idempotent replay), 0 on success, or a negative error before
// any change has been staged. The caller is responsible for proposing
// the pending changes.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  // callers must have verified the osd is down before purging.
  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   * 1. we make sure that removing from crush works
   * 2. we call `prepare_command_osd_destroy()`. If it returns an
   * error, then we abort the whole operation, as no updates
   * have been made. However, this function will have
   * side-effects, thus we need to make sure that all operations
   * performed henceforth will *always* succeed.
   * 3. we call `prepare_command_osd_remove()`. Although this
   * function can return an error, it currently only checks if the
   * osd is up - and we have made sure that it is not so, so there
   * is no conflict, and it is effectively an update.
   * 4. finally, we call `do_osd_crush_remove()`, which will perform
   * the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  bool may_be_idempotent = false;

  // step 1: validate the crush removal, but do not commit it yet.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush at all -- possibly a replay of an earlier purge.
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: no point destroying the osd again if it has already been
  // marked destroyed.
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        // destroy saw no such osd; still potentially idempotent.
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy actually did work, so this is not a pure replay.
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // if nothing above changed anything and the osd is gone from the map,
  // the whole purge already happened -- report that to the caller.
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: remove the osd from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: apply the crush change validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9908
9909 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9910 const cmdmap_t& cmdmap)
9911 {
9912 op->mark_osdmon_event(__func__);
9913 auto m = op->get_req<MMonCommand>();
9914 bool ret = false;
9915 stringstream ss;
9916 string rs;
9917 bufferlist rdata;
9918 int err = 0;
9919
9920 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
9921 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9922
9923 string prefix;
9924 cmd_getval(cmdmap, "prefix", prefix);
9925
9926 int64_t osdid;
9927 string osd_name;
9928 bool osdid_present = false;
9929 if (prefix != "osd pg-temp" &&
9930 prefix != "osd pg-upmap" &&
9931 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9932 osdid_present = cmd_getval(cmdmap, "id", osdid);
9933 }
9934 if (osdid_present) {
9935 ostringstream oss;
9936 oss << "osd." << osdid;
9937 osd_name = oss.str();
9938 }
9939
9940 // Even if there's a pending state with changes that could affect
9941 // a command, considering that said state isn't yet committed, we
9942 // just don't care about those changes if the command currently being
9943 // handled acts as a no-op against the current committed state.
9944 // In a nutshell, we assume this command happens *before*.
9945 //
9946 // Let me make this clearer:
9947 //
9948 // - If we have only one client, and that client issues some
9949 // operation that would conflict with this operation but is
9950 // still on the pending state, then we would be sure that said
9951 // operation wouldn't have returned yet, so the client wouldn't
9952 // issue this operation (unless the client didn't wait for the
9953 // operation to finish, and that would be the client's own fault).
9954 //
9955 // - If we have more than one client, each client will observe
9956 // whatever is the state at the moment of the commit. So, if we
9957 // have two clients, one issuing an unlink and another issuing a
9958 // link, and if the link happens while the unlink is still on the
9959 // pending state, from the link's point-of-view this is a no-op.
9960 // If different clients are issuing conflicting operations and
9961 // they care about that, then the clients should make sure they
9962 // enforce some kind of concurrency mechanism -- from our
9963 // perspective that's what Douglas Adams would call an SEP.
9964 //
9965 // This should be used as a general guideline for most commands handled
9966 // in this function. Adapt as you see fit, but please bear in mind that
9967 // this is the expected behavior.
9968
9969
9970 if (prefix == "osd setcrushmap" ||
9971 (prefix == "osd crush set" && !osdid_present)) {
9972 if (pending_inc.crush.length()) {
9973 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9974 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9975 return true;
9976 }
9977 dout(10) << "prepare_command setting new crush map" << dendl;
9978 bufferlist data(m->get_data());
9979 CrushWrapper crush;
9980 try {
9981 auto bl = data.cbegin();
9982 crush.decode(bl);
9983 }
9984 catch (const std::exception &e) {
9985 err = -EINVAL;
9986 ss << "Failed to parse crushmap: " << e.what();
9987 goto reply;
9988 }
9989
9990 int64_t prior_version = 0;
9991 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9992 if (prior_version == osdmap.get_crush_version() - 1) {
9993 // see if we are a resend of the last update. this is imperfect
9994 // (multiple racing updaters may not both get reliable success)
9995 // but we expect crush updaters (via this interface) to be rare-ish.
9996 bufferlist current, proposed;
9997 osdmap.crush->encode(current, mon.get_quorum_con_features());
9998 crush.encode(proposed, mon.get_quorum_con_features());
9999 if (current.contents_equal(proposed)) {
10000 dout(10) << __func__
10001 << " proposed matches current and version equals previous"
10002 << dendl;
10003 err = 0;
10004 ss << osdmap.get_crush_version();
10005 goto reply;
10006 }
10007 }
10008 if (prior_version != osdmap.get_crush_version()) {
10009 err = -EPERM;
10010 ss << "prior_version " << prior_version << " != crush version "
10011 << osdmap.get_crush_version();
10012 goto reply;
10013 }
10014 }
10015
10016 if (!validate_crush_against_features(&crush, ss)) {
10017 err = -EINVAL;
10018 goto reply;
10019 }
10020
10021 err = osdmap.validate_crush_rules(&crush, &ss);
10022 if (err < 0) {
10023 goto reply;
10024 }
10025
10026 if (g_conf()->mon_osd_crush_smoke_test) {
10027 // sanity check: test some inputs to make sure this map isn't
10028 // totally broken
10029 dout(10) << " testing map" << dendl;
10030 stringstream ess;
10031 CrushTester tester(crush, ess);
10032 tester.set_min_x(0);
10033 tester.set_max_x(50);
10034 tester.set_num_rep(3); // arbitrary
10035 auto start = ceph::coarse_mono_clock::now();
10036 int r = tester.test_with_fork(g_conf()->mon_lease);
10037 auto duration = ceph::coarse_mono_clock::now() - start;
10038 if (r < 0) {
10039 dout(10) << " tester.test_with_fork returns " << r
10040 << ": " << ess.str() << dendl;
10041 ss << "crush smoke test failed with " << r << ": " << ess.str();
10042 err = r;
10043 goto reply;
10044 }
10045 dout(10) << __func__ << " crush somke test duration: "
10046 << duration << ", result: " << ess.str() << dendl;
10047 }
10048
10049 pending_inc.crush = data;
10050 ss << osdmap.get_crush_version() + 1;
10051 goto update;
10052
10053 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10054 CrushWrapper newcrush = _get_pending_crush();
10055 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10056 int bid = -1 - b;
10057 if (newcrush.bucket_exists(bid) &&
10058 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10059 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10060 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10061 }
10062 }
10063 if (!validate_crush_against_features(&newcrush, ss)) {
10064 err = -EINVAL;
10065 goto reply;
10066 }
10067 pending_inc.crush.clear();
10068 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10069 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10070 get_last_committed() + 1));
10071 return true;
10072 } else if (prefix == "osd crush set-device-class") {
10073 string device_class;
10074 if (!cmd_getval(cmdmap, "class", device_class)) {
10075 err = -EINVAL; // no value!
10076 goto reply;
10077 }
10078
10079 bool stop = false;
10080 vector<string> idvec;
10081 cmd_getval(cmdmap, "ids", idvec);
10082 CrushWrapper newcrush = _get_pending_crush();
10083 set<int> updated;
10084 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10085 set<int> osds;
10086 // wildcard?
10087 if (j == 0 &&
10088 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10089 osdmap.get_all_osds(osds);
10090 stop = true;
10091 } else {
10092 // try traditional single osd way
10093 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10094 if (osd < 0) {
10095 // ss has reason for failure
10096 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10097 err = -EINVAL;
10098 continue;
10099 }
10100 osds.insert(osd);
10101 }
10102
10103 for (auto &osd : osds) {
10104 if (!osdmap.exists(osd)) {
10105 ss << "osd." << osd << " does not exist. ";
10106 continue;
10107 }
10108
10109 ostringstream oss;
10110 oss << "osd." << osd;
10111 string name = oss.str();
10112
10113 if (newcrush.get_max_devices() < osd + 1) {
10114 newcrush.set_max_devices(osd + 1);
10115 }
10116 string action;
10117 if (newcrush.item_exists(osd)) {
10118 action = "updating";
10119 } else {
10120 action = "creating";
10121 newcrush.set_item_name(osd, name);
10122 }
10123
10124 dout(5) << action << " crush item id " << osd << " name '" << name
10125 << "' device_class '" << device_class << "'"
10126 << dendl;
10127 err = newcrush.update_device_class(osd, device_class, name, &ss);
10128 if (err < 0) {
10129 goto reply;
10130 }
10131 if (err == 0 && !_have_pending_crush()) {
10132 if (!stop) {
10133 // for single osd only, wildcard makes too much noise
10134 ss << "set-device-class item id " << osd << " name '" << name
10135 << "' device_class '" << device_class << "': no change. ";
10136 }
10137 } else {
10138 updated.insert(osd);
10139 }
10140 }
10141 }
10142
10143 pending_inc.crush.clear();
10144 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10145 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10146 getline(ss, rs);
10147 wait_for_finished_proposal(
10148 op,
10149 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10150 return true;
10151 } else if (prefix == "osd crush rm-device-class") {
10152 bool stop = false;
10153 vector<string> idvec;
10154 cmd_getval(cmdmap, "ids", idvec);
10155 CrushWrapper newcrush = _get_pending_crush();
10156 set<int> updated;
10157
10158 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10159 set<int> osds;
10160
10161 // wildcard?
10162 if (j == 0 &&
10163 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10164 osdmap.get_all_osds(osds);
10165 stop = true;
10166 } else {
10167 // try traditional single osd way
10168 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10169 if (osd < 0) {
10170 // ss has reason for failure
10171 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10172 err = -EINVAL;
10173 goto reply;
10174 }
10175 osds.insert(osd);
10176 }
10177
10178 for (auto &osd : osds) {
10179 if (!osdmap.exists(osd)) {
10180 ss << "osd." << osd << " does not exist. ";
10181 continue;
10182 }
10183
10184 auto class_name = newcrush.get_item_class(osd);
10185 if (!class_name) {
10186 ss << "osd." << osd << " belongs to no class, ";
10187 continue;
10188 }
10189 // note that we do not verify if class_is_in_use here
10190 // in case the device is misclassified and user wants
10191 // to overridely reset...
10192
10193 err = newcrush.remove_device_class(cct, osd, &ss);
10194 if (err < 0) {
10195 // ss has reason for failure
10196 goto reply;
10197 }
10198 updated.insert(osd);
10199 }
10200 }
10201
10202 pending_inc.crush.clear();
10203 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10204 ss << "done removing class of osd(s): " << updated;
10205 getline(ss, rs);
10206 wait_for_finished_proposal(
10207 op,
10208 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10209 return true;
10210 } else if (prefix == "osd crush class create") {
10211 string device_class;
10212 if (!cmd_getval(cmdmap, "class", device_class)) {
10213 err = -EINVAL; // no value!
10214 goto reply;
10215 }
10216 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10217 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10218 << "luminous' before using crush device classes";
10219 err = -EPERM;
10220 goto reply;
10221 }
10222 if (!_have_pending_crush() &&
10223 _get_stable_crush().class_exists(device_class)) {
10224 ss << "class '" << device_class << "' already exists";
10225 goto reply;
10226 }
10227 CrushWrapper newcrush = _get_pending_crush();
10228 if (newcrush.class_exists(device_class)) {
10229 ss << "class '" << device_class << "' already exists";
10230 goto update;
10231 }
10232 int class_id = newcrush.get_or_create_class_id(device_class);
10233 pending_inc.crush.clear();
10234 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10235 ss << "created class " << device_class << " with id " << class_id
10236 << " to crush map";
10237 goto update;
10238 } else if (prefix == "osd crush class rm") {
10239 string device_class;
10240 if (!cmd_getval(cmdmap, "class", device_class)) {
10241 err = -EINVAL; // no value!
10242 goto reply;
10243 }
10244 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10245 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10246 << "luminous' before using crush device classes";
10247 err = -EPERM;
10248 goto reply;
10249 }
10250
10251 if (!osdmap.crush->class_exists(device_class)) {
10252 err = 0;
10253 goto reply;
10254 }
10255
10256 CrushWrapper newcrush = _get_pending_crush();
10257 if (!newcrush.class_exists(device_class)) {
10258 err = 0; // make command idempotent
10259 goto wait;
10260 }
10261 int class_id = newcrush.get_class_id(device_class);
10262 stringstream ts;
10263 if (newcrush.class_is_in_use(class_id, &ts)) {
10264 err = -EBUSY;
10265 ss << "class '" << device_class << "' " << ts.str();
10266 goto reply;
10267 }
10268
10269 // check if class is used by any erasure-code-profiles
10270 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10271 osdmap.get_erasure_code_profiles();
10272 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10273 #ifdef HAVE_STDLIB_MAP_SPLICING
10274 ec_profiles.merge(old_ec_profiles);
10275 #else
10276 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10277 make_move_iterator(end(old_ec_profiles)));
10278 #endif
10279 list<string> referenced_by;
10280 for (auto &i: ec_profiles) {
10281 for (auto &j: i.second) {
10282 if ("crush-device-class" == j.first && device_class == j.second) {
10283 referenced_by.push_back(i.first);
10284 }
10285 }
10286 }
10287 if (!referenced_by.empty()) {
10288 err = -EBUSY;
10289 ss << "class '" << device_class
10290 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10291 goto reply;
10292 }
10293
10294 set<int> osds;
10295 newcrush.get_devices_by_class(device_class, &osds);
10296 for (auto& p: osds) {
10297 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
10298 if (err < 0) {
10299 // ss has reason for failure
10300 goto reply;
10301 }
10302 }
10303
10304 if (osds.empty()) {
10305 // empty class, remove directly
10306 err = newcrush.remove_class_name(device_class);
10307 if (err < 0) {
10308 ss << "class '" << device_class << "' cannot be removed '"
10309 << cpp_strerror(err) << "'";
10310 goto reply;
10311 }
10312 }
10313
10314 pending_inc.crush.clear();
10315 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10316 ss << "removed class " << device_class << " with id " << class_id
10317 << " from crush map";
10318 goto update;
10319 } else if (prefix == "osd crush class rename") {
10320 string srcname, dstname;
10321 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10322 err = -EINVAL;
10323 goto reply;
10324 }
10325 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10326 err = -EINVAL;
10327 goto reply;
10328 }
10329
10330 CrushWrapper newcrush = _get_pending_crush();
10331 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10332 // suppose this is a replay and return success
10333 // so command is idempotent
10334 ss << "already renamed to '" << dstname << "'";
10335 err = 0;
10336 goto reply;
10337 }
10338
10339 err = newcrush.rename_class(srcname, dstname);
10340 if (err < 0) {
10341 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10342 << cpp_strerror(err);
10343 goto reply;
10344 }
10345
10346 pending_inc.crush.clear();
10347 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10348 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10349 goto update;
10350 } else if (prefix == "osd crush add-bucket") {
10351 // os crush add-bucket <name> <type>
10352 string name, typestr;
10353 vector<string> argvec;
10354 cmd_getval(cmdmap, "name", name);
10355 cmd_getval(cmdmap, "type", typestr);
10356 cmd_getval(cmdmap, "args", argvec);
10357 map<string,string> loc;
10358 if (!argvec.empty()) {
10359 CrushWrapper::parse_loc_map(argvec, &loc);
10360 dout(0) << "will create and move bucket '" << name
10361 << "' to location " << loc << dendl;
10362 }
10363
10364 if (!_have_pending_crush() &&
10365 _get_stable_crush().name_exists(name)) {
10366 ss << "bucket '" << name << "' already exists";
10367 goto reply;
10368 }
10369
10370 CrushWrapper newcrush = _get_pending_crush();
10371
10372 if (newcrush.name_exists(name)) {
10373 ss << "bucket '" << name << "' already exists";
10374 goto update;
10375 }
10376 int type = newcrush.get_type_id(typestr);
10377 if (type < 0) {
10378 ss << "type '" << typestr << "' does not exist";
10379 err = -EINVAL;
10380 goto reply;
10381 }
10382 if (type == 0) {
10383 ss << "type '" << typestr << "' is for devices, not buckets";
10384 err = -EINVAL;
10385 goto reply;
10386 }
10387 int bucketno;
10388 err = newcrush.add_bucket(0, 0,
10389 CRUSH_HASH_DEFAULT, type, 0, NULL,
10390 NULL, &bucketno);
10391 if (err < 0) {
10392 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10393 goto reply;
10394 }
10395 err = newcrush.set_item_name(bucketno, name);
10396 if (err < 0) {
10397 ss << "error setting bucket name to '" << name << "'";
10398 goto reply;
10399 }
10400
10401 if (!loc.empty()) {
10402 if (!newcrush.check_item_loc(cct, bucketno, loc,
10403 (int *)NULL)) {
10404 err = newcrush.move_bucket(cct, bucketno, loc);
10405 if (err < 0) {
10406 ss << "error moving bucket '" << name << "' to location " << loc;
10407 goto reply;
10408 }
10409 } else {
10410 ss << "no need to move item id " << bucketno << " name '" << name
10411 << "' to location " << loc << " in crush map";
10412 }
10413 }
10414
10415 pending_inc.crush.clear();
10416 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10417 if (loc.empty()) {
10418 ss << "added bucket " << name << " type " << typestr
10419 << " to crush map";
10420 } else {
10421 ss << "added bucket " << name << " type " << typestr
10422 << " to location " << loc;
10423 }
10424 goto update;
10425 } else if (prefix == "osd crush rename-bucket") {
10426 string srcname, dstname;
10427 cmd_getval(cmdmap, "srcname", srcname);
10428 cmd_getval(cmdmap, "dstname", dstname);
10429
10430 err = crush_rename_bucket(srcname, dstname, &ss);
10431 if (err == -EALREADY) // equivalent to success for idempotency
10432 err = 0;
10433 if (err)
10434 goto reply;
10435 else
10436 goto update;
10437 } else if (prefix == "osd crush weight-set create" ||
10438 prefix == "osd crush weight-set create-compat") {
10439 if (_have_pending_crush()) {
10440 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10441 goto wait;
10442 }
10443 CrushWrapper newcrush = _get_pending_crush();
10444 int64_t pool;
10445 int positions;
10446 if (newcrush.has_non_straw2_buckets()) {
10447 ss << "crush map contains one or more bucket(s) that are not straw2";
10448 err = -EPERM;
10449 goto reply;
10450 }
10451 if (prefix == "osd crush weight-set create") {
10452 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10453 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10454 ss << "require_min_compat_client "
10455 << osdmap.require_min_compat_client
10456 << " < luminous, which is required for per-pool weight-sets. "
10457 << "Try 'ceph osd set-require-min-compat-client luminous' "
10458 << "before using the new interface";
10459 err = -EPERM;
10460 goto reply;
10461 }
10462 string poolname, mode;
10463 cmd_getval(cmdmap, "pool", poolname);
10464 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10465 if (pool < 0) {
10466 ss << "pool '" << poolname << "' not found";
10467 err = -ENOENT;
10468 goto reply;
10469 }
10470 cmd_getval(cmdmap, "mode", mode);
10471 if (mode != "flat" && mode != "positional") {
10472 ss << "unrecognized weight-set mode '" << mode << "'";
10473 err = -EINVAL;
10474 goto reply;
10475 }
10476 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10477 } else {
10478 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10479 positions = 1;
10480 }
10481 if (!newcrush.create_choose_args(pool, positions)) {
10482 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10483 ss << "compat weight-set already created";
10484 } else {
10485 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10486 << "' already created";
10487 }
10488 goto reply;
10489 }
10490 pending_inc.crush.clear();
10491 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10492 goto update;
10493
10494 } else if (prefix == "osd crush weight-set rm" ||
10495 prefix == "osd crush weight-set rm-compat") {
10496 CrushWrapper newcrush = _get_pending_crush();
10497 int64_t pool;
10498 if (prefix == "osd crush weight-set rm") {
10499 string poolname;
10500 cmd_getval(cmdmap, "pool", poolname);
10501 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10502 if (pool < 0) {
10503 ss << "pool '" << poolname << "' not found";
10504 err = -ENOENT;
10505 goto reply;
10506 }
10507 } else {
10508 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10509 }
10510 newcrush.rm_choose_args(pool);
10511 pending_inc.crush.clear();
10512 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10513 goto update;
10514
10515 } else if (prefix == "osd crush weight-set reweight" ||
10516 prefix == "osd crush weight-set reweight-compat") {
10517 string poolname, item;
10518 vector<double> weight;
10519 cmd_getval(cmdmap, "pool", poolname);
10520 cmd_getval(cmdmap, "item", item);
10521 cmd_getval(cmdmap, "weight", weight);
10522 CrushWrapper newcrush = _get_pending_crush();
10523 int64_t pool;
10524 if (prefix == "osd crush weight-set reweight") {
10525 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10526 if (pool < 0) {
10527 ss << "pool '" << poolname << "' not found";
10528 err = -ENOENT;
10529 goto reply;
10530 }
10531 if (!newcrush.have_choose_args(pool)) {
10532 ss << "no weight-set for pool '" << poolname << "'";
10533 err = -ENOENT;
10534 goto reply;
10535 }
10536 auto arg_map = newcrush.choose_args_get(pool);
10537 int positions = newcrush.get_choose_args_positions(arg_map);
10538 if (weight.size() != (size_t)positions) {
10539 ss << "must specify exact " << positions << " weight values";
10540 err = -EINVAL;
10541 goto reply;
10542 }
10543 } else {
10544 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10545 if (!newcrush.have_choose_args(pool)) {
10546 ss << "no backward-compatible weight-set";
10547 err = -ENOENT;
10548 goto reply;
10549 }
10550 }
10551 if (!newcrush.name_exists(item)) {
10552 ss << "item '" << item << "' does not exist";
10553 err = -ENOENT;
10554 goto reply;
10555 }
10556 err = newcrush.choose_args_adjust_item_weightf(
10557 cct,
10558 newcrush.choose_args_get(pool),
10559 newcrush.get_item_id(item),
10560 weight,
10561 &ss);
10562 if (err < 0) {
10563 goto reply;
10564 }
10565 err = 0;
10566 pending_inc.crush.clear();
10567 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10568 goto update;
10569 } else if (osdid_present &&
10570 (prefix == "osd crush set" || prefix == "osd crush add")) {
10571 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10572 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10573 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10574
10575 if (!osdmap.exists(osdid)) {
10576 err = -ENOENT;
10577 ss << osd_name
10578 << " does not exist. Create it before updating the crush map";
10579 goto reply;
10580 }
10581
10582 double weight;
10583 if (!cmd_getval(cmdmap, "weight", weight)) {
10584 ss << "unable to parse weight value '"
10585 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10586 err = -EINVAL;
10587 goto reply;
10588 }
10589
10590 string args;
10591 vector<string> argvec;
10592 cmd_getval(cmdmap, "args", argvec);
10593 map<string,string> loc;
10594 CrushWrapper::parse_loc_map(argvec, &loc);
10595
10596 if (prefix == "osd crush set"
10597 && !_get_stable_crush().item_exists(osdid)) {
10598 err = -ENOENT;
10599 ss << "unable to set item id " << osdid << " name '" << osd_name
10600 << "' weight " << weight << " at location " << loc
10601 << ": does not exist";
10602 goto reply;
10603 }
10604
10605 dout(5) << "adding/updating crush item id " << osdid << " name '"
10606 << osd_name << "' weight " << weight << " at location "
10607 << loc << dendl;
10608 CrushWrapper newcrush = _get_pending_crush();
10609
10610 string action;
10611 if (prefix == "osd crush set" ||
10612 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10613 action = "set";
10614 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10615 } else {
10616 action = "add";
10617 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10618 if (err == 0)
10619 err = 1;
10620 }
10621
10622 if (err < 0)
10623 goto reply;
10624
10625 if (err == 0 && !_have_pending_crush()) {
10626 ss << action << " item id " << osdid << " name '" << osd_name
10627 << "' weight " << weight << " at location " << loc << ": no change";
10628 goto reply;
10629 }
10630
10631 pending_inc.crush.clear();
10632 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10633 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10634 << weight << " at location " << loc << " to crush map";
10635 getline(ss, rs);
10636 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10637 get_last_committed() + 1));
10638 return true;
10639
10640 } else if (prefix == "osd crush create-or-move") {
10641 do {
10642 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10643 if (!osdmap.exists(osdid)) {
10644 err = -ENOENT;
10645 ss << osd_name
10646 << " does not exist. create it before updating the crush map";
10647 goto reply;
10648 }
10649
10650 double weight;
10651 if (!cmd_getval(cmdmap, "weight", weight)) {
10652 ss << "unable to parse weight value '"
10653 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10654 err = -EINVAL;
10655 goto reply;
10656 }
10657
10658 string args;
10659 vector<string> argvec;
10660 cmd_getval(cmdmap, "args", argvec);
10661 map<string,string> loc;
10662 CrushWrapper::parse_loc_map(argvec, &loc);
10663
10664 dout(0) << "create-or-move crush item name '" << osd_name
10665 << "' initial_weight " << weight << " at location " << loc
10666 << dendl;
10667
10668 CrushWrapper newcrush = _get_pending_crush();
10669
10670 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10671 g_conf()->osd_crush_update_weight_set);
10672 if (err == 0) {
10673 ss << "create-or-move updated item name '" << osd_name
10674 << "' weight " << weight
10675 << " at location " << loc << " to crush map";
10676 break;
10677 }
10678 if (err > 0) {
10679 pending_inc.crush.clear();
10680 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10681 ss << "create-or-move updating item name '" << osd_name
10682 << "' weight " << weight
10683 << " at location " << loc << " to crush map";
10684 getline(ss, rs);
10685 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10686 get_last_committed() + 1));
10687 return true;
10688 }
10689 } while (false);
10690
10691 } else if (prefix == "osd crush move") {
10692 do {
10693 // osd crush move <name> <loc1> [<loc2> ...]
10694 string name;
10695 vector<string> argvec;
10696 cmd_getval(cmdmap, "name", name);
10697 cmd_getval(cmdmap, "args", argvec);
10698 map<string,string> loc;
10699 CrushWrapper::parse_loc_map(argvec, &loc);
10700
10701 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10702 CrushWrapper newcrush = _get_pending_crush();
10703
10704 if (!newcrush.name_exists(name)) {
10705 err = -ENOENT;
10706 ss << "item " << name << " does not exist";
10707 break;
10708 }
10709 int id = newcrush.get_item_id(name);
10710
10711 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10712 if (id >= 0) {
10713 err = newcrush.create_or_move_item(
10714 cct, id, 0, name, loc,
10715 g_conf()->osd_crush_update_weight_set);
10716 } else {
10717 err = newcrush.move_bucket(cct, id, loc);
10718 }
10719 if (err >= 0) {
10720 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10721 pending_inc.crush.clear();
10722 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10723 getline(ss, rs);
10724 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10725 get_last_committed() + 1));
10726 return true;
10727 }
10728 } else {
10729 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10730 err = 0;
10731 }
10732 } while (false);
10733 } else if (prefix == "osd crush swap-bucket") {
10734 string source, dest;
10735 cmd_getval(cmdmap, "source", source);
10736 cmd_getval(cmdmap, "dest", dest);
10737
10738 bool force = false;
10739 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10740
10741 CrushWrapper newcrush = _get_pending_crush();
10742 if (!newcrush.name_exists(source)) {
10743 ss << "source item " << source << " does not exist";
10744 err = -ENOENT;
10745 goto reply;
10746 }
10747 if (!newcrush.name_exists(dest)) {
10748 ss << "dest item " << dest << " does not exist";
10749 err = -ENOENT;
10750 goto reply;
10751 }
10752 int sid = newcrush.get_item_id(source);
10753 int did = newcrush.get_item_id(dest);
10754 int sparent;
10755 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10756 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10757 err = -EPERM;
10758 goto reply;
10759 }
10760 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10761 !force) {
10762 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10763 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10764 << "; pass --yes-i-really-mean-it to proceed anyway";
10765 err = -EPERM;
10766 goto reply;
10767 }
10768 int r = newcrush.swap_bucket(cct, sid, did);
10769 if (r < 0) {
10770 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10771 err = r;
10772 goto reply;
10773 }
10774 ss << "swapped bucket of " << source << " to " << dest;
10775 pending_inc.crush.clear();
10776 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10777 wait_for_finished_proposal(op,
10778 new Monitor::C_Command(mon, op, err, ss.str(),
10779 get_last_committed() + 1));
10780 return true;
10781 } else if (prefix == "osd crush link") {
10782 // osd crush link <name> <loc1> [<loc2> ...]
10783 string name;
10784 cmd_getval(cmdmap, "name", name);
10785 vector<string> argvec;
10786 cmd_getval(cmdmap, "args", argvec);
10787 map<string,string> loc;
10788 CrushWrapper::parse_loc_map(argvec, &loc);
10789
10790 // Need an explicit check for name_exists because get_item_id returns
10791 // 0 on unfound.
10792 int id = osdmap.crush->get_item_id(name);
10793 if (!osdmap.crush->name_exists(name)) {
10794 err = -ENOENT;
10795 ss << "item " << name << " does not exist";
10796 goto reply;
10797 } else {
10798 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10799 }
10800 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10801 ss << "no need to move item id " << id << " name '" << name
10802 << "' to location " << loc << " in crush map";
10803 err = 0;
10804 goto reply;
10805 }
10806
10807 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10808 CrushWrapper newcrush = _get_pending_crush();
10809
10810 if (!newcrush.name_exists(name)) {
10811 err = -ENOENT;
10812 ss << "item " << name << " does not exist";
10813 goto reply;
10814 } else {
10815 int id = newcrush.get_item_id(name);
10816 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10817 err = newcrush.link_bucket(cct, id, loc);
10818 if (err >= 0) {
10819 ss << "linked item id " << id << " name '" << name
10820 << "' to location " << loc << " in crush map";
10821 pending_inc.crush.clear();
10822 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10823 } else {
10824 ss << "cannot link item id " << id << " name '" << name
10825 << "' to location " << loc;
10826 goto reply;
10827 }
10828 } else {
10829 ss << "no need to move item id " << id << " name '" << name
10830 << "' to location " << loc << " in crush map";
10831 err = 0;
10832 }
10833 }
10834 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10835 get_last_committed() + 1));
10836 return true;
10837 } else if (prefix == "osd crush rm" ||
10838 prefix == "osd crush remove" ||
10839 prefix == "osd crush unlink") {
10840 do {
10841 // osd crush rm <id> [ancestor]
10842 CrushWrapper newcrush = _get_pending_crush();
10843
10844 string name;
10845 cmd_getval(cmdmap, "name", name);
10846
10847 if (!osdmap.crush->name_exists(name)) {
10848 err = 0;
10849 ss << "device '" << name << "' does not appear in the crush map";
10850 break;
10851 }
10852 if (!newcrush.name_exists(name)) {
10853 err = 0;
10854 ss << "device '" << name << "' does not appear in the crush map";
10855 getline(ss, rs);
10856 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10857 get_last_committed() + 1));
10858 return true;
10859 }
10860 int id = newcrush.get_item_id(name);
10861 int ancestor = 0;
10862
10863 bool unlink_only = prefix == "osd crush unlink";
10864 string ancestor_str;
10865 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10866 if (!newcrush.name_exists(ancestor_str)) {
10867 err = -ENOENT;
10868 ss << "ancestor item '" << ancestor_str
10869 << "' does not appear in the crush map";
10870 break;
10871 }
10872 ancestor = newcrush.get_item_id(ancestor_str);
10873 }
10874
10875 err = prepare_command_osd_crush_remove(
10876 newcrush,
10877 id, ancestor,
10878 (ancestor < 0), unlink_only);
10879
10880 if (err == -ENOENT) {
10881 ss << "item " << id << " does not appear in that position";
10882 err = 0;
10883 break;
10884 }
10885 if (err == 0) {
10886 if (!unlink_only)
10887 pending_inc.new_crush_node_flags[id] = 0;
10888 ss << "removed item id " << id << " name '" << name << "' from crush map";
10889 getline(ss, rs);
10890 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10891 get_last_committed() + 1));
10892 return true;
10893 }
10894 } while (false);
10895
10896 } else if (prefix == "osd crush reweight-all") {
10897 CrushWrapper newcrush = _get_pending_crush();
10898
10899 newcrush.reweight(cct);
10900 pending_inc.crush.clear();
10901 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10902 ss << "reweighted crush hierarchy";
10903 getline(ss, rs);
10904 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10905 get_last_committed() + 1));
10906 return true;
10907 } else if (prefix == "osd crush reweight") {
10908 // osd crush reweight <name> <weight>
10909 CrushWrapper newcrush = _get_pending_crush();
10910
10911 string name;
10912 cmd_getval(cmdmap, "name", name);
10913 if (!newcrush.name_exists(name)) {
10914 err = -ENOENT;
10915 ss << "device '" << name << "' does not appear in the crush map";
10916 goto reply;
10917 }
10918
10919 int id = newcrush.get_item_id(name);
10920 if (id < 0) {
10921 ss << "device '" << name << "' is not a leaf in the crush map";
10922 err = -EINVAL;
10923 goto reply;
10924 }
10925 double w;
10926 if (!cmd_getval(cmdmap, "weight", w)) {
10927 ss << "unable to parse weight value '"
10928 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10929 err = -EINVAL;
10930 goto reply;
10931 }
10932
10933 err = newcrush.adjust_item_weightf(cct, id, w,
10934 g_conf()->osd_crush_update_weight_set);
10935 if (err < 0)
10936 goto reply;
10937 pending_inc.crush.clear();
10938 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10939 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10940 << " in crush map";
10941 getline(ss, rs);
10942 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10943 get_last_committed() + 1));
10944 return true;
10945 } else if (prefix == "osd crush reweight-subtree") {
10946 // osd crush reweight <name> <weight>
10947 CrushWrapper newcrush = _get_pending_crush();
10948
10949 string name;
10950 cmd_getval(cmdmap, "name", name);
10951 if (!newcrush.name_exists(name)) {
10952 err = -ENOENT;
10953 ss << "device '" << name << "' does not appear in the crush map";
10954 goto reply;
10955 }
10956
10957 int id = newcrush.get_item_id(name);
10958 if (id >= 0) {
10959 ss << "device '" << name << "' is not a subtree in the crush map";
10960 err = -EINVAL;
10961 goto reply;
10962 }
10963 double w;
10964 if (!cmd_getval(cmdmap, "weight", w)) {
10965 ss << "unable to parse weight value '"
10966 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10967 err = -EINVAL;
10968 goto reply;
10969 }
10970
10971 err = newcrush.adjust_subtree_weightf(cct, id, w,
10972 g_conf()->osd_crush_update_weight_set);
10973 if (err < 0)
10974 goto reply;
10975 pending_inc.crush.clear();
10976 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10977 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10978 << " in crush map";
10979 getline(ss, rs);
10980 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10981 get_last_committed() + 1));
10982 return true;
10983 } else if (prefix == "osd crush tunables") {
10984 CrushWrapper newcrush = _get_pending_crush();
10985
10986 err = 0;
10987 string profile;
10988 cmd_getval(cmdmap, "profile", profile);
10989 if (profile == "legacy" || profile == "argonaut") {
10990 newcrush.set_tunables_legacy();
10991 } else if (profile == "bobtail") {
10992 newcrush.set_tunables_bobtail();
10993 } else if (profile == "firefly") {
10994 newcrush.set_tunables_firefly();
10995 } else if (profile == "hammer") {
10996 newcrush.set_tunables_hammer();
10997 } else if (profile == "jewel") {
10998 newcrush.set_tunables_jewel();
10999 } else if (profile == "optimal") {
11000 newcrush.set_tunables_optimal();
11001 } else if (profile == "default") {
11002 newcrush.set_tunables_default();
11003 } else {
11004 ss << "unrecognized profile '" << profile << "'";
11005 err = -EINVAL;
11006 goto reply;
11007 }
11008
11009 if (!validate_crush_against_features(&newcrush, ss)) {
11010 err = -EINVAL;
11011 goto reply;
11012 }
11013
11014 pending_inc.crush.clear();
11015 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11016 ss << "adjusted tunables profile to " << profile;
11017 getline(ss, rs);
11018 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11019 get_last_committed() + 1));
11020 return true;
11021 } else if (prefix == "osd crush set-tunable") {
11022 CrushWrapper newcrush = _get_pending_crush();
11023
11024 err = 0;
11025 string tunable;
11026 cmd_getval(cmdmap, "tunable", tunable);
11027
11028 int64_t value = -1;
11029 if (!cmd_getval(cmdmap, "value", value)) {
11030 err = -EINVAL;
11031 ss << "failed to parse integer value "
11032 << cmd_vartype_stringify(cmdmap.at("value"));
11033 goto reply;
11034 }
11035
11036 if (tunable == "straw_calc_version") {
11037 if (value != 0 && value != 1) {
11038 ss << "value must be 0 or 1; got " << value;
11039 err = -EINVAL;
11040 goto reply;
11041 }
11042 newcrush.set_straw_calc_version(value);
11043 } else {
11044 ss << "unrecognized tunable '" << tunable << "'";
11045 err = -EINVAL;
11046 goto reply;
11047 }
11048
11049 if (!validate_crush_against_features(&newcrush, ss)) {
11050 err = -EINVAL;
11051 goto reply;
11052 }
11053
11054 pending_inc.crush.clear();
11055 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11056 ss << "adjusted tunable " << tunable << " to " << value;
11057 getline(ss, rs);
11058 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11059 get_last_committed() + 1));
11060 return true;
11061
11062 } else if (prefix == "osd crush rule create-simple") {
11063 string name, root, type, mode;
11064 cmd_getval(cmdmap, "name", name);
11065 cmd_getval(cmdmap, "root", root);
11066 cmd_getval(cmdmap, "type", type);
11067 cmd_getval(cmdmap, "mode", mode);
11068 if (mode == "")
11069 mode = "firstn";
11070
11071 if (osdmap.crush->rule_exists(name)) {
11072 // The name is uniquely associated to a ruleid and the rule it contains
11073 // From the user point of view, the rule is more meaningfull.
11074 ss << "rule " << name << " already exists";
11075 err = 0;
11076 goto reply;
11077 }
11078
11079 CrushWrapper newcrush = _get_pending_crush();
11080
11081 if (newcrush.rule_exists(name)) {
11082 // The name is uniquely associated to a ruleid and the rule it contains
11083 // From the user point of view, the rule is more meaningfull.
11084 ss << "rule " << name << " already exists";
11085 err = 0;
11086 } else {
11087 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11088 pg_pool_t::TYPE_REPLICATED, &ss);
11089 if (ruleno < 0) {
11090 err = ruleno;
11091 goto reply;
11092 }
11093
11094 pending_inc.crush.clear();
11095 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11096 }
11097 getline(ss, rs);
11098 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11099 get_last_committed() + 1));
11100 return true;
11101
11102 } else if (prefix == "osd crush rule create-replicated") {
11103 string name, root, type, device_class;
11104 cmd_getval(cmdmap, "name", name);
11105 cmd_getval(cmdmap, "root", root);
11106 cmd_getval(cmdmap, "type", type);
11107 cmd_getval(cmdmap, "class", device_class);
11108
11109 if (osdmap.crush->rule_exists(name)) {
11110 // The name is uniquely associated to a ruleid and the rule it contains
11111 // From the user point of view, the rule is more meaningfull.
11112 ss << "rule " << name << " already exists";
11113 err = 0;
11114 goto reply;
11115 }
11116
11117 CrushWrapper newcrush = _get_pending_crush();
11118
11119 if (newcrush.rule_exists(name)) {
11120 // The name is uniquely associated to a ruleid and the rule it contains
11121 // From the user point of view, the rule is more meaningfull.
11122 ss << "rule " << name << " already exists";
11123 err = 0;
11124 } else {
11125 int ruleno = newcrush.add_simple_rule(
11126 name, root, type, device_class,
11127 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11128 if (ruleno < 0) {
11129 err = ruleno;
11130 goto reply;
11131 }
11132
11133 pending_inc.crush.clear();
11134 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11135 }
11136 getline(ss, rs);
11137 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11138 get_last_committed() + 1));
11139 return true;
11140
11141 } else if (prefix == "osd erasure-code-profile rm") {
11142 string name;
11143 cmd_getval(cmdmap, "name", name);
11144
11145 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11146 goto wait;
11147
11148 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11149 err = -EBUSY;
11150 goto reply;
11151 }
11152
11153 if (osdmap.has_erasure_code_profile(name) ||
11154 pending_inc.new_erasure_code_profiles.count(name)) {
11155 if (osdmap.has_erasure_code_profile(name)) {
11156 pending_inc.old_erasure_code_profiles.push_back(name);
11157 } else {
11158 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11159 pending_inc.new_erasure_code_profiles.erase(name);
11160 }
11161
11162 getline(ss, rs);
11163 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11164 get_last_committed() + 1));
11165 return true;
11166 } else {
11167 ss << "erasure-code-profile " << name << " does not exist";
11168 err = 0;
11169 goto reply;
11170 }
11171
11172 } else if (prefix == "osd erasure-code-profile set") {
11173 string name;
11174 cmd_getval(cmdmap, "name", name);
11175 vector<string> profile;
11176 cmd_getval(cmdmap, "profile", profile);
11177
11178 bool force = false;
11179 cmd_getval(cmdmap, "force", force);
11180
11181 map<string,string> profile_map;
11182 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11183 if (err)
11184 goto reply;
11185 if (auto found = profile_map.find("crush-failure-domain");
11186 found != profile_map.end()) {
11187 const auto& failure_domain = found->second;
11188 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11189 if (failure_domain_type < 0) {
11190 ss << "erasure-code-profile " << profile_map
11191 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11192 err = -EINVAL;
11193 goto reply;
11194 }
11195 }
11196
11197 if (profile_map.find("plugin") == profile_map.end()) {
11198 ss << "erasure-code-profile " << profile_map
11199 << " must contain a plugin entry" << std::endl;
11200 err = -EINVAL;
11201 goto reply;
11202 }
11203 string plugin = profile_map["plugin"];
11204
11205 if (pending_inc.has_erasure_code_profile(name)) {
11206 dout(20) << "erasure code profile " << name << " try again" << dendl;
11207 goto wait;
11208 } else {
11209 err = normalize_profile(name, profile_map, force, &ss);
11210 if (err)
11211 goto reply;
11212
11213 if (osdmap.has_erasure_code_profile(name)) {
11214 ErasureCodeProfile existing_profile_map =
11215 osdmap.get_erasure_code_profile(name);
11216 err = normalize_profile(name, existing_profile_map, force, &ss);
11217 if (err)
11218 goto reply;
11219
11220 if (existing_profile_map == profile_map) {
11221 err = 0;
11222 goto reply;
11223 }
11224 if (!force) {
11225 err = -EPERM;
11226 ss << "will not override erasure code profile " << name
11227 << " because the existing profile "
11228 << existing_profile_map
11229 << " is different from the proposed profile "
11230 << profile_map;
11231 goto reply;
11232 }
11233 }
11234
11235 dout(20) << "erasure code profile set " << name << "="
11236 << profile_map << dendl;
11237 pending_inc.set_erasure_code_profile(name, profile_map);
11238 }
11239
11240 getline(ss, rs);
11241 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11242 get_last_committed() + 1));
11243 return true;
11244
11245 } else if (prefix == "osd crush rule create-erasure") {
11246 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11247 if (err == -EAGAIN)
11248 goto wait;
11249 if (err)
11250 goto reply;
11251 string name, poolstr;
11252 cmd_getval(cmdmap, "name", name);
11253 string profile;
11254 cmd_getval(cmdmap, "profile", profile);
11255 if (profile == "")
11256 profile = "default";
11257 if (profile == "default") {
11258 if (!osdmap.has_erasure_code_profile(profile)) {
11259 if (pending_inc.has_erasure_code_profile(profile)) {
11260 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11261 goto wait;
11262 }
11263
11264 map<string,string> profile_map;
11265 err = osdmap.get_erasure_code_profile_default(cct,
11266 profile_map,
11267 &ss);
11268 if (err)
11269 goto reply;
11270 err = normalize_profile(name, profile_map, true, &ss);
11271 if (err)
11272 goto reply;
11273 dout(20) << "erasure code profile set " << profile << "="
11274 << profile_map << dendl;
11275 pending_inc.set_erasure_code_profile(profile, profile_map);
11276 goto wait;
11277 }
11278 }
11279
11280 int rule;
11281 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11282 if (err < 0) {
11283 switch(err) {
11284 case -EEXIST: // return immediately
11285 ss << "rule " << name << " already exists";
11286 err = 0;
11287 goto reply;
11288 break;
11289 case -EALREADY: // wait for pending to be proposed
11290 ss << "rule " << name << " already exists";
11291 err = 0;
11292 break;
11293 default: // non recoverable error
11294 goto reply;
11295 break;
11296 }
11297 } else {
11298 ss << "created rule " << name << " at " << rule;
11299 }
11300
11301 getline(ss, rs);
11302 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11303 get_last_committed() + 1));
11304 return true;
11305
11306 } else if (prefix == "osd crush rule rm") {
11307 string name;
11308 cmd_getval(cmdmap, "name", name);
11309
11310 if (!osdmap.crush->rule_exists(name)) {
11311 ss << "rule " << name << " does not exist";
11312 err = 0;
11313 goto reply;
11314 }
11315
11316 CrushWrapper newcrush = _get_pending_crush();
11317
11318 if (!newcrush.rule_exists(name)) {
11319 ss << "rule " << name << " does not exist";
11320 err = 0;
11321 } else {
11322 int ruleno = newcrush.get_rule_id(name);
11323 ceph_assert(ruleno >= 0);
11324
11325 // make sure it is not in use.
11326 // FIXME: this is ok in some situations, but let's not bother with that
11327 // complexity now.
11328 if (osdmap.crush_rule_in_use(ruleno)) {
11329 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11330 err = -EBUSY;
11331 goto reply;
11332 }
11333
11334 err = newcrush.remove_rule(ruleno);
11335 if (err < 0) {
11336 goto reply;
11337 }
11338
11339 pending_inc.crush.clear();
11340 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11341 }
11342 getline(ss, rs);
11343 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11344 get_last_committed() + 1));
11345 return true;
11346
11347 } else if (prefix == "osd crush rule rename") {
11348 string srcname;
11349 string dstname;
11350 cmd_getval(cmdmap, "srcname", srcname);
11351 cmd_getval(cmdmap, "dstname", dstname);
11352 if (srcname.empty() || dstname.empty()) {
11353 ss << "must specify both source rule name and destination rule name";
11354 err = -EINVAL;
11355 goto reply;
11356 }
11357 if (srcname == dstname) {
11358 ss << "destination rule name is equal to source rule name";
11359 err = 0;
11360 goto reply;
11361 }
11362
11363 CrushWrapper newcrush = _get_pending_crush();
11364 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11365 // srcname does not exist and dstname already exists
11366 // suppose this is a replay and return success
11367 // (so this command is idempotent)
11368 ss << "already renamed to '" << dstname << "'";
11369 err = 0;
11370 goto reply;
11371 }
11372
11373 err = newcrush.rename_rule(srcname, dstname, &ss);
11374 if (err < 0) {
11375 // ss has reason for failure
11376 goto reply;
11377 }
11378 pending_inc.crush.clear();
11379 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11380 getline(ss, rs);
11381 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11382 get_last_committed() + 1));
11383 return true;
11384
11385 } else if (prefix == "osd setmaxosd") {
11386 int64_t newmax;
11387 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11388 ss << "unable to parse 'newmax' value '"
11389 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11390 err = -EINVAL;
11391 goto reply;
11392 }
11393
11394 if (newmax > g_conf()->mon_max_osd) {
11395 err = -ERANGE;
11396 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11397 << g_conf()->mon_max_osd << ")";
11398 goto reply;
11399 }
11400
11401 // Don't allow shrinking OSD number as this will cause data loss
11402 // and may cause kernel crashes.
11403 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11404 if (newmax < osdmap.get_max_osd()) {
11405 // Check if the OSDs exist between current max and new value.
11406 // If there are any OSDs exist, then don't allow shrinking number
11407 // of OSDs.
11408 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11409 if (osdmap.exists(i)) {
11410 err = -EBUSY;
11411 ss << "cannot shrink max_osd to " << newmax
11412 << " because osd." << i << " (and possibly others) still in use";
11413 goto reply;
11414 }
11415 }
11416 }
11417
11418 pending_inc.new_max_osd = newmax;
11419 ss << "set new max_osd = " << pending_inc.new_max_osd;
11420 getline(ss, rs);
11421 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11422 get_last_committed() + 1));
11423 return true;
11424
11425 } else if (prefix == "osd set-full-ratio" ||
11426 prefix == "osd set-backfillfull-ratio" ||
11427 prefix == "osd set-nearfull-ratio") {
11428 double n;
11429 if (!cmd_getval(cmdmap, "ratio", n)) {
11430 ss << "unable to parse 'ratio' value '"
11431 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11432 err = -EINVAL;
11433 goto reply;
11434 }
11435 if (prefix == "osd set-full-ratio")
11436 pending_inc.new_full_ratio = n;
11437 else if (prefix == "osd set-backfillfull-ratio")
11438 pending_inc.new_backfillfull_ratio = n;
11439 else if (prefix == "osd set-nearfull-ratio")
11440 pending_inc.new_nearfull_ratio = n;
11441 ss << prefix << " " << n;
11442 getline(ss, rs);
11443 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11444 get_last_committed() + 1));
11445 return true;
11446 } else if (prefix == "osd set-require-min-compat-client") {
11447 string v;
11448 cmd_getval(cmdmap, "version", v);
11449 ceph_release_t vno = ceph_release_from_name(v);
11450 if (!vno) {
11451 ss << "version " << v << " is not recognized";
11452 err = -EINVAL;
11453 goto reply;
11454 }
11455 OSDMap newmap;
11456 newmap.deepish_copy_from(osdmap);
11457 newmap.apply_incremental(pending_inc);
11458 newmap.require_min_compat_client = vno;
11459 auto mvno = newmap.get_min_compat_client();
11460 if (vno < mvno) {
11461 ss << "osdmap current utilizes features that require " << mvno
11462 << "; cannot set require_min_compat_client below that to " << vno;
11463 err = -EPERM;
11464 goto reply;
11465 }
11466 bool sure = false;
11467 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11468 if (!sure) {
11469 FeatureMap m;
11470 mon.get_combined_feature_map(&m);
11471 uint64_t features = ceph_release_features(to_integer<int>(vno));
11472 bool first = true;
11473 bool ok = true;
11474 for (int type : {
11475 CEPH_ENTITY_TYPE_CLIENT,
11476 CEPH_ENTITY_TYPE_MDS,
11477 CEPH_ENTITY_TYPE_MGR }) {
11478 auto p = m.m.find(type);
11479 if (p == m.m.end()) {
11480 continue;
11481 }
11482 for (auto& q : p->second) {
11483 uint64_t missing = ~q.first & features;
11484 if (missing) {
11485 if (first) {
11486 ss << "cannot set require_min_compat_client to " << v << ": ";
11487 } else {
11488 ss << "; ";
11489 }
11490 first = false;
11491 ss << q.second << " connected " << ceph_entity_type_name(type)
11492 << "(s) look like " << ceph_release_name(
11493 ceph_release_from_features(q.first))
11494 << " (missing 0x" << std::hex << missing << std::dec << ")";
11495 ok = false;
11496 }
11497 }
11498 }
11499 if (!ok) {
11500 ss << "; add --yes-i-really-mean-it to do it anyway";
11501 err = -EPERM;
11502 goto reply;
11503 }
11504 }
11505 ss << "set require_min_compat_client to " << vno;
11506 pending_inc.new_require_min_compat_client = vno;
11507 getline(ss, rs);
11508 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11509 get_last_committed() + 1));
11510 return true;
11511 } else if (prefix == "osd pause") {
11512 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11513
11514 } else if (prefix == "osd unpause") {
11515 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11516
11517 } else if (prefix == "osd set") {
11518 bool sure = false;
11519 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11520
11521 string key;
11522 cmd_getval(cmdmap, "key", key);
11523 if (key == "pause")
11524 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11525 else if (key == "noup")
11526 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11527 else if (key == "nodown")
11528 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11529 else if (key == "noout")
11530 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11531 else if (key == "noin")
11532 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11533 else if (key == "nobackfill")
11534 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11535 else if (key == "norebalance")
11536 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11537 else if (key == "norecover")
11538 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11539 else if (key == "noscrub")
11540 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11541 else if (key == "nodeep-scrub")
11542 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11543 else if (key == "notieragent")
11544 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11545 else if (key == "nosnaptrim")
11546 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11547 else if (key == "pglog_hardlimit") {
11548 if (!osdmap.get_num_up_osds() && !sure) {
11549 ss << "Not advisable to continue since no OSDs are up. Pass "
11550 << "--yes-i-really-mean-it if you really wish to continue.";
11551 err = -EPERM;
11552 goto reply;
11553 }
11554 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11555 // we are reusing a jewel feature bit that was retired in luminous.
11556 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11557 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11558 || sure)) {
11559 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11560 } else {
11561 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11562 err = -EPERM;
11563 goto reply;
11564 }
11565 } else {
11566 ss << "unrecognized flag '" << key << "'";
11567 err = -EINVAL;
11568 }
11569
11570 } else if (prefix == "osd unset") {
11571 string key;
11572 cmd_getval(cmdmap, "key", key);
11573 if (key == "pause")
11574 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11575 else if (key == "noup")
11576 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11577 else if (key == "nodown")
11578 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11579 else if (key == "noout")
11580 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11581 else if (key == "noin")
11582 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11583 else if (key == "nobackfill")
11584 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11585 else if (key == "norebalance")
11586 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11587 else if (key == "norecover")
11588 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11589 else if (key == "noscrub")
11590 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11591 else if (key == "nodeep-scrub")
11592 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11593 else if (key == "notieragent")
11594 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11595 else if (key == "nosnaptrim")
11596 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11597 else {
11598 ss << "unrecognized flag '" << key << "'";
11599 err = -EINVAL;
11600 }
11601
11602 } else if (prefix == "osd require-osd-release") {
11603 string release;
11604 cmd_getval(cmdmap, "release", release);
11605 bool sure = false;
11606 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11607 ceph_release_t rel = ceph_release_from_name(release.c_str());
11608 if (!rel) {
11609 ss << "unrecognized release " << release;
11610 err = -EINVAL;
11611 goto reply;
11612 }
11613 if (rel == osdmap.require_osd_release) {
11614 // idempotent
11615 err = 0;
11616 goto reply;
11617 }
11618 ceph_assert(osdmap.require_osd_release >= ceph_release_t::octopus);
11619 if (!osdmap.get_num_up_osds() && !sure) {
11620 ss << "Not advisable to continue since no OSDs are up. Pass "
11621 << "--yes-i-really-mean-it if you really wish to continue.";
11622 err = -EPERM;
11623 goto reply;
11624 }
11625 if (rel == ceph_release_t::octopus) {
11626 if (!mon.monmap->get_required_features().contains_all(
11627 ceph::features::mon::FEATURE_OCTOPUS)) {
11628 ss << "not all mons are octopus";
11629 err = -EPERM;
11630 goto reply;
11631 }
11632 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11633 && !sure) {
11634 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11635 err = -EPERM;
11636 goto reply;
11637 }
11638 } else if (rel == ceph_release_t::pacific) {
11639 if (!mon.monmap->get_required_features().contains_all(
11640 ceph::features::mon::FEATURE_PACIFIC)) {
11641 ss << "not all mons are pacific";
11642 err = -EPERM;
11643 goto reply;
11644 }
11645 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11646 && !sure) {
11647 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11648 err = -EPERM;
11649 goto reply;
11650 }
11651 } else if (rel == ceph_release_t::quincy) {
11652 if (!mon.monmap->get_required_features().contains_all(
11653 ceph::features::mon::FEATURE_QUINCY)) {
11654 ss << "not all mons are quincy";
11655 err = -EPERM;
11656 goto reply;
11657 }
11658 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
11659 && !sure) {
11660 ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11661 err = -EPERM;
11662 goto reply;
11663 }
11664 } else {
11665 ss << "not supported for this release";
11666 err = -EPERM;
11667 goto reply;
11668 }
11669 if (rel < osdmap.require_osd_release) {
11670 ss << "require_osd_release cannot be lowered once it has been set";
11671 err = -EPERM;
11672 goto reply;
11673 }
11674 pending_inc.new_require_osd_release = rel;
11675 goto update;
11676 } else if (prefix == "osd down" ||
11677 prefix == "osd out" ||
11678 prefix == "osd in" ||
11679 prefix == "osd rm" ||
11680 prefix == "osd stop") {
11681
11682 bool any = false;
11683 bool stop = false;
11684 bool verbose = true;
11685 bool definitely_dead = false;
11686
11687 vector<string> idvec;
11688 cmd_getval(cmdmap, "ids", idvec);
11689 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11690 derr << "definitely_dead " << (int)definitely_dead << dendl;
11691 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11692 set<int> osds;
11693
11694 // wildcard?
11695 if (j == 0 &&
11696 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11697 if (prefix == "osd in") {
11698 // touch out osds only
11699 osdmap.get_out_existing_osds(osds);
11700 } else {
11701 osdmap.get_all_osds(osds);
11702 }
11703 stop = true;
11704 verbose = false; // so the output is less noisy.
11705 } else {
11706 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11707 if (osd < 0) {
11708 ss << "invalid osd id" << osd;
11709 err = -EINVAL;
11710 continue;
11711 } else if (!osdmap.exists(osd)) {
11712 ss << "osd." << osd << " does not exist. ";
11713 continue;
11714 }
11715
11716 osds.insert(osd);
11717 }
11718
11719 for (auto &osd : osds) {
11720 if (prefix == "osd down") {
11721 if (osdmap.is_down(osd)) {
11722 if (verbose)
11723 ss << "osd." << osd << " is already down. ";
11724 } else {
11725 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11726 ss << "marked down osd." << osd << ". ";
11727 any = true;
11728 }
11729 if (definitely_dead) {
11730 if (!pending_inc.new_xinfo.count(osd)) {
11731 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11732 }
11733 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11734 any = true;
11735 }
11736 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11737 }
11738 } else if (prefix == "osd out") {
11739 if (osdmap.is_out(osd)) {
11740 if (verbose)
11741 ss << "osd." << osd << " is already out. ";
11742 } else {
11743 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11744 if (osdmap.osd_weight[osd]) {
11745 if (pending_inc.new_xinfo.count(osd) == 0) {
11746 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11747 }
11748 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11749 }
11750 ss << "marked out osd." << osd << ". ";
11751 std::ostringstream msg;
11752 msg << "Client " << op->get_session()->entity_name
11753 << " marked osd." << osd << " out";
11754 if (osdmap.is_up(osd)) {
11755 msg << ", while it was still marked up";
11756 } else {
11757 auto period = ceph_clock_now() - down_pending_out[osd];
11758 msg << ", after it was down for " << int(period.sec())
11759 << " seconds";
11760 }
11761
11762 mon.clog->info() << msg.str();
11763 any = true;
11764 }
11765 } else if (prefix == "osd in") {
11766 if (osdmap.is_in(osd)) {
11767 if (verbose)
11768 ss << "osd." << osd << " is already in. ";
11769 } else {
11770 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11771 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11772 if (pending_inc.new_xinfo.count(osd) == 0) {
11773 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11774 }
11775 pending_inc.new_xinfo[osd].old_weight = 0;
11776 } else {
11777 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11778 }
11779 ss << "marked in osd." << osd << ". ";
11780 any = true;
11781 }
11782 } else if (prefix == "osd rm") {
11783 err = prepare_command_osd_remove(osd);
11784
11785 if (err == -EBUSY) {
11786 if (any)
11787 ss << ", ";
11788 ss << "osd." << osd << " is still up; must be down before removal. ";
11789 } else {
11790 ceph_assert(err == 0);
11791 if (any) {
11792 ss << ", osd." << osd;
11793 } else {
11794 ss << "removed osd." << osd;
11795 }
11796 any = true;
11797 }
11798 } else if (prefix == "osd stop") {
11799 if (osdmap.is_stop(osd)) {
11800 if (verbose)
11801 ss << "osd." << osd << " is already stopped. ";
11802 } else if (osdmap.is_down(osd)) {
11803 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11804 ss << "stop down osd." << osd << ". ";
11805 any = true;
11806 } else {
11807 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11808 ss << "stop osd." << osd << ". ";
11809 any = true;
11810 }
11811 }
11812 }
11813 }
11814 if (any) {
11815 getline(ss, rs);
11816 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11817 get_last_committed() + 1));
11818 return true;
11819 }
11820 } else if (prefix == "osd set-group" ||
11821 prefix == "osd unset-group" ||
11822 prefix == "osd add-noup" ||
11823 prefix == "osd add-nodown" ||
11824 prefix == "osd add-noin" ||
11825 prefix == "osd add-noout" ||
11826 prefix == "osd rm-noup" ||
11827 prefix == "osd rm-nodown" ||
11828 prefix == "osd rm-noin" ||
11829 prefix == "osd rm-noout") {
11830 bool do_set = prefix == "osd set-group" ||
11831 prefix.find("add") != string::npos;
11832 string flag_str;
11833 unsigned flags = 0;
11834 vector<string> who;
11835 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11836 cmd_getval(cmdmap, "flags", flag_str);
11837 cmd_getval(cmdmap, "who", who);
11838 vector<string> raw_flags;
11839 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11840 for (auto& f : raw_flags) {
11841 if (f == "noup")
11842 flags |= CEPH_OSD_NOUP;
11843 else if (f == "nodown")
11844 flags |= CEPH_OSD_NODOWN;
11845 else if (f == "noin")
11846 flags |= CEPH_OSD_NOIN;
11847 else if (f == "noout")
11848 flags |= CEPH_OSD_NOOUT;
11849 else {
11850 ss << "unrecognized flag '" << f << "', must be one of "
11851 << "{noup,nodown,noin,noout}";
11852 err = -EINVAL;
11853 goto reply;
11854 }
11855 }
11856 } else {
11857 cmd_getval(cmdmap, "ids", who);
11858 if (prefix.find("noup") != string::npos)
11859 flags = CEPH_OSD_NOUP;
11860 else if (prefix.find("nodown") != string::npos)
11861 flags = CEPH_OSD_NODOWN;
11862 else if (prefix.find("noin") != string::npos)
11863 flags = CEPH_OSD_NOIN;
11864 else if (prefix.find("noout") != string::npos)
11865 flags = CEPH_OSD_NOOUT;
11866 else
11867 ceph_assert(0 == "Unreachable!");
11868 }
11869 if (flags == 0) {
11870 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11871 err = -EINVAL;
11872 goto reply;
11873 }
11874 if (who.empty()) {
11875 ss << "must specify at least one or more targets to set/unset";
11876 err = -EINVAL;
11877 goto reply;
11878 }
11879 set<int> osds;
11880 set<int> crush_nodes;
11881 set<int> device_classes;
11882 for (auto& w : who) {
11883 if (w == "any" || w == "all" || w == "*") {
11884 osdmap.get_all_osds(osds);
11885 break;
11886 }
11887 std::stringstream ts;
11888 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11889 osds.insert(osd);
11890 } else if (osdmap.crush->name_exists(w)) {
11891 crush_nodes.insert(osdmap.crush->get_item_id(w));
11892 } else if (osdmap.crush->class_exists(w)) {
11893 device_classes.insert(osdmap.crush->get_class_id(w));
11894 } else {
11895 ss << "unable to parse osd id or crush node or device class: "
11896 << "\"" << w << "\". ";
11897 }
11898 }
11899 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11900 // ss has reason for failure
11901 err = -EINVAL;
11902 goto reply;
11903 }
11904 bool any = false;
11905 for (auto osd : osds) {
11906 if (!osdmap.exists(osd)) {
11907 ss << "osd." << osd << " does not exist. ";
11908 continue;
11909 }
11910 if (do_set) {
11911 if (flags & CEPH_OSD_NOUP) {
11912 any |= osdmap.is_noup_by_osd(osd) ?
11913 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11914 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11915 }
11916 if (flags & CEPH_OSD_NODOWN) {
11917 any |= osdmap.is_nodown_by_osd(osd) ?
11918 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11919 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11920 }
11921 if (flags & CEPH_OSD_NOIN) {
11922 any |= osdmap.is_noin_by_osd(osd) ?
11923 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11924 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11925 }
11926 if (flags & CEPH_OSD_NOOUT) {
11927 any |= osdmap.is_noout_by_osd(osd) ?
11928 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11929 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11930 }
11931 } else {
11932 if (flags & CEPH_OSD_NOUP) {
11933 any |= osdmap.is_noup_by_osd(osd) ?
11934 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11935 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11936 }
11937 if (flags & CEPH_OSD_NODOWN) {
11938 any |= osdmap.is_nodown_by_osd(osd) ?
11939 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11940 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11941 }
11942 if (flags & CEPH_OSD_NOIN) {
11943 any |= osdmap.is_noin_by_osd(osd) ?
11944 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11945 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11946 }
11947 if (flags & CEPH_OSD_NOOUT) {
11948 any |= osdmap.is_noout_by_osd(osd) ?
11949 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11950 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11951 }
11952 }
11953 }
11954 for (auto& id : crush_nodes) {
11955 auto old_flags = osdmap.get_crush_node_flags(id);
11956 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11957 pending_flags |= old_flags; // adopt existing flags first!
11958 if (do_set) {
11959 pending_flags |= flags;
11960 } else {
11961 pending_flags &= ~flags;
11962 }
11963 any = true;
11964 }
11965 for (auto& id : device_classes) {
11966 auto old_flags = osdmap.get_device_class_flags(id);
11967 auto& pending_flags = pending_inc.new_device_class_flags[id];
11968 pending_flags |= old_flags;
11969 if (do_set) {
11970 pending_flags |= flags;
11971 } else {
11972 pending_flags &= ~flags;
11973 }
11974 any = true;
11975 }
11976 if (any) {
11977 getline(ss, rs);
11978 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11979 get_last_committed() + 1));
11980 return true;
11981 }
  } else if (prefix == "osd pg-temp") {
    // Manually set (or, with an empty id list, clear) the pg_temp
    // acting-set override for a single PG.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // an update for this pg is already queued in the pending incremental;
    // retry the command after that proposal commits
    if (pending_inc.new_pg_temp.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty())  {
      // empty list == remove the pg_temp entry
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    // the override must still satisfy the pool's min_size..size bounds
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Manually set the primary_temp override for a PG (-1 clears it by
    // mapping to "no override").
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 is the "clear override" sentinel and is allowed to not exist
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // primary_temp requires clients that understand it (firefly+)
    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by publishing a perturbed pg_temp mapping;
    // the OSDs will then converge back to the crush-derived mapping.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change.  Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
  } else if (prefix == "osd pg-upmap" ||
             prefix == "osd rm-pg-upmap" ||
             prefix == "osd pg-upmap-items" ||
             prefix == "osd rm-pg-upmap-items") {
    // Common validation for all four pg-upmap variants: client
    // compatibility, cluster feature support, and pgid sanity.
    if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
      ss << "min_compat_client "
         << osdmap.require_min_compat_client
         << " < luminous, which is required for pg-upmap. "
         << "Try 'ceph osd set-require-min-compat-client luminous' "
         << "before using the new interface";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // the pool is queued for deletion in the pending incremental: reply
    // -ENOENT once that proposal is committed
    if (pending_inc.old_pools.count(pgid.pool())) {
      ss << "pool of " << pgid << " is pending removal";
      err = -ENOENT;
      getline(ss, rs);
      wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
      return true;
    }

    // map the command prefix to an internal op code
    enum {
      OP_PG_UPMAP,
      OP_RM_PG_UPMAP,
      OP_PG_UPMAP_ITEMS,
      OP_RM_PG_UPMAP_ITEMS,
    } option;

    if (prefix == "osd pg-upmap") {
      option = OP_PG_UPMAP;
    } else if (prefix == "osd rm-pg-upmap") {
      option = OP_RM_PG_UPMAP;
    } else if (prefix == "osd pg-upmap-items") {
      option = OP_PG_UPMAP_ITEMS;
    } else {
      option = OP_RM_PG_UPMAP_ITEMS;
    }

    // check pending upmap changes: if this pg already has a queued
    // upmap (or upmap-items) change, retry after it commits rather
    // than stacking conflicting updates into one incremental
    switch (option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP:
      if (pending_inc.new_pg_upmap.count(pgid) ||
          pending_inc.old_pg_upmap.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS:
      if (pending_inc.new_pg_upmap_items.count(pgid) ||
          pending_inc.old_pg_upmap_items.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    default:
      ceph_abort_msg("invalid option");
    }

    // execute the selected upmap operation
    switch (option) {
    case OP_PG_UPMAP:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // the explicit mapping must respect the pool's min_size..size bounds
        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
        if ((int)id_vec.size() < pool_min_size) {
          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
             << pool_min_size << ")";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)id_vec.size() > pool_size) {
          ss << "num of osds (" << id_vec.size() <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<int32_t> new_pg_upmap;
        for (auto osd : id_vec) {
          // CRUSH_ITEM_NONE ("hole") is allowed; everything else must exist
          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
            ss << "osd." << osd << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          // silently drop duplicate osd ids (with a note in the reply)
          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
          if (it != new_pg_upmap.end()) {
            ss << "osd." << osd << " already exists, ";
            continue;
          }
          new_pg_upmap.push_back(osd);
        }

        if (new_pg_upmap.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
      }
      break;

    case OP_RM_PG_UPMAP:
      {
        pending_inc.old_pg_upmap.insert(pgid);
        ss << "clear " << pgid << " pg_upmap mapping";
      }
      break;

    case OP_PG_UPMAP_ITEMS:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // ids come flattened as from,to pairs; an odd count is malformed
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        ostringstream items;
        items << "[";
        // p advances twice per iteration: once here (from), once by ++p (to);
        // safe because size was verified to be even above
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
          int from = *p++;
          int to = *p;
          if (from == to) {
            ss << "from osd." << from << " == to osd." << to << ", ";
            continue;
          }
          if (!osdmap.exists(from)) {
            ss << "osd." << from << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
            ss << "osd." << to << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          pair<int32_t,int32_t> entry = make_pair(from, to);
          auto it = std::find(new_pg_upmap_items.begin(),
            new_pg_upmap_items.end(), entry);
          if (it != new_pg_upmap_items.end()) {
            ss << "osd." << from << " -> osd." << to << " already exists, ";
            continue;
          }
          new_pg_upmap_items.push_back(entry);
          items << from << "->" << to << ",";
        }
        string out(items.str());
        // drop last ',' -- if every pair was skipped this leaves "",
        // but then the empty check below replies before 'out' is used
        out.resize(out.size() - 1);
        out += "]";

        if (new_pg_upmap_items.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap_items[pgid] =
          mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
      }
      break;

    case OP_RM_PG_UPMAP_ITEMS:
      {
        pending_inc.old_pg_upmap_items.insert(pgid);
        ss << "clear " << pgid << " pg_upmap_items mapping";
      }
      break;

    default:
      ceph_abort_msg("invalid option");
    }

    goto update;
12368 } else if (prefix == "osd primary-affinity") {
12369 int64_t id;
12370 if (!cmd_getval(cmdmap, "id", id)) {
12371 ss << "invalid osd id value '"
12372 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12373 err = -EINVAL;
12374 goto reply;
12375 }
12376 double w;
12377 if (!cmd_getval(cmdmap, "weight", w)) {
12378 ss << "unable to parse 'weight' value '"
12379 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12380 err = -EINVAL;
12381 goto reply;
12382 }
12383 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12384 if (ww < 0L) {
12385 ss << "weight must be >= 0";
12386 err = -EINVAL;
12387 goto reply;
12388 }
12389 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12390 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12391 ss << "require_min_compat_client "
12392 << osdmap.require_min_compat_client
12393 << " < firefly, which is required for primary-affinity";
12394 err = -EPERM;
12395 goto reply;
12396 }
12397 if (osdmap.exists(id)) {
12398 pending_inc.new_primary_affinity[id] = ww;
12399 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12400 getline(ss, rs);
12401 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12402 get_last_committed() + 1));
12403 return true;
12404 } else {
12405 ss << "osd." << id << " does not exist";
12406 err = -ENOENT;
12407 goto reply;
12408 }
  } else if (prefix == "osd reweight") {
    // Set the (override) reweight of one osd; w in [0,1] is scaled to the
    // fixed-point CEPH_OSD_IN range.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      // report both the requested weight and the raw fixed-point value
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    }
  } else if (prefix == "osd reweightn") {
    // Batch form of "osd reweight": parse a {id: weight} map and queue
    // all new weights at once.
    map<int32_t, uint32_t> weights;
    err = parse_reweights(cct, cmdmap, osdmap, &weights);
    if (err) {
      ss << "unable to parse 'weights' value '"
         << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
      goto reply;
    }
    pending_inc.new_weight.insert(weights.begin(), weights.end());
    // NOTE(review): unlike the other branches, rs is passed without a
    // getline(ss, rs), so the reply string is whatever rs already held
    // (presumably empty) -- confirm this is intended
    wait_for_finished_proposal(
      op,
      new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
    return true;
  } else if (prefix == "osd lost") {
    // Declare a down osd permanently lost as of the epoch it went down;
    // requires explicit --yes-i-really-mean-it confirmation.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "are you SURE?  this might mean real, permanent data loss.  pass "
            "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    } else if (!osdmap.is_down(id)) {
      // only a down osd can be marked lost
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply;
    } else {
      // record the loss at the epoch the osd was last marked down
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    }

  } else if (prefix == "osd destroy-actual" ||
             prefix == "osd purge-actual" ||
             prefix == "osd purge-new") {
    /* Destroying an OSD means that we don't expect to further make use of
     * the OSDs data (which may even become unreadable after this operation),
     * and that we are okay with scrubbing all its cephx keys and config-key
     * data (which may include lockbox keys, thus rendering the osd's data
     * unreadable).
     *
     * The OSD will not be removed. Instead, we will mark it as destroyed,
     * such that a subsequent call to `create` will not reuse the osd id.
     * This will play into being able to recreate the OSD, at the same
     * crush location, with minimal data movement.
     */

    // make sure authmon is writeable.
    if (!mon.authmon()->is_writeable()) {
      dout(10) << __func__ << " waiting for auth mon to be writeable for "
               << "osd destroy" << dendl;
      mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
      return false;
    }

    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      auto p = cmdmap.find("id");
      if (p == cmdmap.end()) {
        ss << "no osd id specified";
      } else {
        ss << "unable to parse osd id value '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "";
      }
      err = -EINVAL;
      goto reply;
    }

    bool is_destroy = (prefix == "osd destroy-actual");
    if (!is_destroy) {
      // only three prefixes reach this branch; anything else is a bug
      ceph_assert("osd purge-actual" == prefix ||
                  "osd purge-new" == prefix);
    }

    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "Are you SURE?  Did you verify with 'ceph osd safe-to-destroy'?  "
         << "This will mean real, permanent data loss, as well "
         << "as deletion of cephx and lockbox keys. "
         << "Pass --yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = 0; // idempotent
      goto reply;
    } else if (osdmap.is_up(id)) {
      ss << "osd." << id << " is not `down`.";
      err = -EBUSY;
      goto reply;
    } else if (is_destroy && osdmap.is_destroyed(id)) {
      ss << "destroyed osd." << id;
      err = 0;
      goto reply;
    }

    // purge-new may only remove osds that are still in the NEW state
    if (prefix == "osd purge-new" &&
        (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      ss << "osd." << id << " is not new";
      err = -EPERM;
      goto reply;
    }

    bool goto_reply = false;

    // plug paxos so the osd/auth/kv updates below land in one proposal --
    // presumably to keep the multi-service change atomic; confirm against
    // prepare_command_osd_destroy/purge
    paxos.plug();
    if (is_destroy) {
      err = prepare_command_osd_destroy(id, ss);
      // we checked above that it should exist.
      ceph_assert(err != -ENOENT);
    } else {
      err = prepare_command_osd_purge(id, ss);
      if (err == -ENOENT) {
        err = 0;
        ss << "osd." << id << " does not exist.";
        goto_reply = true;
      }
    }
    paxos.unplug();

    if (err < 0 || goto_reply) {
      goto reply;
    }

    if (is_destroy) {
      ss << "destroyed osd." << id;
    } else {
      ss << "purged osd." << id;
    }

    getline(ss, rs);
    wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
    force_immediate_propose();
    return true;

  } else if (prefix == "osd new") {
    // Create (or reuse) an osd id, with optional cephx/lockbox secrets
    // supplied as a JSON payload in the message data.

    // make sure authmon is writeable.
    if (!mon.authmon()->is_writeable()) {
      dout(10) << __func__ << " waiting for auth mon to be writeable for "
               << "osd new" << dendl;
      mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
      return false;
    }

    // make sure kvmon is writeable.
    if (!mon.kvmon()->is_writeable()) {
      dout(10) << __func__ << " waiting for kv mon to be writeable for "
               << "osd new" << dendl;
      mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
      return false;
    }

    map<string,string> param_map;

    bufferlist bl = m->get_data();
    string param_json = bl.to_str();
    dout(20) << __func__ << " osd new json = " << param_json << dendl;

    err = get_json_str_map(param_json, ss, &param_map);
    if (err < 0)
      goto reply;

    dout(20) << __func__ << " osd new params " << param_map << dendl;

    // plug paxos so the osd/auth/kv updates are proposed together
    paxos.plug();
    err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
    paxos.unplug();

    if (err < 0) {
      goto reply;
    }

    if (f) {
      f->flush(rdata);
    } else {
      rdata.append(ss);
    }

    // positive EEXIST signals "already done" -- reply success without
    // proposing anything
    if (err == EEXIST) {
      // idempotent operation
      err = 0;
      goto reply;
    }

    wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, 0, rs, rdata,
                               get_last_committed() + 1));
    force_immediate_propose();
    return true;

  } else if (prefix == "osd create") {
    // Legacy osd id allocation.  An explicit id is honored only when a
    // uuid is also supplied (to keep the operation idempotent).

    // optional id provided?
    int64_t id = -1, cmd_id = -1;
    if (cmd_getval(cmdmap, "id", cmd_id)) {
      if (cmd_id < 0) {
        ss << "invalid osd id value '" << cmd_id << "'";
        err = -EINVAL;
        goto reply;
      }
      dout(10) << " osd create got id " << cmd_id << dendl;
    }

    uuid_d uuid;
    string uuidstr;
    if (cmd_getval(cmdmap, "uuid", uuidstr)) {
      if (!uuid.parse(uuidstr.c_str())) {
        ss << "invalid uuid value '" << uuidstr << "'";
        err = -EINVAL;
        goto reply;
      }
      // we only care about the id if we also have the uuid, to
      // ensure the operation's idempotency.
      id = cmd_id;
    }

    int32_t new_id = -1;
    err = prepare_command_osd_create(id, uuid, &new_id, ss);
    if (err < 0) {
      if (err == -EAGAIN) {
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      // a check has failed; reply to the user.
      goto reply;

    } else if (err == EEXIST) {
      // this is an idempotent operation; we can go ahead and reply.
      if (f) {
        f->open_object_section("created_osd");
        f->dump_int("osdid", new_id);
        f->close_section();
        f->flush(rdata);
      } else {
        ss << new_id;
        rdata.append(ss);
      }
      err = 0;
      goto reply;
    }

    string empty_device_class;
    do_osd_create(id, uuid, empty_device_class, &new_id);

    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", new_id);
      f->close_section();
      f->flush(rdata);
    } else {
      ss << new_id;
      rdata.append(ss);
    }
    wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, 0, rs, rdata,
                               get_last_committed() + 1));
    return true;

  } else if (prefix == "osd blocklist clear" ||
             prefix == "osd blacklist clear") {
    // Remove every committed blocklist entry (plain and range) and drop
    // any not-yet-committed additions.
    // NOTE(review): only new_blocklist is cleared here, not any pending
    // new_range_blocklist additions -- confirm whether that is intended.
    pending_inc.new_blocklist.clear();
    std::list<std::pair<entity_addr_t,utime_t > > blocklist;
    std::list<std::pair<entity_addr_t,utime_t > > range_b;
    osdmap.get_blocklist(&blocklist, &range_b);
    for (const auto &entry : blocklist) {
      pending_inc.old_blocklist.push_back(entry.first);
    }
    for (const auto &entry : range_b) {
      pending_inc.old_range_blocklist.push_back(entry.first);
    }
    ss << " removed all blocklist entries";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
      // "osd blocklist [range] add|rm <addr> [<expire>]": add or remove a
      // single address (or, with "range", a CIDR range) from the blocklist.
12734 } else if (prefix == "osd blocklist" ||
12735 prefix == "osd blacklist") {
12736 string addrstr, rangestr;
12737 bool range = false;
12738 cmd_getval(cmdmap, "addr", addrstr);
12739 if (cmd_getval(cmdmap, "range", rangestr)) {
12740 if (rangestr == "range") {
12741 range = true;
12742 } else {
12743 ss << "Did you mean to specify \"osd blocklist range\"?";
12744 err = -EINVAL;
12745 goto reply;
12746 }
12747 }
12748 entity_addr_t addr;
12749 if (!addr.parse(addrstr)) {
12750 ss << "unable to parse address " << addrstr;
12751 err = -EINVAL;
12752 goto reply;
12753 }
12754 else {
12755 if (range) {
      // Range entries must parse as CIDR, the whole cluster must support
      // range blocklists, and the prefix length (stored in the nonce) must
      // fit the address family.
12756 if (!addr.maybe_cidr()) {
12757 ss << "You specified a range command, but " << addr
12758 << " does not parse as a CIDR range";
12759 err = -EINVAL;
12760 goto reply;
12761 }
12762 addr.type = entity_addr_t::TYPE_CIDR;
12763 err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
12764 if (err) {
12765 goto reply;
12766 }
12767 if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
12768 (addr.is_ipv6() && addr.get_nonce() > 128)) {
12769 ss << "Too many bits in range for that protocol!";
12770 err = -EINVAL;
12771 goto reply;
12772 }
12773 } else {
12774 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12775 // always blocklist type ANY
12776 addr.set_type(entity_addr_t::TYPE_ANY);
12777 } else {
12778 addr.set_type(entity_addr_t::TYPE_LEGACY);
12779 }
12780 }
12781
      // "blacklistop" is accepted as a legacy alias for "blocklistop".
12782 string blocklistop;
12783 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12784 cmd_getval(cmdmap, "blacklistop", blocklistop);
12785 }
12786 if (blocklistop == "add") {
12787 utime_t expires = ceph_clock_now();
12788 // default one hour
12789 double d = cmd_getval_or<double>(cmdmap, "expire",
12790 g_conf()->mon_osd_blocklist_default_expire);
12791 expires += d;
12792
      // Record the addition in the pending increment; if an un-blocklist
      // of the same address was queued earlier in this epoch, drop it so
      // the add wins.
12793 auto add_to_pending_blocklists = [](auto& nb, auto& ob,
12794 const auto& addr,
12795 const auto& expires) {
12796 nb[addr] = expires;
12797 // cancel any pending un-blocklisting request too
12798 auto it = std::find(ob.begin(),
12799 ob.end(), addr);
12800 if (it != ob.end()) {
12801 ob.erase(it);
12802 }
12803 };
12804 if (range) {
12805 add_to_pending_blocklists(pending_inc.new_range_blocklist,
12806 pending_inc.old_range_blocklist,
12807 addr, expires);
12808
12809 } else {
12810 add_to_pending_blocklists(pending_inc.new_blocklist,
12811 pending_inc.old_blocklist,
12812 addr, expires);
12813 }
12814
12815 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12816 getline(ss, rs);
12817 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12818 get_last_committed() + 1));
12819 return true;
12820 } else if (blocklistop == "rm") {
      // Remove: either the entry is committed (queue its removal) or it is
      // still only pending in this increment (erase the pending add).
12821 auto rm_from_pending_blocklists = [](const auto& addr,
12822 auto& blocklist,
12823 auto& ob, auto& pb) {
12824 if (blocklist.count(addr)) {
12825 ob.push_back(addr);
12826 return true;
12827 } else if (pb.count(addr)) {
12828 pb.erase(addr);
12829 return true;
12830 }
12831 return false;
12832 };
12833 if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
12834 pending_inc.old_blocklist,
12835 pending_inc.new_blocklist)) ||
12836 (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
12837 pending_inc.old_range_blocklist,
12838 pending_inc.new_range_blocklist))) {
12839 ss << "un-blocklisting " << addr;
12840 getline(ss, rs);
12841 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12842 get_last_committed() + 1));
12843 return true;
12844 }
      // not found anywhere: success (err = 0), nothing to do
12845 ss << addr << " isn't blocklisted";
12846 err = 0;
12847 goto reply;
12848 }
12849 }
      // "osd pool mksnap <pool> <snap>": create a managed (pool-level)
      // snapshot. Rejected for pools already using unmanaged (self-managed)
      // snaps and for cache tiers; re-creating an existing snap is a no-op.
12850 } else if (prefix == "osd pool mksnap") {
12851 string poolstr;
12852 cmd_getval(cmdmap, "pool", poolstr);
12853 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12854 if (pool < 0) {
12855 ss << "unrecognized pool '" << poolstr << "'";
12856 err = -ENOENT;
12857 goto reply;
12858 }
12859 string snapname;
12860 cmd_getval(cmdmap, "snap", snapname);
12861 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12862 if (p->is_unmanaged_snaps_mode()) {
12863 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12864 err = -EINVAL;
12865 goto reply;
12866 } else if (p->snap_exists(snapname.c_str())) {
      // idempotent: snap already present in the committed map
12867 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12868 err = 0;
12869 goto reply;
12870 } else if (p->is_tier()) {
12871 ss << "pool " << poolstr << " is a cache tier";
12872 err = -EINVAL;
12873 goto reply;
12874 }
      // copy-on-write: work on the pending copy of the pool, seeding it
      // from the committed pool if this is its first change this epoch
12875 pg_pool_t *pp = 0;
12876 if (pending_inc.new_pools.count(pool))
12877 pp = &pending_inc.new_pools[pool];
12878 if (!pp) {
12879 pp = &pending_inc.new_pools[pool];
12880 *pp = *p;
12881 }
      // re-check against the pending copy in case the snap was added
      // earlier in this same increment
12882 if (pp->snap_exists(snapname.c_str())) {
12883 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12884 } else {
12885 pp->add_snap(snapname.c_str(), ceph_clock_now());
12886 pp->set_snap_epoch(pending_inc.epoch);
12887 ss << "created pool " << poolstr << " snap " << snapname;
12888 }
12889 getline(ss, rs);
12890 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12891 get_last_committed() + 1));
12892 return true;
      // "osd pool rmsnap <pool> <snap>": remove a managed snapshot.
      // Removing a snap that does not exist is treated as success.
12893 } else if (prefix == "osd pool rmsnap") {
12894 string poolstr;
12895 cmd_getval(cmdmap, "pool", poolstr);
12896 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12897 if (pool < 0) {
12898 ss << "unrecognized pool '" << poolstr << "'";
12899 err = -ENOENT;
12900 goto reply;
12901 }
12902 string snapname;
12903 cmd_getval(cmdmap, "snap", snapname);
12904 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12905 if (p->is_unmanaged_snaps_mode()) {
12906 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12907 err = -EINVAL;
12908 goto reply;
12909 } else if (!p->snap_exists(snapname.c_str())) {
      // idempotent: already gone from the committed map
12910 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12911 err = 0;
12912 goto reply;
12913 }
      // copy-on-write pending copy of the pool (see mksnap above)
12914 pg_pool_t *pp = 0;
12915 if (pending_inc.new_pools.count(pool))
12916 pp = &pending_inc.new_pools[pool];
12917 if (!pp) {
12918 pp = &pending_inc.new_pools[pool];
12919 *pp = *p;
12920 }
      // snap_exists() returns the snapid (0 if absent) on the pending copy
12921 snapid_t sn = pp->snap_exists(snapname.c_str());
12922 if (sn) {
12923 pp->remove_snap(sn);
12924 pp->set_snap_epoch(pending_inc.epoch);
12925 ss << "removed pool " << poolstr << " snap " << snapname;
12926 } else {
12927 ss << "already removed pool " << poolstr << " snap " << snapname;
12928 }
12929 getline(ss, rs);
12930 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12931 get_last_committed() + 1));
12932 return true;
      // "osd pool create": create a replicated or erasure-coded pool.
      // Validates pg counts, pool type, crush rule, EC profile, and
      // filestore-specific constraints before delegating to
      // prepare_new_pool().
12933 } else if (prefix == "osd pool create") {
12934 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
12935 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
12936 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
12937 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
12938 string pool_type_str;
12939 cmd_getval(cmdmap, "pool_type", pool_type_str);
12940 if (pool_type_str.empty())
12941 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12942
      // Idempotency: creating an existing pool of the same type succeeds;
      // asking for a different type is an error.
12943 string poolstr;
12944 cmd_getval(cmdmap, "pool", poolstr);
12945 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12946 if (pool_id >= 0) {
12947 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12948 if (pool_type_str != p->get_type_name()) {
12949 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12950 err = -EINVAL;
12951 } else {
12952 ss << "pool '" << poolstr << "' already exists";
12953 err = 0;
12954 }
12955 goto reply;
12956 }
12957
12958 int pool_type;
12959 if (pool_type_str == "replicated") {
12960 pool_type = pg_pool_t::TYPE_REPLICATED;
12961 } else if (pool_type_str == "erasure") {
12962 pool_type = pg_pool_t::TYPE_ERASURE;
12963 } else {
12964 ss << "unknown pool type '" << pool_type_str << "'";
12965 err = -EINVAL;
12966 goto reply;
12967 }
12968
12969 bool implicit_rule_creation = false;
12970 int64_t expected_num_objects = 0;
12971 string rule_name;
12972 cmd_getval(cmdmap, "rule", rule_name);
12973 string erasure_code_profile;
12974 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12975
12976 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12977 if (erasure_code_profile == "")
12978 erasure_code_profile = "default";
12979 //handle the erasure code profile
12980 if (erasure_code_profile == "default") {
      // If the default EC profile is not committed yet, stage it in the
      // pending increment and wait for the proposal before proceeding.
12981 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12982 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12983 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12984 goto wait;
12985 }
12986
12987 map<string,string> profile_map;
12988 err = osdmap.get_erasure_code_profile_default(cct,
12989 profile_map,
12990 &ss);
12991 if (err)
12992 goto reply;
12993 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12994 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12995 goto wait;
12996 }
12997 }
12998 if (rule_name == "") {
12999 implicit_rule_creation = true;
13000 if (erasure_code_profile == "default") {
13001 rule_name = "erasure-code";
13002 } else {
13003 dout(1) << "implicitly use rule named after the pool: "
13004 << poolstr << dendl;
13005 rule_name = poolstr;
13006 }
13007 }
13008 expected_num_objects =
13009 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13010 } else {
13011 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13012 // and put expected_num_objects to rule field
13013 if (erasure_code_profile != "") { // cmd is from CLI
13014 if (rule_name != "") {
      // positional shuffle (see NOTE above): "rule" actually holds the
      // expected_num_objects value on this CLI path
13015 string interr;
13016 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
13017 if (interr.length()) {
13018 ss << "error parsing integer value '" << rule_name << "': " << interr;
13019 err = -EINVAL;
13020 goto reply;
13021 }
13022 }
13023 rule_name = erasure_code_profile;
13024 } else { // cmd is well-formed
13025 expected_num_objects =
13026 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13027 }
13028 }
13029
      // Resolve an explicitly-named crush rule now; implicit rules are
      // created later by prepare_new_pool().
13030 if (!implicit_rule_creation && rule_name != "") {
13031 int rule;
13032 err = get_crush_rule(rule_name, &rule, &ss);
13033 if (err == -EAGAIN) {
13034 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13035 return true;
13036 }
13037 if (err)
13038 goto reply;
13039 }
13040
13041 if (expected_num_objects < 0) {
13042 ss << "'expected_num_objects' must be non-negative";
13043 err = -EINVAL;
13044 goto reply;
13045 }
13046
      // The expected_num_objects/filestore_merge_threshold constraints only
      // matter if at least one OSD in the cluster runs filestore.
13047 set<int32_t> osds;
13048 osdmap.get_all_osds(osds);
13049 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
13050 string type;
13051 if (!get_osd_objectstore_type(osd, &type)) {
13052 return type == "filestore";
13053 } else {
13054 return false;
13055 }
13056 });
13057
13058 if (has_filestore_osd &&
13059 expected_num_objects > 0 &&
13060 cct->_conf->filestore_merge_threshold > 0) {
13061 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13062 err = -EINVAL;
13063 goto reply;
13064 }
13065
13066 if (has_filestore_osd &&
13067 expected_num_objects == 0 &&
13068 cct->_conf->filestore_merge_threshold < 0) {
      // Warn (and require --yes-i-really-mean-it) for large pools created
      // without expected_num_objects when pre-splitting is configured.
13069 int osds = osdmap.get_num_osds();
13070 bool sure = false;
13071 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13072 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
13073 ss << "For better initial performance on pools expected to store a "
13074 << "large number of objects, consider supplying the "
13075 << "expected_num_objects parameter when creating the pool."
13076 << " Pass --yes-i-really-mean-it to ignore it";
13077 err = -EPERM;
13078 goto reply;
13079 }
13080 }
13081
      // fast_read: -1 = pool-type default, 0 = off, >0 = on
13082 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
13083 FastReadType fast_read = FAST_READ_DEFAULT;
13084 if (fast_read_param == 0)
13085 fast_read = FAST_READ_OFF;
13086 else if (fast_read_param > 0)
13087 fast_read = FAST_READ_ON;
13088
13089 int64_t repl_size = 0;
13090 cmd_getval(cmdmap, "size", repl_size);
13091 int64_t target_size_bytes = 0;
13092 double target_size_ratio = 0.0;
13093 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
13094 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
13095
13096 string pg_autoscale_mode;
13097 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
13098
13099 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
      // All validation done; stage the new pool in the pending increment.
13100 err = prepare_new_pool(poolstr,
13101 -1, // default crush rule
13102 rule_name,
13103 pg_num, pgp_num, pg_num_min, pg_num_max,
13104 repl_size, target_size_bytes, target_size_ratio,
13105 erasure_code_profile, pool_type,
13106 (uint64_t)expected_num_objects,
13107 fast_read,
13108 pg_autoscale_mode,
13109 bulk,
13110 &ss);
13111 if (err < 0) {
13112 switch(err) {
13113 case -EEXIST:
13114 ss << "pool '" << poolstr << "' already exists";
13115 break;
13116 case -EAGAIN:
13117 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13118 return true;
13119 case -ERANGE:
13120 goto reply;
13121 default:
13122 goto reply;
13123 break;
13124 }
13125 } else {
13126 ss << "pool '" << poolstr << "' created";
13127 }
13128 getline(ss, rs);
13129 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13130 get_last_committed() + 1));
13131 return true;
13132
      // "osd pool delete|rm": destroy a pool. Safety interlock requires the
      // pool name to be given twice plus --yes-i-really-really-mean-it.
13133 } else if (prefix == "osd pool delete" ||
13134 prefix == "osd pool rm") {
13135 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13136 string poolstr, poolstr2, sure;
13137 cmd_getval(cmdmap, "pool", poolstr);
13138 cmd_getval(cmdmap, "pool2", poolstr2);
13139 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13140 if (pool < 0) {
      // idempotent: deleting a non-existent pool succeeds
13141 ss << "pool '" << poolstr << "' does not exist";
13142 err = 0;
13143 goto reply;
13144 }
13145
      // two flag variants: the normal one, and a test-only "not faking" one
13146 bool force_no_fake = false;
13147 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13148 bool force = false;
13149 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13150 if (poolstr2 != poolstr ||
13151 (!force && !force_no_fake)) {
13152 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13153 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13154 << "followed by --yes-i-really-really-mean-it.";
13155 err = -EPERM;
13156 goto reply;
13157 }
13158 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13159 if (err == -EAGAIN) {
13160 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13161 return true;
13162 }
13163 if (err < 0)
13164 goto reply;
13165 goto update;
      // "osd pool rename <src> <dst>": rename a pool, with idempotency
      // handling when the rename appears to have already happened.
13166 } else if (prefix == "osd pool rename") {
13167 string srcpoolstr, destpoolstr;
13168 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13169 cmd_getval(cmdmap, "destpool", destpoolstr);
13170 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13171 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13172
13173 if (pool_src < 0) {
13174 if (pool_dst >= 0) {
13175 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13176 // of operations, assume this rename succeeded, as it is not changing
13177 // the current state. Make sure we output something understandable
13178 // for whoever is issuing the command, if they are paying attention,
13179 // in case it was not intentional; or to avoid a "wtf?" and a bug
13180 // report in case it was intentional, while expecting a failure.
13181 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13182 << destpoolstr << "' does -- assuming successful rename";
13183 err = 0;
13184 } else {
13185 ss << "unrecognized pool '" << srcpoolstr << "'";
13186 err = -ENOENT;
13187 }
13188 goto reply;
13189 } else if (pool_dst >= 0) {
13190 // source pool exists and so does the destination pool
13191 ss << "pool '" << destpoolstr << "' already exists";
13192 err = -EEXIST;
13193 goto reply;
13194 }
13195
      // Note: ret (not err) is passed to C_Command below, so a failed
      // rename still replies with the underlying error code.
13196 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13197 if (ret == 0) {
13198 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13199 } else {
13200 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13201 << cpp_strerror(ret);
13202 }
13203 getline(ss, rs);
13204 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13205 get_last_committed() + 1));
13206 return true;
13207
      // "osd pool set": thin wrapper that delegates all option parsing and
      // validation to prepare_command_pool_set().
13208 } else if (prefix == "osd pool set") {
13209 err = prepare_command_pool_set(cmdmap, ss);
13210 if (err == -EAGAIN)
13211 goto wait;
13212 if (err < 0)
13213 goto reply;
13214
13215 getline(ss, rs);
13216 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13217 get_last_committed() + 1));
13218 return true;
      // "osd tier add <pool> <tierpool>": attach tierpool as a cache tier of
      // pool. The tier must normally be empty, must not be erasure-coded,
      // and must not carry snapshot state.
13219 } else if (prefix == "osd tier add") {
13220 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13221 if (err == -EAGAIN)
13222 goto wait;
13223 if (err)
13224 goto reply;
13225 string poolstr;
13226 cmd_getval(cmdmap, "pool", poolstr);
13227 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13228 if (pool_id < 0) {
13229 ss << "unrecognized pool '" << poolstr << "'";
13230 err = -ENOENT;
13231 goto reply;
13232 }
13233 string tierpoolstr;
13234 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13235 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13236 if (tierpool_id < 0) {
13237 ss << "unrecognized pool '" << tierpoolstr << "'";
13238 err = -ENOENT;
13239 goto reply;
13240 }
13241 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13242 ceph_assert(p);
13243 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13244 ceph_assert(tp);
13245
13246 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13247 goto reply;
13248 }
13249
13250 // make sure new tier is empty
13251 bool force_nonempty = false;
13252 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13253 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13254 if (pstats && pstats->stats.sum.num_objects != 0 &&
13255 !force_nonempty) {
13256 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13257 err = -ENOTEMPTY;
13258 goto reply;
13259 }
13260 if (tp->is_erasure()) {
13261 ss << "tier pool '" << tierpoolstr
13262 << "' is an ec pool, which cannot be a tier";
13263 err = -ENOTSUP;
13264 goto reply;
13265 }
      // snapshot state on the tier would be broken by tiering; only a debug
      // conf option plus --force-nonempty can override
13266 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13267 (!force_nonempty ||
13268 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13269 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13270 err = -ENOTEMPTY;
13271 goto reply;
13272 }
13273 // go
13274 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13275 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
      // if the relationship is already pending in this increment, retry
      // after the proposal rather than double-applying
13276 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13277 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13278 return true;
13279 }
13280 np->tiers.insert(tierpool_id);
13281 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13282 ntp->tier_of = pool_id;
13283 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13284 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13285 get_last_committed() + 1));
13286 return true;
      // "osd tier remove|rm <pool> <tierpool>": detach a cache tier from its
      // base pool. Refused while the tier is still the overlay.
13287 } else if (prefix == "osd tier remove" ||
13288 prefix == "osd tier rm") {
13289 string poolstr;
13290 cmd_getval(cmdmap, "pool", poolstr);
13291 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13292 if (pool_id < 0) {
13293 ss << "unrecognized pool '" << poolstr << "'";
13294 err = -ENOENT;
13295 goto reply;
13296 }
13297 string tierpoolstr;
13298 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13299 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13300 if (tierpool_id < 0) {
13301 ss << "unrecognized pool '" << tierpoolstr << "'";
13302 err = -ENOENT;
13303 goto reply;
13304 }
13305 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13306 ceph_assert(p);
13307 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13308 ceph_assert(tp);
13309
13310 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13311 goto reply;
13312 }
13313
      // idempotent: not currently a tier of this pool
13314 if (p->tiers.count(tierpool_id) == 0) {
13315 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13316 err = 0;
13317 goto reply;
13318 }
      // cross-check map consistency: the tier must point back at this pool
13319 if (tp->tier_of != pool_id) {
13320 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13321 << osdmap.get_pool_name(tp->tier_of) << "': "
13322 // be scary about it; this is an inconsistency and bells must go off
13323 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13324 err = -EINVAL;
13325 goto reply;
13326 }
13327 if (p->read_tier == tierpool_id) {
13328 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13329 err = -EBUSY;
13330 goto reply;
13331 }
13332 // go
13333 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13334 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
      // if the pending copies already reflect removal (or a conflicting
      // change), retry after the proposal commits
13335 if (np->tiers.count(tierpool_id) == 0 ||
13336 ntp->tier_of != pool_id ||
13337 np->read_tier == tierpool_id) {
13338 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13339 return true;
13340 }
13341 np->tiers.erase(tierpool_id);
13342 ntp->clear_tier();
13343 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13344 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13345 get_last_committed() + 1));
13346 return true;
      // "osd tier set-overlay <pool> <overlaypool>": direct client reads and
      // writes for pool through an existing tier pool.
13347 } else if (prefix == "osd tier set-overlay") {
13348 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13349 if (err == -EAGAIN)
13350 goto wait;
13351 if (err)
13352 goto reply;
13353 string poolstr;
13354 cmd_getval(cmdmap, "pool", poolstr);
13355 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13356 if (pool_id < 0) {
13357 ss << "unrecognized pool '" << poolstr << "'";
13358 err = -ENOENT;
13359 goto reply;
13360 }
13361 string overlaypoolstr;
13362 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13363 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13364 if (overlaypool_id < 0) {
13365 ss << "unrecognized pool '" << overlaypoolstr << "'";
13366 err = -ENOENT;
13367 goto reply;
13368 }
13369 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13370 ceph_assert(p);
13371 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13372 ceph_assert(overlay_p);
      // the overlay must already be a tier of this pool
13373 if (p->tiers.count(overlaypool_id) == 0) {
13374 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13375 err = -EINVAL;
13376 goto reply;
13377 }
      // idempotent: same overlay already set
13378 if (p->read_tier == overlaypool_id) {
13379 err = 0;
13380 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13381 goto reply;
13382 }
13383 if (p->has_read_tier()) {
13384 ss << "pool '" << poolstr << "' has overlay '"
13385 << osdmap.get_pool_name(p->read_tier)
13386 << "'; please remove-overlay first";
13387 err = -EINVAL;
13388 goto reply;
13389 }
13390
13391 // go
      // force_op_resend on both pools so in-flight client ops are redirected
13392 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13393 np->read_tier = overlaypool_id;
13394 np->write_tier = overlaypool_id;
13395 np->set_last_force_op_resend(pending_inc.epoch);
13396 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13397 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13398 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13399 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13400 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13401 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13402 get_last_committed() + 1));
13403 return true;
      // "osd tier remove-overlay|rm-overlay <pool>": stop routing pool's
      // reads/writes through its overlay tier.
13404 } else if (prefix == "osd tier remove-overlay" ||
13405 prefix == "osd tier rm-overlay") {
13406 string poolstr;
13407 cmd_getval(cmdmap, "pool", poolstr);
13408 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13409 if (pool_id < 0) {
13410 ss << "unrecognized pool '" << poolstr << "'";
13411 err = -ENOENT;
13412 goto reply;
13413 }
13414 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13415 ceph_assert(p);
      // idempotent: no overlay configured
13416 if (!p->has_read_tier()) {
13417 err = 0;
13418 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13419 goto reply;
13420 }
13421
13422 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13423 goto reply;
13424 }
13425
13426 // go
      // force op resend on the (former) read/write tier pools as well, so
      // in-flight client ops are redirected back to the base pool
13427 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13428 if (np->has_read_tier()) {
13429 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13430 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13431 nop->set_last_force_op_resend(pending_inc.epoch);
13432 }
13433 if (np->has_write_tier()) {
13434 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13435 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13436 nop->set_last_force_op_resend(pending_inc.epoch);
13437 }
13438 np->clear_read_tier();
13439 np->clear_write_tier();
13440 np->set_last_force_op_resend(pending_inc.epoch);
13441 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13442 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13443 get_last_committed() + 1));
13444 return true;
      // "osd tier cache-mode <pool> <mode>": change the caching mode of a
      // tier pool, enforcing the legal transition graph documented below.
13445 } else if (prefix == "osd tier cache-mode") {
13446 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13447 if (err == -EAGAIN)
13448 goto wait;
13449 if (err)
13450 goto reply;
13451 string poolstr;
13452 cmd_getval(cmdmap, "pool", poolstr);
13453 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13454 if (pool_id < 0) {
13455 ss << "unrecognized pool '" << poolstr << "'";
13456 err = -ENOENT;
13457 goto reply;
13458 }
13459 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13460 ceph_assert(p);
13461 if (!p->is_tier()) {
13462 ss << "pool '" << poolstr << "' is not a tier";
13463 err = -EINVAL;
13464 goto reply;
13465 }
13466 string modestr;
13467 cmd_getval(cmdmap, "mode", modestr);
13468 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13469 if (int(mode) < 0) {
13470 ss << "'" << modestr << "' is not a valid cache mode";
13471 err = -EINVAL;
13472 goto reply;
13473 }
13474
13475 bool sure = false;
13476 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13477
      // forward/readforward have been removed outright; other non-standard
      // modes require --yes-i-really-mean-it
13478 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13479 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13480 ss << "'" << modestr << "' is no longer a supported cache mode";
13481 err = -EPERM;
13482 goto reply;
13483 }
13484 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13485 mode != pg_pool_t::CACHEMODE_NONE &&
13486 mode != pg_pool_t::CACHEMODE_PROXY &&
13487 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13488 !sure) {
13489 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13490 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13491 err = -EPERM;
13492 goto reply;
13493 }
13494
13495 // pool already has this cache-mode set and there are no pending changes
13496 if (p->cache_mode == mode &&
13497 (pending_inc.new_pools.count(pool_id) == 0 ||
13498 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13499 ss << "set cache-mode for pool '" << poolstr << "'"
13500 << " to " << pg_pool_t::get_cache_mode_name(mode);
13501 err = 0;
13502 goto reply;
13503 }
13504
13505 /* Mode description:
13506 *
13507 * none: No cache-mode defined
13508 * forward: Forward all reads and writes to base pool [removed]
13509 * writeback: Cache writes, promote reads from base pool
13510 * readonly: Forward writes to base pool
13511 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13512 * proxy: Proxy all reads and writes to base pool
13513 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13514 *
13515 * Hence, these are the allowed transitions:
13516 *
13517 * none -> any
13518 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13519 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13520 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13521 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13522 * writeback -> readproxy || proxy
13523 * readonly -> any
13524 */
13525
13526 // We check if the transition is valid against the current pool mode, as
13527 // it is the only committed state thus far. We will blantly squash
13528 // whatever mode is on the pending state.
13529
13530 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13531 (mode != pg_pool_t::CACHEMODE_PROXY &&
13532 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13533 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13534 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13535 << "' pool; only '"
13536 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13537 << "' allowed.";
13538 err = -EINVAL;
13539 goto reply;
13540 }
      // transitions listed above as "any IF num_objects_dirty == 0":
      // permitted only when the tier holds no dirty objects
13541 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13542 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13543 mode != pg_pool_t::CACHEMODE_PROXY &&
13544 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13545
13546 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13547 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13548 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13549
13550 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13551 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13552 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13553
13554 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13555 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13556 mode != pg_pool_t::CACHEMODE_PROXY &&
13557 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13558
13559 const pool_stat_t* pstats =
13560 mon.mgrstatmon()->get_pool_stat(pool_id);
13561
13562 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13563 ss << "unable to set cache-mode '"
13564 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13565 << "': dirty objects found";
13566 err = -EBUSY;
13567 goto reply;
13568 }
13569 }
13570 // go
13571 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13572 np->cache_mode = mode;
13573 // set this both when moving to and from cache_mode NONE. this is to
13574 // capture legacy pools that were set up before this flag existed.
13575 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13576 ss << "set cache-mode for pool '" << poolstr
13577 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13578 if (mode == pg_pool_t::CACHEMODE_NONE) {
13579 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13580 ceph_assert(base_pool);
13581 if (base_pool->read_tier == pool_id ||
13582 base_pool->write_tier == pool_id)
13583 ss <<" (WARNING: pool is still configured as read or write tier)";
13584 }
13585 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13586 get_last_committed() + 1));
13587 return true;
      // "osd tier add-cache <pool> <tierpool> <size>": attach tierpool as a
      // cache tier AND configure it (cache mode, hit-set params, overlay)
      // from the osd_tier_default_* conf options in one step.
      // NOTE(review): this branch continues past the end of this chunk.
13588 } else if (prefix == "osd tier add-cache") {
13589 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13590 if (err == -EAGAIN)
13591 goto wait;
13592 if (err)
13593 goto reply;
13594 string poolstr;
13595 cmd_getval(cmdmap, "pool", poolstr);
13596 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13597 if (pool_id < 0) {
13598 ss << "unrecognized pool '" << poolstr << "'";
13599 err = -ENOENT;
13600 goto reply;
13601 }
13602 string tierpoolstr;
13603 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13604 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13605 if (tierpool_id < 0) {
13606 ss << "unrecognized pool '" << tierpoolstr << "'";
13607 err = -ENOENT;
13608 goto reply;
13609 }
13610 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13611 ceph_assert(p);
13612 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13613 ceph_assert(tp);
13614
13615 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13616 goto reply;
13617 }
13618
13619 int64_t size = 0;
13620 if (!cmd_getval(cmdmap, "size", size)) {
13621 ss << "unable to parse 'size' value '"
13622 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13623 err = -EINVAL;
13624 goto reply;
13625 }
13626 // make sure new tier is empty
13627 const pool_stat_t *pstats =
13628 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13629 if (pstats && pstats->stats.sum.num_objects != 0) {
13630 ss << "tier pool '" << tierpoolstr << "' is not empty";
13631 err = -ENOTEMPTY;
13632 goto reply;
13633 }
      // cache mode and hit-set defaults come from configuration, not the
      // command line
13634 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13635 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13636 if (int(mode) < 0) {
13637 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13638 err = -EINVAL;
13639 goto reply;
13640 }
13641 HitSet::Params hsp;
13642 auto& cache_hit_set_type =
13643 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13644 if (cache_hit_set_type == "bloom") {
13645 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13646 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13647 hsp = HitSet::Params(bsp);
13648 } else if (cache_hit_set_type == "explicit_hash") {
13649 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13650 } else if (cache_hit_set_type == "explicit_object") {
13651 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13652 } else {
13653 ss << "osd tier cache default hit set type '"
13654 << cache_hit_set_type << "' is not a known type";
13655 err = -EINVAL;
13656 goto reply;
13657 }
13658 // go
13659 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13660 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
      // already pending: retry after the proposal rather than double-apply
13661 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13662 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13663 return true;
13664 }
      // unlike plain "tier add", this also makes the tier the overlay
13665 np->tiers.insert(tierpool_id);
13666 np->read_tier = np->write_tier = tierpool_id;
13667 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13668 np->set_last_force_op_resend(pending_inc.epoch);
13669 ntp->set_last_force_op_resend(pending_inc.epoch);
13670 ntp->tier_of = pool_id;
13671 ntp->cache_mode = mode;
13672 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13673 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13674 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13675 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13676 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13677 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13678 ntp->hit_set_params = hsp;
13679 ntp->target_max_bytes = size;
13680 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13681 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13682 get_last_committed() + 1));
13683 return true;
13684 } else if (prefix == "osd pool set-quota") {
13685 string poolstr;
13686 cmd_getval(cmdmap, "pool", poolstr);
13687 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13688 if (pool_id < 0) {
13689 ss << "unrecognized pool '" << poolstr << "'";
13690 err = -ENOENT;
13691 goto reply;
13692 }
13693
13694 string field;
13695 cmd_getval(cmdmap, "field", field);
13696 if (field != "max_objects" && field != "max_bytes") {
13697 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13698 err = -EINVAL;
13699 goto reply;
13700 }
13701
13702 // val could contain unit designations, so we treat as a string
13703 string val;
13704 cmd_getval(cmdmap, "val", val);
13705 string tss;
13706 int64_t value;
13707 if (field == "max_objects") {
13708 value = strict_si_cast<uint64_t>(val, &tss);
13709 } else if (field == "max_bytes") {
13710 value = strict_iecstrtoll(val, &tss);
13711 } else {
13712 ceph_abort_msg("unrecognized option");
13713 }
13714 if (!tss.empty()) {
13715 ss << "error parsing value '" << val << "': " << tss;
13716 err = -EINVAL;
13717 goto reply;
13718 }
13719
13720 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13721 if (field == "max_objects") {
13722 pi->quota_max_objects = value;
13723 } else if (field == "max_bytes") {
13724 pi->quota_max_bytes = value;
13725 } else {
13726 ceph_abort_msg("unrecognized option");
13727 }
13728 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13729 rs = ss.str();
13730 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13731 get_last_committed() + 1));
13732 return true;
13733 } else if (prefix == "osd pool application enable" ||
13734 prefix == "osd pool application disable" ||
13735 prefix == "osd pool application set" ||
13736 prefix == "osd pool application rm") {
13737 err = prepare_command_pool_application(prefix, cmdmap, ss);
13738 if (err == -EAGAIN) {
13739 goto wait;
13740 } else if (err < 0) {
13741 goto reply;
13742 } else {
13743 goto update;
13744 }
13745 } else if (prefix == "osd force-create-pg") {
13746 pg_t pgid;
13747 string pgidstr;
13748 cmd_getval(cmdmap, "pgid", pgidstr);
13749 if (!pgid.parse(pgidstr.c_str())) {
13750 ss << "invalid pgid '" << pgidstr << "'";
13751 err = -EINVAL;
13752 goto reply;
13753 }
13754 if (!osdmap.pg_exists(pgid)) {
13755 ss << "pg " << pgid << " should not exist";
13756 err = -ENOENT;
13757 goto reply;
13758 }
13759 bool sure = false;
13760 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13761 if (!sure) {
13762 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13763 << "that the cluster will give up ever trying to recover the lost data. Do this "
13764 << "only if you are certain that all copies of the PG are in fact lost and you are "
13765 << "willing to accept that the data is permanently destroyed. Pass "
13766 << "--yes-i-really-mean-it to proceed.";
13767 err = -EPERM;
13768 goto reply;
13769 }
13770 bool creating_now;
13771 {
13772 std::lock_guard<std::mutex> l(creating_pgs_lock);
13773 auto emplaced = creating_pgs.pgs.emplace(
13774 pgid,
13775 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13776 ceph_clock_now()));
13777 creating_now = emplaced.second;
13778 }
13779 if (creating_now) {
13780 ss << "pg " << pgidstr << " now creating, ok";
13781 // set the pool's CREATING flag so that (1) the osd won't ignore our
13782 // create message and (2) we won't propose any future pg_num changes
13783 // until after the PG has been instantiated.
13784 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13785 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13786 }
13787 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13788 err = 0;
13789 goto update;
13790 } else {
13791 ss << "pg " << pgid << " already creating";
13792 err = 0;
13793 goto reply;
13794 }
13795 } else if (prefix == "osd force_healthy_stretch_mode") {
13796 bool sure = false;
13797 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13798 if (!sure) {
13799 ss << "This command will require peering across multiple CRUSH buckets "
13800 "(probably two data centers or availability zones?) and may result in PGs "
13801 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13802 err = -EPERM;
13803 goto reply;
13804 }
13805 try_end_recovery_stretch_mode(true);
13806 ss << "Triggering healthy stretch mode";
13807 err = 0;
13808 goto reply;
13809 } else if (prefix == "osd force_recovery_stretch_mode") {
13810 bool sure = false;
13811 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13812 if (!sure) {
13813 ss << "This command will increase pool sizes to try and spread them "
13814 "across multiple CRUSH buckets (probably two data centers or "
13815 "availability zones?) and should have happened automatically"
13816 "Pass --yes-i-really-mean-it to proceed.";
13817 err = -EPERM;
13818 goto reply;
13819 }
13820 mon.go_recovery_stretch_mode();
13821 ss << "Triggering recovery stretch mode";
13822 err = 0;
13823 goto reply;
13824 } else {
13825 err = -EINVAL;
13826 }
13827
13828 reply:
13829 getline(ss, rs);
13830 if (err < 0 && rs.length() == 0)
13831 rs = cpp_strerror(err);
13832 mon.reply_command(op, err, rs, rdata, get_last_committed());
13833 return ret;
13834
13835 update:
13836 getline(ss, rs);
13837 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13838 get_last_committed() + 1));
13839 return true;
13840
13841 wait:
13842 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13843 return true;
13844 }
13845
13846 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13847 {
13848 op->mark_osdmon_event(__func__);
13849
13850 auto m = op->get_req<MPoolOp>();
13851 MonSession *session = op->get_session();
13852 if (!session) {
13853 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13854 return true;
13855 }
13856
13857 switch (m->op) {
13858 case POOL_OP_CREATE_UNMANAGED_SNAP:
13859 case POOL_OP_DELETE_UNMANAGED_SNAP:
13860 {
13861 const std::string* pool_name = nullptr;
13862 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13863 if (pg_pool != nullptr) {
13864 pool_name = &osdmap.get_pool_name(m->pool);
13865 }
13866
13867 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
13868 session->entity_name, session->caps,
13869 session->get_peer_socket_addr(),
13870 pool_name)) {
13871 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13872 << "privileges. message: " << *m << std::endl
13873 << "caps: " << session->caps << dendl;
13874 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13875 return true;
13876 }
13877 }
13878 break;
13879 default:
13880 if (!session->is_capable("osd", MON_CAP_W)) {
13881 dout(0) << "got pool op from entity with insufficient privileges. "
13882 << "message: " << *m << std::endl
13883 << "caps: " << session->caps << dendl;
13884 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13885 return true;
13886 }
13887 break;
13888 }
13889
13890 return false;
13891 }
13892
// Fast path for pool ops: answer whatever can be decided from the current
// committed osdmap without proposing a new epoch.  Returns true when the
// request was fully handled here (replied or dropped); false when it must
// continue to prepare_pool_op() to mutate the map.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // cap check may reply -EPERM and consume the request
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop messages addressed to a different cluster
  if (m->fsid != mon.monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    // DELETE of a pool that is already gone is a success (idempotent);
    // anything else on a missing pool is an error.
    if (m->op == POOL_OP_DELETE) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  // For each op: reject combinations that can never succeed (-EINVAL),
  // short-circuit already-satisfied requests (reply 0), otherwise return
  // false so prepare_pool_op() makes the change.  Note that pool-snaps
  // mode and unmanaged-snaps mode are mutually exclusive per pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already there: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // snap already removed/purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with this name still
    // exists (lookup >= 0) — presumably short-circuiting the legacy
    // name-based delete path; confirm against prepare_pool_op_delete()
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // auid changes are rejected later, in prepare_pool_op()
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13980
13981 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13982 {
13983 if (!osdmap.have_pg_pool(pool)) {
13984 dout(10) << __func__ << " pool " << pool << " snap " << snap
13985 << " - pool dne" << dendl;
13986 return true;
13987 }
13988 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13989 dout(10) << __func__ << " pool " << pool << " snap " << snap
13990 << " - in osdmap removed_snaps_queue" << dendl;
13991 return true;
13992 }
13993 snapid_t begin, end;
13994 int r = lookup_purged_snap(pool, snap, &begin, &end);
13995 if (r == 0) {
13996 dout(10) << __func__ << " pool " << pool << " snap " << snap
13997 << " - purged, [" << begin << "," << end << ")" << dendl;
13998 return true;
13999 }
14000 return false;
14001 }
14002
14003 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
14004 {
14005 if (pending_inc.old_pools.count(pool)) {
14006 dout(10) << __func__ << " pool " << pool << " snap " << snap
14007 << " - pool pending deletion" << dendl;
14008 return true;
14009 }
14010 if (pending_inc.in_new_removed_snaps(pool, snap)) {
14011 dout(10) << __func__ << " pool " << pool << " snap " << snap
14012 << " - in pending new_removed_snaps" << dendl;
14013 return true;
14014 }
14015 return false;
14016 }
14017
14018 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
14019 {
14020 op->mark_osdmon_event(__func__);
14021 auto m = op->get_req<MPoolOp>();
14022 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
14023 if (pool >= 0) {
14024 _pool_op_reply(op, 0, osdmap.get_epoch());
14025 return true;
14026 }
14027
14028 return false;
14029 }
14030
// Apply a pool op to the pending incremental map.  Pool create/delete are
// delegated; the remaining (snapshot) ops validate against the committed
// pool, then mutate a *projected* copy of the pool (committed state plus
// any already-pending changes) so repeated ops within one proposal window
// compose correctly.  Returns true when a proposal was queued; false when
// a reply was sent without changing anything.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: validate against the *committed* pool state.  Several of
  // these cases deliberately fall through to share checks.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // already satisfied (create of existing / delete of missing): reply 0
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked here against the projected state, which may differ
  // from the committed state tested above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: perform the mutation on the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // the newly allocated snapid is returned to the client in reply_data
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's current seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always refuse
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply is deferred until the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14185
14186 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14187 {
14188 op->mark_osdmon_event(__func__);
14189 int err = prepare_new_pool(op);
14190 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14191 return true;
14192 }
14193
14194 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14195 ostream *ss)
14196 {
14197 const string& poolstr = osdmap.get_pool_name(pool_id);
14198
14199 // If the Pool is in use by CephFS, refuse to delete it
14200 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14201 if (pending_fsmap.pool_in_use(pool_id)) {
14202 *ss << "pool '" << poolstr << "' is in use by CephFS";
14203 return -EBUSY;
14204 }
14205
14206 if (pool.tier_of >= 0) {
14207 *ss << "pool '" << poolstr << "' is a tier of '"
14208 << osdmap.get_pool_name(pool.tier_of) << "'";
14209 return -EBUSY;
14210 }
14211 if (!pool.tiers.empty()) {
14212 *ss << "pool '" << poolstr << "' has tiers";
14213 for(auto tier : pool.tiers) {
14214 *ss << " " << osdmap.get_pool_name(tier);
14215 }
14216 return -EBUSY;
14217 }
14218
14219 if (!g_conf()->mon_allow_pool_delete) {
14220 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14221 return -EPERM;
14222 }
14223
14224 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14225 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14226 return -EPERM;
14227 }
14228
14229 *ss << "pool '" << poolstr << "' removed";
14230 return 0;
14231 }
14232
14233 /**
14234 * Check if it is safe to add a tier to a base pool
14235 *
14236 * @return
14237 * True if the operation should proceed, false if we should abort here
14238 * (abort doesn't necessarily mean error, could be idempotency)
14239 */
14240 bool OSDMonitor::_check_become_tier(
14241 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14242 const int64_t base_pool_id, const pg_pool_t *base_pool,
14243 int *err,
14244 ostream *ss) const
14245 {
14246 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14247 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14248
14249 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14250 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14251 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14252 *err = -EBUSY;
14253 return false;
14254 }
14255
14256 if (base_pool->tiers.count(tier_pool_id)) {
14257 ceph_assert(tier_pool->tier_of == base_pool_id);
14258 *err = 0;
14259 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14260 << base_pool_name << "'";
14261 return false;
14262 }
14263
14264 if (base_pool->is_tier()) {
14265 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14266 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14267 << "multiple tiers are not yet supported.";
14268 *err = -EINVAL;
14269 return false;
14270 }
14271
14272 if (tier_pool->has_tiers()) {
14273 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14274 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14275 it != tier_pool->tiers.end(); ++it)
14276 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14277 *ss << " multiple tiers are not yet supported.";
14278 *err = -EINVAL;
14279 return false;
14280 }
14281
14282 if (tier_pool->is_tier()) {
14283 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14284 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14285 *err = -EINVAL;
14286 return false;
14287 }
14288
14289 *err = 0;
14290 return true;
14291 }
14292
14293
14294 /**
14295 * Check if it is safe to remove a tier from this base pool
14296 *
14297 * @return
14298 * True if the operation should proceed, false if we should abort here
14299 * (abort doesn't necessarily mean error, could be idempotency)
14300 */
14301 bool OSDMonitor::_check_remove_tier(
14302 const int64_t base_pool_id, const pg_pool_t *base_pool,
14303 const pg_pool_t *tier_pool,
14304 int *err, ostream *ss) const
14305 {
14306 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14307
14308 // Apply CephFS-specific checks
14309 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14310 if (pending_fsmap.pool_in_use(base_pool_id)) {
14311 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14312 // If the underlying pool is erasure coded and does not allow EC
14313 // overwrites, we can't permit the removal of the replicated tier that
14314 // CephFS relies on to access it
14315 *ss << "pool '" << base_pool_name <<
14316 "' does not allow EC overwrites and is in use by CephFS"
14317 " via its tier";
14318 *err = -EBUSY;
14319 return false;
14320 }
14321
14322 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14323 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14324 "tier is still in use as a writeback cache. Change the cache "
14325 "mode and flush the cache before removing it";
14326 *err = -EBUSY;
14327 return false;
14328 }
14329 }
14330
14331 *err = 0;
14332 return true;
14333 }
14334
// Stage removal of a pool into pending_inc: verify it is safe to delete,
// then (unless faking) queue the deletion and scrub every pg_temp,
// primary_temp, pg_upmap(-items) and crush choose_args entry that refers
// to it.  Returns 0 on success, a negative errno on refusal.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // safety checks against the committed pool state
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // idempotent: removal already staged in this increment
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // optional safety net: rename instead of deleting so data is recoverable
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
            << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
      p != osdmap.primary_temp->end();
      ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush = _get_pending_crush();
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14449
14450 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14451 {
14452 dout(10) << "_prepare_rename_pool " << pool << dendl;
14453 if (pending_inc.old_pools.count(pool)) {
14454 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14455 return -ENOENT;
14456 }
14457 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14458 p != pending_inc.new_pool_names.end();
14459 ++p) {
14460 if (p->second == newname && p->first != pool) {
14461 return -EEXIST;
14462 }
14463 }
14464
14465 pending_inc.new_pool_names[pool] = newname;
14466 return 0;
14467 }
14468
14469 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14470 {
14471 op->mark_osdmon_event(__func__);
14472 auto m = op->get_req<MPoolOp>();
14473 ostringstream ss;
14474 int ret = _prepare_remove_pool(m->pool, &ss, false);
14475 if (ret == -EAGAIN) {
14476 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14477 return true;
14478 }
14479 if (ret < 0)
14480 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14481 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14482 pending_inc.epoch));
14483 return true;
14484 }
14485
14486 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14487 int ret, epoch_t epoch, bufferlist *blp)
14488 {
14489 op->mark_osdmon_event(__func__);
14490 auto m = op->get_req<MPoolOp>();
14491 dout(20) << "_pool_op_reply " << ret << dendl;
14492 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14493 ret, epoch, get_last_committed(), blp);
14494 mon.send_reply(op, reply);
14495 }
14496
// One-time migration: rescale per-pool recovery_priority values that fall
// outside [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] so the relative
// ordering of pools is preserved within the new bounds.  No-op if every
// priority already fits.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // first pass: find the extreme priorities currently in use
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio = 0;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    // pools without the option read prio == 0 and hit the `continue` below
    int64_t prio = 0;
    pool.opts.get(key, &prio);
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      continue;
    }
    // a priority scaled down to 0 is equivalent to unset
    if (n == 0) {
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}
14548
14549 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14550 int *errcode,
14551 set<pg_pool_t*>* pools,
14552 const string& new_crush_rule)
14553 {
14554 dout(20) << __func__ << dendl;
14555 *okay = false;
14556 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14557 if (new_crush_rule_result < 0) {
14558 ss << "unrecognized crush rule " << new_crush_rule_result;
14559 *errcode = new_crush_rule_result;
14560 return;
14561 }
14562 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14563 for (const auto& pooli : osdmap.pools) {
14564 int64_t poolid = pooli.first;
14565 const pg_pool_t *p = &pooli.second;
14566 if (!p->is_replicated()) {
14567 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14568 *errcode = -EINVAL;
14569 return;
14570 }
14571 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14572 if ((p->get_size() != default_size ||
14573 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14574 (p->get_crush_rule() != new_rule)) {
14575 ss << "we currently require stretch mode pools start out with the"
14576 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14577 *errcode = -EINVAL;
14578 return;
14579 }
14580 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14581 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14582 // the attempt may fail and then we have these pool updates...but they won't do anything
14583 // if there is a failure, so if it's hard to change the interface, no need to bother
14584 pools->insert(pp);
14585 }
14586 *okay = true;
14587 return;
14588 }
14589
14590 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14591 int *errcode, bool commit,
14592 const string& dividing_bucket,
14593 uint32_t bucket_count,
14594 const set<pg_pool_t*>& pools,
14595 const string& new_crush_rule)
14596 {
14597 dout(20) << __func__ << dendl;
14598 *okay = false;
14599 CrushWrapper crush = _get_pending_crush();
14600 int dividing_id = -1;
14601 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14602 !type_id.has_value()) {
14603 ss << dividing_bucket << " is not a valid crush bucket type";
14604 *errcode = -ENOENT;
14605 ceph_assert(!commit);
14606 return;
14607 } else {
14608 dividing_id = *type_id;
14609 }
14610 vector<int> subtrees;
14611 crush.get_subtree_of_type(dividing_id, &subtrees);
14612 if (subtrees.size() != 2) {
14613 ss << "there are " << subtrees.size() << dividing_bucket
14614 << "'s in the cluster but stretch mode currently only works with 2!";
14615 *errcode = -EINVAL;
14616 ceph_assert(!commit || subtrees.size() == 2);
14617 return;
14618 }
14619
14620 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14621 if (new_crush_rule_result < 0) {
14622 ss << "unrecognized crush rule " << new_crush_rule;
14623 *errcode = new_crush_rule_result;
14624 ceph_assert(!commit || (new_crush_rule_result > 0));
14625 return;
14626 }
14627 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14628
14629 int weight1 = crush.get_item_weight(subtrees[0]);
14630 int weight2 = crush.get_item_weight(subtrees[1]);
14631 if (weight1 != weight2) {
14632 // TODO: I'm really not sure this is a good idea?
14633 ss << "the 2 " << dividing_bucket
14634 << "instances in the cluster have differing weights "
14635 << weight1 << " and " << weight2
14636 <<" but stretch mode currently requires they be the same!";
14637 *errcode = -EINVAL;
14638 ceph_assert(!commit || (weight1 == weight2));
14639 return;
14640 }
14641 if (bucket_count != 2) {
14642 ss << "currently we only support 2-site stretch clusters!";
14643 *errcode = -EINVAL;
14644 ceph_assert(!commit || bucket_count == 2);
14645 return;
14646 }
14647 // TODO: check CRUSH rules for pools so that we are appropriately divided
14648 if (commit) {
14649 for (auto pool : pools) {
14650 pool->crush_rule = new_rule;
14651 pool->peering_crush_bucket_count = bucket_count;
14652 pool->peering_crush_bucket_target = bucket_count;
14653 pool->peering_crush_bucket_barrier = dividing_id;
14654 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14655 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14656 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14657 }
14658 pending_inc.change_stretch_mode = true;
14659 pending_inc.stretch_mode_enabled = true;
14660 pending_inc.new_stretch_bucket_count = bucket_count;
14661 pending_inc.new_degraded_stretch_mode = 0;
14662 pending_inc.new_stretch_mode_bucket = dividing_id;
14663 }
14664 *okay = true;
14665 return;
14666 }
14667
14668 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14669 set<int> *really_down_buckets,
14670 set<string> *really_down_mons)
14671 {
14672 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14673 ceph_assert(is_readable());
14674 if (dead_buckets.empty()) return false;
14675 set<int> down_cache;
14676 bool really_down = false;
14677 for (auto dbi : dead_buckets) {
14678 const string& bucket_name = dbi.first;
14679 ceph_assert(osdmap.crush->name_exists(bucket_name));
14680 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14681 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14682 << " to see if OSDs are also down" << dendl;
14683 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14684 if (subtree_down) {
14685 dout(20) << "subtree is down!" << dendl;
14686 really_down = true;
14687 really_down_buckets->insert(bucket_id);
14688 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14689 }
14690 }
14691 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14692 << " and mons " << *really_down_mons << " are really down" << dendl;
14693 return really_down;
14694 }
14695
14696 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14697 const set<string>& live_zones)
14698 {
14699 dout(20) << __func__ << dendl;
14700 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14701 // update the general OSDMap changes
14702 pending_inc.change_stretch_mode = true;
14703 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14704 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14705 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14706 ceph_assert(new_site_count == 1); // stretch count 2!
14707 pending_inc.new_degraded_stretch_mode = new_site_count;
14708 pending_inc.new_recovering_stretch_mode = 0;
14709 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14710
14711 // and then apply them to all the pg_pool_ts
14712 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14713 const string& remaining_site_name = *(live_zones.begin());
14714 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14715 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14716 for (auto pgi : osdmap.pools) {
14717 if (pgi.second.peering_crush_bucket_count) {
14718 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14719 newp.peering_crush_bucket_count = new_site_count;
14720 newp.peering_crush_mandatory_member = remaining_site;
14721 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14722 newp.set_last_force_op_resend(pending_inc.epoch);
14723 }
14724 }
14725 propose_pending();
14726 }
14727
14728 void OSDMonitor::trigger_recovery_stretch_mode()
14729 {
14730 dout(20) << __func__ << dendl;
14731 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14732 pending_inc.change_stretch_mode = true;
14733 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14734 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14735 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14736 pending_inc.new_recovering_stretch_mode = 1;
14737 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14738
14739 for (auto pgi : osdmap.pools) {
14740 if (pgi.second.peering_crush_bucket_count) {
14741 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14742 newp.set_last_force_op_resend(pending_inc.epoch);
14743 }
14744 }
14745 propose_pending();
14746 }
14747
// Entering degraded stretch mode: zero the recovery-start timestamp
// (a zero value means the recovery grace period has not begun).
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14752
14753 void OSDMonitor::set_recovery_stretch_mode()
14754 {
14755 if (stretch_recovery_triggered.is_zero()) {
14756 stretch_recovery_triggered = ceph_clock_now();
14757 }
14758 }
14759
// Back to healthy stretch mode: clear the recovery-start timestamp so a
// future degradation starts its grace period from scratch.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14764
14765 void OSDMonitor::notify_new_pg_digest()
14766 {
14767 dout(20) << __func__ << dendl;
14768 if (!stretch_recovery_triggered.is_zero()) {
14769 try_end_recovery_stretch_mode(false);
14770 }
14771 }
14772
14773 struct CMonExitRecovery : public Context {
14774 OSDMonitor *m;
14775 bool force;
14776 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14777 void finish(int r) {
14778 m->try_end_recovery_stretch_mode(force);
14779 }
14780 };
14781
// Attempt to leave recovering stretch mode and return to healthy.
// Only meaningful on the leader while in degraded+recovering stretch mode.
// If this monitor (or the mgrstat paxos service) is not currently readable,
// the check is re-queued via CMonExitRecovery rather than dropped.
// `force` skips both the minimum-wait grace period and the PG-health check.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // not readable yet; retry from the same state once we are
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // Proceed only if recovery has run for at least
  // mon_stretch_recovery_min_wait seconds (or we're forced).
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // PG digest not readable yet; retry when it is
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced PGs are tolerated; degraded/inactive/unknown are not
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
14811
14812 void OSDMonitor::trigger_healthy_stretch_mode()
14813 {
14814 ceph_assert(is_writeable());
14815 stretch_recovery_triggered.set_from_double(0);
14816 pending_inc.change_stretch_mode = true;
14817 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14818 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14819 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
14820 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
14821 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14822 for (auto pgi : osdmap.pools) {
14823 if (pgi.second.peering_crush_bucket_count) {
14824 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14825 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
14826 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14827 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14828 newp.set_last_force_op_resend(pending_inc.epoch);
14829 }
14830 }
14831 propose_pending();
14832 }