]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 using std::dec;
95 using std::hex;
96 using std::list;
97 using std::map;
98 using std::make_pair;
99 using std::ostringstream;
100 using std::pair;
101 using std::set;
102 using std::string;
103 using std::stringstream;
104 using std::to_string;
105 using std::vector;
106
107 using ceph::bufferlist;
108 using ceph::decode;
109 using ceph::encode;
110 using ceph::ErasureCodeInterfaceRef;
111 using ceph::ErasureCodePluginRegistry;
112 using ceph::ErasureCodeProfile;
113 using ceph::Formatter;
114 using ceph::JSONFormatter;
115 using ceph::make_message;
116
117 #define dout_subsys ceph_subsys_mon
118 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
119 static const string OSD_METADATA_PREFIX("osd_metadata");
120 static const string OSD_SNAP_PREFIX("osd_snap");
121
122 /*
123
124 OSD snapshot metadata
125 ---------------------
126
127 -- starting with mimic, removed in octopus --
128
129 "removed_epoch_%llu_%08lx" % (pool, epoch)
130 -> interval_set<snapid_t>
131
132 "removed_snap_%llu_%016llx" % (pool, last_snap)
133 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
134
135
136 -- starting with mimic --
137
138 "purged_snap_%llu_%016llx" % (pool, last_snap)
139 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
140
141 - note that the {removed,purged}_snap put the last snap in they key so
142 that we can use forward iteration only to search for an epoch in an
143 interval. e.g., to test if epoch N is removed/purged, we'll find a key
144 >= N that either does or doesn't contain the given snap.
145
146
147 -- starting with octopus --
148
149 "purged_epoch_%08lx" % epoch
150 -> map<int64_t,interval_set<snapid_t>>
151
152 */
153 using namespace TOPNSPC::common;
154 namespace {
155
// Glue between an OSDMonitor-owned LRU map cache and the PriorityCache
// manager (pcm).  The pcm calls these hooks to learn how many bytes the
// cache wants at each priority level and to tell it how much it was
// granted.  Concrete subclasses (IncCache/FullCache below) report the
// bytes used by the inc/full OSDMap caches via _get_used_bytes().
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;   // owning monitor; source of the underlying caches
  // bytes currently assigned to this cache, indexed by priority level
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;   // size granted at the last commit
  double cache_ratio = 0;        // fraction of the total cache we may use

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes actually in use by the underlying cache (subclass-specific).
  virtual uint64_t _get_used_bytes() const = 0;

  // Request additional bytes at priority 'pri': the shortfall between
  // current usage and what was already assigned.  Only PRI1 is used;
  // other priorities report -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // only ask for the shortfall, never a negative amount
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assigned bytes across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the total assignment to the manager's chunk size and remember
  // it as the committed size for this cache.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Bin-based accounting is not used by these caches; the bin hooks are
  // deliberate no-ops.
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  // Name shown in pcm logging (subclass-specific).
  virtual string get_cache_name() const = 0;
};
229
230 struct IncCache : public OSDMemCache {
231 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
232
233 virtual uint64_t _get_used_bytes() const {
234 return osdmon->inc_osd_cache.get_bytes();
235 }
236
237 virtual string get_cache_name() const {
238 return "OSDMap Inc Cache";
239 }
240
241 uint64_t _get_num_osdmaps() const {
242 return osdmon->inc_osd_cache.get_size();
243 }
244 };
245
246 struct FullCache : public OSDMemCache {
247 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
248
249 virtual uint64_t _get_used_bytes() const {
250 return osdmon->full_osd_cache.get_bytes();
251 }
252
253 virtual string get_cache_name() const {
254 return "OSDMap Full Cache";
255 }
256
257 uint64_t _get_num_osdmaps() const {
258 return osdmon->full_osd_cache.get_size();
259 }
260 };
261
// PriorityCache adapters shared with the pcm; created in the OSDMonitor
// constructor and registered in register_cache_with_pcm().
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits applied when validating pool application metadata.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
268
269 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
270 // Note: this doesn't include support for the application tag match
271 if ((grant.spec.allow & OSD_CAP_W) != 0) {
272 auto& match = grant.match;
273 if (match.is_match_all()) {
274 return true;
275 } else if (pool_name != nullptr &&
276 !match.pool_namespace.pool_name.empty() &&
277 match.pool_namespace.pool_name == *pool_name) {
278 return true;
279 }
280 }
281 return false;
282 }
283
// Decide whether 'entity_name' may issue unmanaged-snapshot pool ops on
// 'pool_name' (nullptr means "no specific pool", which requires an
// unrestricted cap).  Permission is granted either by an explicit mon
// cap for "osd pool op unmanaged-snap", or by OSD write caps read from
// the auth database: write access implies snapshot management.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // first: does the session's mon cap allow it directly?
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // otherwise fall back to the entity's OSD caps from the auth db
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into a list of grants; any writable one counts
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
351
352 } // anonymous namespace
353
// Record that PG 'ps' of a pool with 'pg_num' PGs was clean through
// 'last_epoch_clean'.  Maintains three pieces of state:
//  - epoch_by_pg: per-PG last-epoch-clean values (0 == never reported)
//  - floor: the minimum last-epoch-clean across the pool's PGs
//  - next_missing: index of the first PG that has not reported yet
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
				 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down, so rescan
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
					std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported PGs
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
387
388 void LastEpochClean::remove_pool(uint64_t pool)
389 {
390 report_by_pool.erase(pool);
391 }
392
393 void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
394 epoch_t last_epoch_clean)
395 {
396 auto& lec = report_by_pool[pg.pool()];
397 return lec.report(pg_num, pg.ps(), last_epoch_clean);
398 }
399
400 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
401 {
402 auto floor = latest.get_epoch();
403 for (auto& pool : latest.get_pools()) {
404 auto reported = report_by_pool.find(pool.first);
405 if (reported == report_by_pool.end()) {
406 return 0;
407 }
408 if (reported->second.next_missing < pool.second.get_pg_num()) {
409 return 0;
410 }
411 if (reported->second.floor < floor) {
412 floor = reported->second.floor;
413 }
414 }
415 return floor;
416 }
417
418 void LastEpochClean::dump(Formatter *f) const
419 {
420 f->open_array_section("per_pool");
421
422 for (auto& [pool, lec] : report_by_pool) {
423 f->open_object_section("pool");
424 f->dump_unsigned("poolid", pool);
425 f->dump_unsigned("floor", lec.floor);
426 f->close_section();
427 }
428
429 f->close_section();
430 }
431
// Completion for the background PG mapping job: when the job started
// against osdmap 'epoch' finishes successfully, refresh the
// creating-PGs state and notify pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;   // time the mapping job was started, for the log timing
  epoch_t epoch;   // osdmap epoch the mapping job ran against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
449
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file:
//   "mon.<name>@<rank>(<state>).osd e<epoch> "
static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon.name << "@" << mon.rank
		<< "(" << mon.get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
457
// Construct the OSD monitor service: size the inc/full OSDMap LRU
// caches from mon_osd_cache_size, create the PriorityCache adapters
// used for optional cache autotuning, and register this object as a
// config observer so cache settings can change at runtime.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // fall back to a fixed-size cache without pcm management
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
481
482 const char **OSDMonitor::get_tracked_conf_keys() const
483 {
484 static const char* KEYS[] = {
485 "mon_memory_target",
486 "mon_memory_autotune",
487 "rocksdb_cache_size",
488 NULL
489 };
490 return KEYS;
491 }
492
// Config observer hook: react to runtime changes of the tracked keys
// (see get_tracked_conf_keys) by toggling cache autotuning and/or
// recomputing the mon/rocksdb cache budgets.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
514
// Enable or disable pcm-driven cache autotuning so it matches the
// current value of the mon_memory_autotune option.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
536
// Apply runtime changes to mon_memory_target / rocksdb_cache_size:
// recompute the kv/inc/full cache ratios and, when autotuning is
// active, push the new min/max/target sizes into the pcm and
// rebalance.  Returns 0 on success; on -EINVAL the previous target and
// rocksdb size are restored.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // keep the old values so we can roll back if the new ratios are bad
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // same sizing logic as register_cache_with_pcm(): reserve base memory
  // and expected fragmentation before computing the cache maximum
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
596
// Read the cache-related config options at startup.  When autotuning is
// enabled, validate mon_memory_target / mon_osd_cache_size_min and
// prime the inc/full LRU caches with the minimum byte budget (the pcm
// resizes them later).  Returns -EINVAL on invalid sizes.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
621
622 bool OSDMonitor::_have_pending_crush()
623 {
624 return pending_inc.crush.length() > 0;
625 }
626
// Reference to the CRUSH map of the committed osdmap (no pending
// changes applied).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
631
632 CrushWrapper OSDMonitor::_get_pending_crush()
633 {
634 bufferlist bl;
635 if (pending_inc.crush.length())
636 bl = pending_inc.crush;
637 else
638 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
639
640 auto p = bl.cbegin();
641 CrushWrapper crush;
642 crush.decode(p);
643 return crush;
644 }
645
// Build the very first osdmap (epoch 1) for a new cluster: start from a
// mkfs-provided map if one was stashed in the store, otherwise a simple
// default map; apply default flags, full ratios and require-release
// settings; then encode the result into the pending incremental as a
// full map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios configured as > 1.0 are treated as percentages
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
      derr << __func__ << " mon_debug_no_require_quincy and pacific=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    } else {
      derr << __func__ << " mon_debug_no_require_quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::quincy;
  }

  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
		features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
704
705 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
706 {
707 s.insert(service_name);
708 s.insert(OSD_PG_CREATING_PREFIX);
709 s.insert(OSD_METADATA_PREFIX);
710 s.insert(OSD_SNAP_PREFIX);
711 }
712
// Catch the in-memory osdmap up with the paxos-committed state:
//  1. reload the osdmap manifest and health data,
//  2. locate (repairing the pointer if needed) the latest stashed full
//     map and load it if it is newer than what we hold,
//  3. apply every committed incremental, persisting full maps as we go
//     and verifying CRCs against a primary-provided full map,
//  4. refresh state derived from the map: down->out tracking,
//     subscriptions, msgr features, PG mappings and stretch mode.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // remembered so the stretch-mode handling below can see whether this
  // update brought OSDs up
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stashed full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // the stashed full map is newer than what we hold; start from it
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
	dout(10) << __func__
		 << " Error while registering osdmon caches with pcm."
		 << " Proceeding without cache auto tuning."
		 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE: the dout macro opens a scope, so each JSONFormatter
	// below lives in its own block and the names do not collide
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction before it grows too large
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd);
      }
    }
    for (const auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
	// manually marked out, so drop it
	osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // reconcile the down -> out pending map with the new osdmap
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
	marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
	mon.set_degraded_stretch_mode();
	if (prev_num_up_osd < osdmap.num_up_osd &&
	    (osdmap.num_up_osd / (double)osdmap.num_osd) >
	    cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
	  // TODO: This works for 2-site clusters when the OSD maps are appropriately
	  // trimmed and everything is "normal" but not if you have a lot of out OSDs
	  // you're ignoring or in some really degenerate failure cases
	  dout(10) << "Enabling recovery stretch mode in this map" << dendl;
	  mon.go_recovery_stretch_mode();
	}
      } else {
	mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
	(!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
984
// Create the PriorityCache manager and register the rocksdb, inc and
// full caches with it.  Returns 0 on success, or -EINVAL when the
// configured sizes/ratios are unusable or the store exposes no binned
// rocksdb cache.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
1034
// Split the memory target between the kv (rocksdb) cache and the two
// osdmap caches: kv gets rocksdb_cache_size / mon_memory_target, and
// the inc/full caches share the remainder equally.  Fails (restoring
// the previous kv ratio) if the kv cache would consume the whole
// target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
1059
1060 void OSDMonitor::start_mapping()
1061 {
1062 // initiate mapping job
1063 if (mapping_job) {
1064 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1065 << dendl;
1066 mapping_job->abort();
1067 }
1068 if (!osdmap.get_pools().empty()) {
1069 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1070 mapping_job = mapping.start_update(osdmap, mapper,
1071 g_conf()->mon_osd_mapping_pgs_per_chunk);
1072 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1073 << " at " << fin->start << dendl;
1074 mapping_job->set_finish_event(fin);
1075 } else {
1076 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1077 mapping_job = nullptr;
1078 }
1079 }
1080
1081 void OSDMonitor::update_msgr_features()
1082 {
1083 const int types[] = {
1084 entity_name_t::TYPE_OSD,
1085 entity_name_t::TYPE_CLIENT,
1086 entity_name_t::TYPE_MDS,
1087 entity_name_t::TYPE_MON
1088 };
1089 for (int type : types) {
1090 uint64_t mask;
1091 uint64_t features = osdmap.get_features(type, &mask);
1092 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1093 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1094 ceph::net::Policy p = mon.messenger->get_policy(type);
1095 p.features_required = (p.features_required & ~mask) | features;
1096 mon.messenger->set_policy(type, p);
1097 }
1098 }
1099 }
1100
1101 void OSDMonitor::on_active()
1102 {
1103 update_logger();
1104
1105 if (mon.is_leader()) {
1106 mon.clog->debug() << "osdmap " << osdmap;
1107 if (!priority_convert) {
1108 // Only do this once at start-up
1109 convert_pool_priorities();
1110 priority_convert = true;
1111 }
1112 } else {
1113 list<MonOpRequestRef> ls;
1114 take_all_failures(ls);
1115 while (!ls.empty()) {
1116 MonOpRequestRef op = ls.front();
1117 op->mark_osdmon_event(__func__);
1118 dispatch(op);
1119 ls.pop_front();
1120 }
1121 }
1122 start_mapping();
1123 }
1124
void OSDMonitor::on_restart()
{
  // Forget per-osd report timestamps; they repopulate as osds check in
  // again after the election/bootstrap completes.
  last_osd_report.clear();
}
1129
// Tear down osdmap-service state on monitor shutdown: abort any
// background mapping computation and drop queued failure reports.
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
1144
// Publish current osdmap summary counters (osd counts and epoch) to the
// cluster perf logger.
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
1154
1155 void OSDMonitor::create_pending()
1156 {
1157 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1158 pending_inc.fsid = mon.monmap->fsid;
1159 pending_metadata.clear();
1160 pending_metadata_rm.clear();
1161 pending_pseudo_purged_snaps.clear();
1162
1163 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1164
1165 // safety checks (this shouldn't really happen)
1166 {
1167 if (osdmap.backfillfull_ratio <= 0) {
1168 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1169 if (pending_inc.new_backfillfull_ratio > 1.0)
1170 pending_inc.new_backfillfull_ratio /= 100;
1171 dout(1) << __func__ << " setting backfillfull_ratio = "
1172 << pending_inc.new_backfillfull_ratio << dendl;
1173 }
1174 if (osdmap.full_ratio <= 0) {
1175 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1176 if (pending_inc.new_full_ratio > 1.0)
1177 pending_inc.new_full_ratio /= 100;
1178 dout(1) << __func__ << " setting full_ratio = "
1179 << pending_inc.new_full_ratio << dendl;
1180 }
1181 if (osdmap.nearfull_ratio <= 0) {
1182 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1183 if (pending_inc.new_nearfull_ratio > 1.0)
1184 pending_inc.new_nearfull_ratio /= 100;
1185 dout(1) << __func__ << " setting nearfull_ratio = "
1186 << pending_inc.new_nearfull_ratio << dendl;
1187 }
1188 }
1189 }
1190
/**
 * Compute the next creating-pgs state to accompany the pending
 * incremental.
 *
 * Starting from a snapshot of @c creating_pgs (taken under
 * creating_pgs_lock), this: rescans for pools added/removed by @p inc,
 * drops pgs already reported created, filters out pgs that do not
 * exist in @p nextmap, admits more pgs from the per-pool queue up to
 * mon_osd_max_creating_pgs, and — on octopus+ quorums — rolls each
 * creating pg's history/past_intervals forward to @p inc's epoch.
 *
 * @param inc      the pending incremental being finalized
 * @param nextmap  the osdmap with @p inc already applied
 * @return the updated creating_pgs_t (caller encodes/stores it)
 */
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // work on a private copy; creating_pgs is shared with other threads
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools this inc introduces
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue: admit queued pg ranges until we hit the cap on
  // simultaneously-creating pgs or the queue is drained
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's [start, end) range as the cap allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	std::stringstream debug;
	// existing entry: detect whether the osdmap transition opens a
	// new peering interval; if so, record it in past_intervals and
	// bump the relevant same_*_since markers
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1377
/**
 * Pre-populate pg_temp entries in the pending incremental so that reads
 * keep flowing from the previous acting sets while the new map settles.
 *
 * Strategy: if the change is sweeping (new crush map, newly-up osds, or
 * any weight increase), prime all pgs via a parallel PrimeTempJob;
 * otherwise only prime pgs touching the "interesting" osds (marked
 * down, or weight-reduced).  A size estimate may still promote the
 * targeted case to "all".  Work is bounded in both modes by
 * mon_osd_prime_pg_temp_max_time, so priming is best-effort.
 */
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds being marked down in this inc (new_state XORs CEPH_OSD_UP on a
  // currently-up osd)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs toward this osd from anywhere,
      // so fall back to priming everything
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs-per-osd (sampled from the first osd)
    // times the number of affected osds; if that rivals a full pass,
    // just do the full pass
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // materialize the post-incremental map to compute future mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // parallel sweep over every pg, bounded by the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // targeted pass: only pgs whose acting set includes an interesting
    // osd, deduplicated, with the clock checked every `chunk` pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1480
/**
 * Consider adding a pg_temp entry for @p pgid to the pending
 * incremental, pinning the pg to its current acting set while the new
 * map (@p next) takes effect.
 *
 * Skips pgs that are still being created, don't exist in the current
 * map, whose acting set is unchanged, or where priming could not make
 * things better (previously-empty or below-min_size acting sets).
 * An empty acting vector in the entry clears any existing pg_temp.
 */
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping per the background mapping job
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending (next) map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry means "remove the pg_temp mapping"
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // may run concurrently from PrimeTempJob workers; serialize access
    // to pending_inc.new_pg_temp
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1528
1529 /**
1530 * @note receiving a transaction in this function gives a fair amount of
1531 * freedom to the service implementation if it does need it. It shouldn't.
1532 */
1533 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1534 {
1535 dout(10) << "encode_pending e " << pending_inc.epoch
1536 << dendl;
1537
1538 if (do_prune(t)) {
1539 dout(1) << __func__ << " osdmap full prune encoded e"
1540 << pending_inc.epoch << dendl;
1541 }
1542
1543 // finalize up pending_inc
1544 pending_inc.modified = ceph_clock_now();
1545
1546 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1547 ceph_assert(r == 0);
1548
1549 if (mapping_job) {
1550 if (!mapping_job->is_done()) {
1551 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1552 << mapping_job.get() << " did not complete, "
1553 << mapping_job->shards << " left" << dendl;
1554 mapping_job->abort();
1555 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1556 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1557 << mapping_job.get() << " is prior epoch "
1558 << mapping.get_epoch() << dendl;
1559 } else {
1560 if (g_conf()->mon_osd_prime_pg_temp) {
1561 maybe_prime_pg_temp();
1562 }
1563 }
1564 } else if (g_conf()->mon_osd_prime_pg_temp) {
1565 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1566 << dendl;
1567 }
1568 mapping_job.reset();
1569
1570 // ensure we don't have blank new_state updates. these are interrpeted as
1571 // CEPH_OSD_UP (and almost certainly not what we want!).
1572 auto p = pending_inc.new_state.begin();
1573 while (p != pending_inc.new_state.end()) {
1574 if (p->second == 0) {
1575 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1576 p = pending_inc.new_state.erase(p);
1577 } else {
1578 if (p->second & CEPH_OSD_UP) {
1579 pending_inc.new_last_up_change = pending_inc.modified;
1580 }
1581 ++p;
1582 }
1583 }
1584 if (!pending_inc.new_up_client.empty()) {
1585 pending_inc.new_last_up_change = pending_inc.modified;
1586 }
1587 for (auto& i : pending_inc.new_weight) {
1588 if (i.first >= osdmap.max_osd) {
1589 if (i.second) {
1590 // new osd is already marked in
1591 pending_inc.new_last_in_change = pending_inc.modified;
1592 break;
1593 }
1594 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1595 // existing osd marked in or out
1596 pending_inc.new_last_in_change = pending_inc.modified;
1597 break;
1598 }
1599 }
1600
1601 {
1602 OSDMap tmp;
1603 tmp.deepish_copy_from(osdmap);
1604 tmp.apply_incremental(pending_inc);
1605
1606 // clean pg_temp mappings
1607 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1608
1609 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1610 {
1611 // check every upmapped pg for now
1612 // until we could reliably identify certain cases to ignore,
1613 // which is obviously the hard part TBD..
1614 vector<pg_t> pgs_to_check;
1615 tmp.get_upmap_pgs(&pgs_to_check);
1616 if (pgs_to_check.size() <
1617 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1618 // not enough pgs, do it inline
1619 tmp.clean_pg_upmaps(cct, &pending_inc);
1620 } else {
1621 CleanUpmapJob job(cct, tmp, pending_inc);
1622 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1623 job.wait();
1624 }
1625 }
1626
1627 // update creating pgs first so that we can remove the created pgid and
1628 // process the pool flag removal below in the same osdmap epoch.
1629 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1630 bufferlist creatings_bl;
1631 uint64_t features = CEPH_FEATURES_ALL;
1632 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1633 dout(20) << __func__ << " encoding pending pgs without octopus features"
1634 << dendl;
1635 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1636 }
1637 encode(pending_creatings, creatings_bl, features);
1638 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1639
1640 // remove any old (or incompat) POOL_CREATING flags
1641 for (auto& i : tmp.get_pools()) {
1642 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1643 // pre-nautilus OSDMaps shouldn't get this flag.
1644 if (pending_inc.new_pools.count(i.first)) {
1645 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1646 }
1647 }
1648 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1649 !pending_creatings.still_creating_pool(i.first)) {
1650 dout(10) << __func__ << " done creating pool " << i.first
1651 << ", clearing CREATING flag" << dendl;
1652 if (pending_inc.new_pools.count(i.first) == 0) {
1653 pending_inc.new_pools[i.first] = i.second;
1654 }
1655 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1656 }
1657 }
1658
1659 // collect which pools are currently affected by
1660 // the near/backfill/full osd(s),
1661 // and set per-pool near/backfill/full flag instead
1662 set<int64_t> full_pool_ids;
1663 set<int64_t> backfillfull_pool_ids;
1664 set<int64_t> nearfull_pool_ids;
1665 tmp.get_full_pools(cct,
1666 &full_pool_ids,
1667 &backfillfull_pool_ids,
1668 &nearfull_pool_ids);
1669 if (full_pool_ids.empty() ||
1670 backfillfull_pool_ids.empty() ||
1671 nearfull_pool_ids.empty()) {
1672 // normal case - no nearfull, backfillfull or full osds
1673 // try cancel any improper nearfull/backfillfull/full pool
1674 // flags first
1675 for (auto &pool: tmp.get_pools()) {
1676 auto p = pool.first;
1677 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1678 nearfull_pool_ids.empty()) {
1679 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1680 << "'s nearfull flag" << dendl;
1681 if (pending_inc.new_pools.count(p) == 0) {
1682 // load original pool info first!
1683 pending_inc.new_pools[p] = pool.second;
1684 }
1685 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1686 }
1687 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1688 backfillfull_pool_ids.empty()) {
1689 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1690 << "'s backfillfull flag" << dendl;
1691 if (pending_inc.new_pools.count(p) == 0) {
1692 pending_inc.new_pools[p] = pool.second;
1693 }
1694 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1695 }
1696 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1697 full_pool_ids.empty()) {
1698 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1699 // set by EQUOTA, skipping
1700 continue;
1701 }
1702 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1703 << "'s full flag" << dendl;
1704 if (pending_inc.new_pools.count(p) == 0) {
1705 pending_inc.new_pools[p] = pool.second;
1706 }
1707 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1708 }
1709 }
1710 }
1711 if (!full_pool_ids.empty()) {
1712 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1713 << " as full" << dendl;
1714 for (auto &p: full_pool_ids) {
1715 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1716 continue;
1717 }
1718 if (pending_inc.new_pools.count(p) == 0) {
1719 pending_inc.new_pools[p] = tmp.pools[p];
1720 }
1721 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1722 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1723 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1724 }
1725 // cancel FLAG_FULL for pools which are no longer full too
1726 for (auto &pool: tmp.get_pools()) {
1727 auto p = pool.first;
1728 if (full_pool_ids.count(p)) {
1729 // skip pools we have just marked as full above
1730 continue;
1731 }
1732 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1733 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1734 // don't touch if currently is not full
1735 // or is running out of quota (and hence considered as full)
1736 continue;
1737 }
1738 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1739 << "'s full flag" << dendl;
1740 if (pending_inc.new_pools.count(p) == 0) {
1741 pending_inc.new_pools[p] = pool.second;
1742 }
1743 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1744 }
1745 }
1746 if (!backfillfull_pool_ids.empty()) {
1747 for (auto &p: backfillfull_pool_ids) {
1748 if (full_pool_ids.count(p)) {
1749 // skip pools we have already considered as full above
1750 continue;
1751 }
1752 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1753 // make sure FLAG_FULL is truly set, so we are safe not
1754 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1755 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1756 continue;
1757 }
1758 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1759 // don't bother if pool is already marked as backfillfull
1760 continue;
1761 }
1762 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1763 << "'s as backfillfull" << dendl;
1764 if (pending_inc.new_pools.count(p) == 0) {
1765 pending_inc.new_pools[p] = tmp.pools[p];
1766 }
1767 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1768 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1769 }
1770 // cancel FLAG_BACKFILLFULL for pools
1771 // which are no longer backfillfull too
1772 for (auto &pool: tmp.get_pools()) {
1773 auto p = pool.first;
1774 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1775 // skip pools we have just marked as backfillfull/full above
1776 continue;
1777 }
1778 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1779 // and don't touch if currently is not backfillfull
1780 continue;
1781 }
1782 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1783 << "'s backfillfull flag" << dendl;
1784 if (pending_inc.new_pools.count(p) == 0) {
1785 pending_inc.new_pools[p] = pool.second;
1786 }
1787 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1788 }
1789 }
1790 if (!nearfull_pool_ids.empty()) {
1791 for (auto &p: nearfull_pool_ids) {
1792 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1793 continue;
1794 }
1795 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1796 // make sure FLAG_FULL is truly set, so we are safe not
1797 // to set a extra (redundant) FLAG_NEARFULL flag
1798 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1799 continue;
1800 }
1801 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1802 // don't bother if pool is already marked as nearfull
1803 continue;
1804 }
1805 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1806 << "'s as nearfull" << dendl;
1807 if (pending_inc.new_pools.count(p) == 0) {
1808 pending_inc.new_pools[p] = tmp.pools[p];
1809 }
1810 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1811 }
1812 // cancel FLAG_NEARFULL for pools
1813 // which are no longer nearfull too
1814 for (auto &pool: tmp.get_pools()) {
1815 auto p = pool.first;
1816 if (full_pool_ids.count(p) ||
1817 backfillfull_pool_ids.count(p) ||
1818 nearfull_pool_ids.count(p)) {
1819 // skip pools we have just marked as
1820 // nearfull/backfillfull/full above
1821 continue;
1822 }
1823 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1824 // and don't touch if currently is not nearfull
1825 continue;
1826 }
1827 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1828 << "'s nearfull flag" << dendl;
1829 if (pending_inc.new_pools.count(p) == 0) {
1830 pending_inc.new_pools[p] = pool.second;
1831 }
1832 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1833 }
1834 }
1835
1836 // min_compat_client?
1837 if (!tmp.require_min_compat_client) {
1838 auto mv = tmp.get_min_compat_client();
1839 dout(1) << __func__ << " setting require_min_compat_client to currently "
1840 << "required " << mv << dendl;
1841 mon.clog->info() << "setting require_min_compat_client to currently "
1842 << "required " << mv;
1843 pending_inc.new_require_min_compat_client = mv;
1844 }
1845
1846 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1847 tmp.require_osd_release >= ceph_release_t::nautilus) {
1848 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1849 // add creating flags?
1850 for (auto& i : tmp.get_pools()) {
1851 if (pending_creatings.still_creating_pool(i.first)) {
1852 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1853 << dendl;
1854 if (pending_inc.new_pools.count(i.first) == 0) {
1855 pending_inc.new_pools[i.first] = i.second;
1856 }
1857 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1858 }
1859 }
1860 // adjust blocklist items to all be TYPE_ANY
1861 for (auto& i : tmp.blocklist) {
1862 auto a = i.first;
1863 a.set_type(entity_addr_t::TYPE_ANY);
1864 pending_inc.new_blocklist[a] = i.second;
1865 pending_inc.old_blocklist.push_back(i.first);
1866 }
1867 }
1868
1869 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1870 tmp.require_osd_release >= ceph_release_t::octopus) {
1871 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1872
1873 // adjust obsoleted cache modes
1874 for (auto& [poolid, pi] : tmp.pools) {
1875 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1876 if (pending_inc.new_pools.count(poolid) == 0) {
1877 pending_inc.new_pools[poolid] = pi;
1878 }
1879 dout(10) << __func__ << " switching pool " << poolid
1880 << " cachemode from forward -> proxy" << dendl;
1881 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1882 }
1883 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1884 if (pending_inc.new_pools.count(poolid) == 0) {
1885 pending_inc.new_pools[poolid] = pi;
1886 }
1887 dout(10) << __func__ << " switching pool " << poolid
1888 << " cachemode from readforward -> readproxy" << dendl;
1889 pending_inc.new_pools[poolid].cache_mode =
1890 pg_pool_t::CACHEMODE_READPROXY;
1891 }
1892 }
1893
1894 // clear removed_snaps for every pool
1895 for (auto& [poolid, pi] : tmp.pools) {
1896 if (pi.removed_snaps.empty()) {
1897 continue;
1898 }
1899 if (pending_inc.new_pools.count(poolid) == 0) {
1900 pending_inc.new_pools[poolid] = pi;
1901 }
1902 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1903 << dendl;
1904 pending_inc.new_pools[poolid].removed_snaps.clear();
1905 }
1906
1907 // create a combined purged snap epoch key for all purged snaps
1908 // prior to this epoch, and store it in the current epoch (i.e.,
1909 // the last pre-octopus epoch, just prior to the one we're
1910 // encoding now).
1911 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1912 it->lower_bound("purged_snap_");
1913 map<int64_t,snap_interval_set_t> combined;
1914 while (it->valid()) {
1915 if (it->key().find("purged_snap_") != 0) {
1916 break;
1917 }
1918 string k = it->key();
1919 long long unsigned pool;
1920 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1921 if (n != 1) {
1922 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1923 } else {
1924 bufferlist v = it->value();
1925 auto p = v.cbegin();
1926 snapid_t begin, end;
1927 ceph::decode(begin, p);
1928 ceph::decode(end, p);
1929 combined[pool].insert(begin, end - begin);
1930 }
1931 it->next();
1932 }
1933 if (!combined.empty()) {
1934 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1935 bufferlist v;
1936 ceph::encode(combined, v);
1937 t->put(OSD_SNAP_PREFIX, k, v);
1938 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1939 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1940 << dendl;
1941 } else {
1942 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1943 << dendl;
1944 }
1945
1946 // clean out the old removed_snap_ and removed_epoch keys
1947 // ('`' is ASCII '_' + 1)
1948 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1949 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1950 }
1951 }
1952
1953 // tell me about it
1954 for (auto i = pending_inc.new_state.begin();
1955 i != pending_inc.new_state.end();
1956 ++i) {
1957 int s = i->second ? i->second : CEPH_OSD_UP;
1958 if (s & CEPH_OSD_UP) {
1959 dout(2) << " osd." << i->first << " DOWN" << dendl;
1960 // Reset laggy parameters if failure interval exceeds a threshold.
1961 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1962 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1963 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1964 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1965 set_default_laggy_params(i->first);
1966 }
1967 }
1968 }
1969 if (s & CEPH_OSD_EXISTS)
1970 dout(2) << " osd." << i->first << " DNE" << dendl;
1971 }
1972 for (auto i = pending_inc.new_up_client.begin();
1973 i != pending_inc.new_up_client.end();
1974 ++i) {
1975 //FIXME: insert cluster addresses too
1976 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1977 }
1978 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1979 i != pending_inc.new_weight.end();
1980 ++i) {
1981 if (i->second == CEPH_OSD_OUT) {
1982 dout(2) << " osd." << i->first << " OUT" << dendl;
1983 } else if (i->second == CEPH_OSD_IN) {
1984 dout(2) << " osd." << i->first << " IN" << dendl;
1985 } else {
1986 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1987 }
1988 }
1989
1990 // features for osdmap and its incremental
1991 uint64_t features;
1992
1993 // encode full map and determine its crc
1994 OSDMap tmp;
1995 {
1996 tmp.deepish_copy_from(osdmap);
1997 tmp.apply_incremental(pending_inc);
1998
1999 // determine appropriate features
2000 features = tmp.get_encoding_features();
2001 dout(10) << __func__ << " encoding full map with "
2002 << tmp.require_osd_release
2003 << " features " << features << dendl;
2004
2005 // the features should be a subset of the mon quorum's features!
2006 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2007
2008 bufferlist fullbl;
2009 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2010 pending_inc.full_crc = tmp.get_crc();
2011
2012 // include full map in the txn. note that old monitors will
2013 // overwrite this. new ones will now skip the local full map
2014 // encode and reload from this.
2015 put_version_full(t, pending_inc.epoch, fullbl);
2016 }
2017
2018 // encode
2019 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2020 bufferlist bl;
2021 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2022
2023 dout(20) << " full_crc " << tmp.get_crc()
2024 << " inc_crc " << pending_inc.inc_crc << dendl;
2025
2026 /* put everything in the transaction */
2027 put_version(t, pending_inc.epoch, bl);
2028 put_last_committed(t, pending_inc.epoch);
2029
2030 // metadata, too!
2031 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2032 p != pending_metadata.end();
2033 ++p) {
2034 Metadata m;
2035 auto mp = p->second.cbegin();
2036 decode(m, mp);
2037 auto it = m.find("osd_objectstore");
2038 if (it != m.end()) {
2039 if (it->second == "filestore") {
2040 filestore_osds.insert(p->first);
2041 } else {
2042 filestore_osds.erase(p->first);
2043 }
2044 }
2045 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2046 }
2047 for (set<int>::iterator p = pending_metadata_rm.begin();
2048 p != pending_metadata_rm.end();
2049 ++p) {
2050 filestore_osds.erase(*p);
2051 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2052 }
2053 pending_metadata.clear();
2054 pending_metadata_rm.clear();
2055
2056 // purged_snaps
2057 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2058 !pending_inc.new_purged_snaps.empty()) {
2059 // all snaps purged this epoch (across all pools)
2060 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2061 bufferlist v;
2062 encode(pending_inc.new_purged_snaps, v);
2063 t->put(OSD_SNAP_PREFIX, k, v);
2064 }
2065 for (auto& i : pending_inc.new_purged_snaps) {
2066 for (auto q = i.second.begin();
2067 q != i.second.end();
2068 ++q) {
2069 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2070 pending_inc.epoch,
2071 t);
2072 }
2073 }
2074 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2075 for (auto snap : snaps) {
2076 insert_purged_snap_update(pool, snap, snap + 1,
2077 pending_inc.epoch,
2078 t);
2079 }
2080 }
2081
2082 // health
2083 health_check_map_t next;
2084 tmp.check_health(cct, &next);
2085 // OSD_FILESTORE
2086 check_for_filestore_osds(&next);
2087 encode_health(next, t);
2088 }
2089
2090 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2091 {
2092 bufferlist bl;
2093 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2094 if (r < 0)
2095 return r;
2096 try {
2097 auto p = bl.cbegin();
2098 decode(m, p);
2099 }
2100 catch (ceph::buffer::error& e) {
2101 if (err)
2102 *err << "osd." << osd << " metadata is corrupt";
2103 return -EIO;
2104 }
2105 return 0;
2106 }
2107
2108 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2109 {
2110 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2111 if (osdmap.is_up(osd)) {
2112 map<string,string> meta;
2113 load_metadata(osd, meta, nullptr);
2114 auto p = meta.find(field);
2115 if (p == meta.end()) {
2116 (*out)["unknown"]++;
2117 } else {
2118 (*out)[p->second]++;
2119 }
2120 }
2121 }
2122 }
2123
2124 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2125 {
2126 map<string,int> by_val;
2127 count_metadata(field, &by_val);
2128 f->open_object_section(field.c_str());
2129 for (auto& p : by_val) {
2130 f->dump_int(p.first.c_str(), p.second);
2131 }
2132 f->close_section();
2133 }
2134
2135 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2136 {
2137 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2138 if (osdmap.is_up(osd)) {
2139 map<string,string> meta;
2140 load_metadata(osd, meta, nullptr);
2141 auto p = meta.find("ceph_version_short");
2142 if (p == meta.end()) continue;
2143 versions[p->second].push_back(string("osd.") + stringify(osd));
2144 }
2145 }
2146 }
2147
2148 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2149 {
2150 map<string, string> metadata;
2151 int r = load_metadata(osd, metadata, nullptr);
2152 if (r < 0)
2153 return r;
2154
2155 auto it = metadata.find("osd_objectstore");
2156 if (it == metadata.end())
2157 return -ENOENT;
2158 *type = it->second;
2159 return 0;
2160 }
2161
2162 void OSDMonitor::get_filestore_osd_list()
2163 {
2164 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2165 string objectstore_type;
2166 int r = get_osd_objectstore_type(osd, &objectstore_type);
2167 if (r == 0 && objectstore_type == "filestore") {
2168 filestore_osds.insert(osd);
2169 }
2170 }
2171 }
2172
2173 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2174 {
2175 if (g_conf()->mon_warn_on_filestore_osds &&
2176 filestore_osds.size() > 0) {
2177 ostringstream ss, deprecated_tip;
2178 list<string> detail;
2179 ss << filestore_osds.size()
2180 << " osd(s) "
2181 << (filestore_osds.size() == 1 ? "is" : "are")
2182 << " running Filestore";
2183 deprecated_tip << ss.str();
2184 ss << " [Deprecated]";
2185 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2186 filestore_osds.size());
2187 deprecated_tip << ", which has been deprecated and"
2188 << " not been optimized for QoS"
2189 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2190 detail.push_back(deprecated_tip.str());
2191 d.detail.swap(detail);
2192 }
2193 }
2194
2195 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2196 const pg_pool_t &pool,
2197 ostream *err)
2198 {
2199 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2200 // since filestore osds could always join the pool later
2201 set<int> checked_osds;
2202 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2203 vector<int> up, acting;
2204 pg_t pgid(ps, pool_id);
2205 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2206 for (int osd : up) {
2207 if (checked_osds.find(osd) != checked_osds.end())
2208 continue;
2209 string objectstore_type;
2210 int r = get_osd_objectstore_type(osd, &objectstore_type);
2211 // allow with missing metadata, e.g. due to an osd never booting yet
2212 if (r < 0 || objectstore_type == "bluestore") {
2213 checked_osds.insert(osd);
2214 continue;
2215 }
2216 *err << "osd." << osd << " uses " << objectstore_type;
2217 return false;
2218 }
2219 }
2220 return true;
2221 }
2222
2223 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2224 {
2225 map<string,string> m;
2226 if (int r = load_metadata(osd, m, err))
2227 return r;
2228 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2229 f->dump_string(p->first.c_str(), p->second);
2230 return 0;
2231 }
2232
2233 void OSDMonitor::print_nodes(Formatter *f)
2234 {
2235 // group OSDs by their hosts
2236 map<string, list<int> > osds; // hostname => osd
2237 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2238 map<string, string> m;
2239 if (load_metadata(osd, m, NULL)) {
2240 continue;
2241 }
2242 map<string, string>::iterator hostname = m.find("hostname");
2243 if (hostname == m.end()) {
2244 // not likely though
2245 continue;
2246 }
2247 osds[hostname->second].push_back(osd);
2248 }
2249
2250 dump_services(f, osds, "osd");
2251 }
2252
2253 void OSDMonitor::share_map_with_random_osd()
2254 {
2255 if (osdmap.get_num_up_osds() == 0) {
2256 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2257 return;
2258 }
2259
2260 MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
2261 if (!s) {
2262 dout(10) << __func__ << " no up osd on our session map" << dendl;
2263 return;
2264 }
2265
2266 dout(10) << "committed, telling random " << s->name
2267 << " all about it" << dendl;
2268
2269 // get feature of the peer
2270 // use quorum_con_features, if it's an anonymous connection.
2271 uint64_t features = s->con_features ? s->con_features :
2272 mon.get_quorum_con_features();
2273 // whatev, they'll request more if they need it
2274 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2275 s->con->send_message(m);
2276 // NOTE: do *not* record osd has up to this epoch (as we do
2277 // elsewhere) as they may still need to request older values.
2278 }
2279
// Compute the highest osdmap version we may trim up to, or 0 when
// trimming must be deferred (no quorum, pgs still being created, trim
// debug-blocked, or not enough epochs above the safe floor).
version_t OSDMonitor::get_trim_to() const
{
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    // don't trim while pgs are still being created; their creation
    // state references older maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	    << " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // start from the oldest epoch some osd may still need
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // an operator may force a specific trim point via config
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
	       << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs maps
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim point if it actually advances past what is
    // already trimmed
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2327
2328 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2329 {
2330 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2331 // also scan osd epochs
2332 // don't trim past the oldest reported osd epoch
2333 for (auto [osd, epoch] : osd_epochs) {
2334 if (epoch < floor) {
2335 floor = epoch;
2336 }
2337 }
2338 return floor;
2339 }
2340
2341 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2342 version_t first)
2343 {
2344 dout(10) << __func__ << " including full map for e " << first << dendl;
2345 bufferlist bl;
2346 get_version_full(first, bl);
2347 put_version_full(tx, first, bl);
2348
2349 if (has_osdmap_manifest &&
2350 first > osdmap_manifest.get_first_pinned()) {
2351 _prune_update_trimmed(tx, first);
2352 }
2353 }
2354
2355
2356 /* full osdmap prune
2357 *
2358 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2359 */
2360
2361 void OSDMonitor::load_osdmap_manifest()
2362 {
2363 bool store_has_manifest =
2364 mon.store->exists(get_service_name(), "osdmap_manifest");
2365
2366 if (!store_has_manifest) {
2367 if (!has_osdmap_manifest) {
2368 return;
2369 }
2370
2371 dout(20) << __func__
2372 << " dropping osdmap manifest from memory." << dendl;
2373 osdmap_manifest = osdmap_manifest_t();
2374 has_osdmap_manifest = false;
2375 return;
2376 }
2377
2378 dout(20) << __func__
2379 << " osdmap manifest detected in store; reload." << dendl;
2380
2381 bufferlist manifest_bl;
2382 int r = get_value("osdmap_manifest", manifest_bl);
2383 if (r < 0) {
2384 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2385 ceph_abort_msg("error reading manifest");
2386 }
2387 osdmap_manifest.decode(manifest_bl);
2388 has_osdmap_manifest = true;
2389
2390 dout(10) << __func__ << " store osdmap manifest pinned ("
2391 << osdmap_manifest.get_first_pinned()
2392 << " .. "
2393 << osdmap_manifest.get_last_pinned()
2394 << ")"
2395 << dendl;
2396 }
2397
// Decide whether a prune pass should run now, based on how many full
// map epochs we hold relative to the configured minimums and the
// progress recorded in the on-disk manifest.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // never touch the newest min_osdmap_epochs maps
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous pass already pinned everything we could pin now
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits between the last pinned
    // epoch and the highest prunable epoch
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2457
// Called while trimming up to 'first': drop all pinned epochs below
// 'first' from the manifest (pinning 'first' itself first, so the
// manifest's floor matches the store's), and either persist the
// shrunken manifest or erase it entirely once no useful pins remain.
void OSDMonitor::_prune_update_trimmed(
  MonitorDBStore::TransactionRef tx,
  version_t first)
{
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  // work on a copy; the in-memory manifest is refreshed on commit
  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase every pinned epoch strictly below 'first'
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2491
// Seed 'manifest' with the first epoch to pin for a prune pass: the
// first committed epoch when we have never pruned (or prior state was
// fully trimmed away), otherwise the last epoch pinned by the previous
// pass.  Asserts that the in-memory/on-disk manifest state is coherent.
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2526
2527 bool OSDMonitor::_prune_sanitize_options() const
2528 {
2529 uint64_t prune_interval =
2530 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2531 uint64_t prune_min =
2532 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2533 uint64_t txsize =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2535
2536 bool r = true;
2537
2538 if (prune_interval == 0) {
2539 derr << __func__
2540 << " prune is enabled BUT prune interval is zero; abort."
2541 << dendl;
2542 r = false;
2543 } else if (prune_interval == 1) {
2544 derr << __func__
2545 << " prune interval is equal to one, which essentially means"
2546 " no pruning; abort."
2547 << dendl;
2548 r = false;
2549 }
2550 if (prune_min == 0) {
2551 derr << __func__
2552 << " prune is enabled BUT prune min is zero; abort."
2553 << dendl;
2554 r = false;
2555 }
2556 if (prune_interval > prune_min) {
2557 derr << __func__
2558 << " impossible to ascertain proper prune interval because"
2559 << " it is greater than the minimum prune epochs"
2560 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2561 << dendl;
2562 r = false;
2563 }
2564
2565 if (txsize < prune_interval - 1) {
2566 derr << __func__
2567 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2568 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2569 << "); abort." << dendl;
2570 r = false;
2571 }
2572 return r;
2573 }
2574
2575 bool OSDMonitor::is_prune_enabled() const {
2576 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2577 }
2578
2579 bool OSDMonitor::is_prune_supported() const {
2580 return mon.get_required_mon_features().contains_any(
2581 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2582 }
2583
/** do_prune
 *
 * Prune full osdmap versions from the store: pin one version every
 * `mon_osdmap_full_prune_interval` epochs and erase the versions in
 * between, removing at most `mon_osdmap_full_prune_txsize` versions in
 * this transaction.  The updated manifest is persisted alongside the
 * erasures.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
          << ( enabled ? "enabled" : "disabled")
          << dendl;

  // bail out early unless pruning is on, sanely configured, and due
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    // stop once a full interval no longer fits below last_to_pin
    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // both interval endpoints must still exist in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase everything strictly between the two pinned endpoints
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  ceph_assert(num_pruned > 0);

  // persist the updated manifest alongside the erasures
  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2712
2713
2714 // -------------
2715
// Dispatch an incoming message for read-only handling.  Returns true
// when the op was fully handled (or rejected) here; returns false when
// the op needs a map update, in which case prepare_update() will be
// invoked for it next.
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // routing bug: this service should never receive other types
    ceph_abort();
    return true;
  }
}
2771
// Dispatch a message that requires updating the pending osdmap.
// Returns true if the update should be proposed (or the op was
// consumed); false otherwise.  Only called for ops that
// preprocess_query() declined to fully handle.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // routing bug: this service should never receive other types
    ceph_abort();
  }

  return false;
}
2823
2824 bool OSDMonitor::should_propose(double& delay)
2825 {
2826 dout(10) << "should_propose" << dendl;
2827
2828 // if full map, propose immediately! any subsequent changes will be clobbered.
2829 if (pending_inc.fullmap.length())
2830 return true;
2831
2832 // adjust osd weights?
2833 if (!osd_weight.empty() &&
2834 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2835 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2836 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2837 delay = 0.0;
2838 osd_weight.clear();
2839 return true;
2840 }
2841
2842 return PaxosService::should_propose(delay);
2843 }
2844
2845
2846
2847 // ---------------------------
2848 // READs
2849
// Answer an explicit MMonGetOSDMap request with the requested ranges of
// full and incremental maps, bounded by the configured per-message map
// count and byte limits.  Always fully handled here (returns true).
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the requester's connection features when known;
  // otherwise fall back to the quorum's features
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first; stop when either the count or the byte budget
  // runs out
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // then incrementals, drawing from the same remaining budgets
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2886
2887
2888 // ---------------------------
2889 // UPDATEs
2890
2891 // failure --
2892
// Sanity-check the sender of an osd-originated message.  Note the
// inverted sense: returns true when the message should be REJECTED
// (no session, insufficient 'osd' caps, or fsid mismatch), false when
// the source checks out.
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
	    << session->caps << dendl;
    return true;
  }
  if (fsid != mon.monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
	    << " != " << mon.monmap->fsid << dendl;
    return true;
  }
  return false;
}
2910
2911
// Filter an incoming MOSDFailure report.  Returns true when the report
// can be answered or dropped right here (bad source, target already
// down, wrong target addrs, dup/old report, or target we may not mark
// down); returns false when the report is new and must proceed to
// prepare_failure().
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, renamed, or itself down; send it a newer
      // map so it can catch up
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to an older instance of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // genuinely new failure report; let prepare_failure() record it
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2983
2984 class C_AckMarkedDown : public C_MonOp {
2985 OSDMonitor *osdmon;
2986 public:
2987 C_AckMarkedDown(
2988 OSDMonitor *osdmon,
2989 MonOpRequestRef op)
2990 : C_MonOp(op), osdmon(osdmon) {}
2991
2992 void _finish(int r) override {
2993 if (r == 0) {
2994 auto m = op->get_req<MOSDMarkMeDown>();
2995 osdmon->mon.send_reply(
2996 op,
2997 new MOSDMarkMeDown(
2998 m->fsid,
2999 m->target_osd,
3000 m->target_addrs,
3001 m->get_epoch(),
3002 false)); // ACK itself does not request an ack
3003 } else if (r == -EAGAIN) {
3004 osdmon->dispatch(op);
3005 } else {
3006 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
3007 }
3008 }
3009 ~C_AckMarkedDown() override {
3010 }
3011 };
3012
3013 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
3014 {
3015 op->mark_osdmon_event(__func__);
3016 auto m = op->get_req<MOSDMarkMeDown>();
3017 int from = m->target_osd;
3018
3019 // check permissions
3020 if (check_source(op, m->fsid))
3021 goto reply;
3022
3023 // first, verify the reporting host is valid
3024 if (!m->get_orig_source().is_osd())
3025 goto reply;
3026
3027 if (!osdmap.exists(from) ||
3028 osdmap.is_down(from) ||
3029 osdmap.get_addrs(from) != m->target_addrs) {
3030 dout(5) << "preprocess_mark_me_down from dead osd."
3031 << from << ", ignoring" << dendl;
3032 send_incremental(op, m->get_epoch()+1);
3033 goto reply;
3034 }
3035
3036 // no down might be set
3037 if (!can_mark_down(from))
3038 goto reply;
3039
3040 dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
3041 << " " << m->target_addrs << dendl;
3042 return false;
3043
3044 reply:
3045 if (m->request_ack) {
3046 Context *c(new C_AckMarkedDown(this, op));
3047 c->complete(0);
3048 }
3049 return true;
3050 }
3051
3052 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
3053 {
3054 op->mark_osdmon_event(__func__);
3055 auto m = op->get_req<MOSDMarkMeDown>();
3056 int target_osd = m->target_osd;
3057
3058 ceph_assert(osdmap.is_up(target_osd));
3059 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
3060
3061 mon.clog->info() << "osd." << target_osd << " marked itself down";
3062 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3063 if (m->request_ack)
3064 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
3065 return true;
3066 }
3067
3068 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
3069 {
3070 op->mark_osdmon_event(__func__);
3071 auto m = op->get_req<MOSDMarkMeDead>();
3072 int from = m->target_osd;
3073
3074 // check permissions
3075 if (check_source(op, m->fsid)) {
3076 mon.no_reply(op);
3077 return true;
3078 }
3079
3080 // first, verify the reporting host is valid
3081 if (!m->get_orig_source().is_osd()) {
3082 mon.no_reply(op);
3083 return true;
3084 }
3085
3086 if (!osdmap.exists(from) ||
3087 !osdmap.is_down(from)) {
3088 dout(5) << __func__ << " from nonexistent or up osd." << from
3089 << ", ignoring" << dendl;
3090 send_incremental(op, m->get_epoch()+1);
3091 mon.no_reply(op);
3092 return true;
3093 }
3094
3095 return false;
3096 }
3097
3098 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
3099 {
3100 op->mark_osdmon_event(__func__);
3101 auto m = op->get_req<MOSDMarkMeDead>();
3102 int target_osd = m->target_osd;
3103
3104 ceph_assert(osdmap.is_down(target_osd));
3105
3106 mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
3107 << m->get_epoch();
3108 if (!pending_inc.new_xinfo.count(target_osd)) {
3109 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3110 }
3111 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
3112 wait_for_finished_proposal(
3113 op,
3114 new LambdaContext(
3115 [op, this] (int r) {
3116 if (r >= 0) {
3117 mon.no_reply(op); // ignore on success
3118 }
3119 }
3120 ));
3121 return true;
3122 }
3123
3124 bool OSDMonitor::can_mark_down(int i)
3125 {
3126 if (osdmap.is_nodown(i)) {
3127 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3128 << "will not mark it down" << dendl;
3129 return false;
3130 }
3131
3132 int num_osds = osdmap.get_num_osds();
3133 if (num_osds == 0) {
3134 dout(5) << __func__ << " no osds" << dendl;
3135 return false;
3136 }
3137 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3138 float up_ratio = (float)up / (float)num_osds;
3139 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3140 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3141 << g_conf()->mon_osd_min_up_ratio
3142 << ", will not mark osd." << i << " down" << dendl;
3143 return false;
3144 }
3145 return true;
3146 }
3147
3148 bool OSDMonitor::can_mark_up(int i)
3149 {
3150 if (osdmap.is_noup(i)) {
3151 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3152 << "will not mark it up" << dendl;
3153 return false;
3154 }
3155
3156 return true;
3157 }
3158
3159 /**
3160 * @note the parameter @p i apparently only exists here so we can output the
3161 * osd's id on messages.
3162 */
3163 bool OSDMonitor::can_mark_out(int i)
3164 {
3165 if (osdmap.is_noout(i)) {
3166 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3167 << "will not mark it out" << dendl;
3168 return false;
3169 }
3170
3171 int num_osds = osdmap.get_num_osds();
3172 if (num_osds == 0) {
3173 dout(5) << __func__ << " no osds" << dendl;
3174 return false;
3175 }
3176 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3177 float in_ratio = (float)in / (float)num_osds;
3178 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3179 if (i >= 0)
3180 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3181 << g_conf()->mon_osd_min_in_ratio
3182 << ", will not mark osd." << i << " out" << dendl;
3183 else
3184 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3185 << g_conf()->mon_osd_min_in_ratio
3186 << ", will not mark osds out" << dendl;
3187 return false;
3188 }
3189
3190 return true;
3191 }
3192
3193 bool OSDMonitor::can_mark_in(int i)
3194 {
3195 if (osdmap.is_noin(i)) {
3196 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3197 << "will not mark it in" << dendl;
3198 return false;
3199 }
3200
3201 return true;
3202 }
3203
3204 bool OSDMonitor::check_failures(utime_t now)
3205 {
3206 bool found_failure = false;
3207 auto p = failure_info.begin();
3208 while (p != failure_info.end()) {
3209 auto& [target_osd, fi] = *p;
3210 if (can_mark_down(target_osd) &&
3211 check_failure(now, target_osd, fi)) {
3212 found_failure = true;
3213 ++p;
3214 } else if (is_failure_stale(now, fi)) {
3215 dout(10) << " dropping stale failure_info for osd." << target_osd
3216 << " from " << fi.reporters.size() << " reporters"
3217 << dendl;
3218 p = failure_info.erase(p);
3219 } else {
3220 ++p;
3221 }
3222 }
3223 return found_failure;
3224 }
3225
3226 utime_t OSDMonitor::get_grace_time(utime_t now,
3227 int target_osd,
3228 failure_info_t& fi) const
3229 {
3230 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3231 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3232 return orig_grace;
3233 }
3234 utime_t grace = orig_grace;
3235 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3236 double decay_k = ::log(.5) / halflife;
3237
3238 // scale grace period based on historical probability of 'lagginess'
3239 // (false positive failures due to slowness).
3240 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3241 const utime_t failed_for = now - fi.get_failed_since();
3242 double decay = exp((double)failed_for * decay_k);
3243 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3244 << " failed_for " << failed_for << " decay " << decay << dendl;
3245 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3246 grace += my_grace;
3247
3248 // consider the peers reporting a failure a proxy for a potential
3249 // 'subcluster' over the overall cluster that is similarly
3250 // laggy. this is clearly not true in all cases, but will sometimes
3251 // help us localize the grace correction to a subset of the system
3252 // (say, a rack with a bad switch) that is unhappy.
3253 double peer_grace = 0;
3254 for (auto& [reporter, report] : fi.reporters) {
3255 if (osdmap.exists(reporter)) {
3256 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3257 utime_t elapsed = now - xi.down_stamp;
3258 double decay = exp((double)elapsed * decay_k);
3259 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3260 }
3261 }
3262 peer_grace /= (double)fi.reporters.size();
3263 grace += peer_grace;
3264 dout(10) << " osd." << target_osd << " has "
3265 << fi.reporters.size() << " reporters, "
3266 << grace << " grace (" << orig_grace << " + " << my_grace
3267 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3268 << dendl;
3269
3270 return grace;
3271 }
3272
3273 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3274 {
3275 // already pending failure?
3276 if (pending_inc.new_state.count(target_osd) &&
3277 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3278 dout(10) << " already pending failure" << dendl;
3279 return true;
3280 }
3281
3282 set<string> reporters_by_subtree;
3283 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3284 ceph_assert(fi.reporters.size());
3285 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3286 // get the parent bucket whose type matches with "reporter_subtree_level".
3287 // fall back to OSD if the level doesn't exist.
3288 if (osdmap.exists(p->first)) {
3289 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3290 if (auto iter = reporter_loc.find(reporter_subtree_level);
3291 iter == reporter_loc.end()) {
3292 reporters_by_subtree.insert("osd." + to_string(p->first));
3293 } else {
3294 reporters_by_subtree.insert(iter->second);
3295 }
3296 ++p;
3297 } else {
3298 fi.cancel_report(p->first);;
3299 p = fi.reporters.erase(p);
3300 }
3301 }
3302 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3303 return false;
3304 }
3305 const utime_t failed_for = now - fi.get_failed_since();
3306 const utime_t grace = get_grace_time(now, target_osd, fi);
3307 if (failed_for >= grace) {
3308 dout(1) << " we have enough reporters to mark osd." << target_osd
3309 << " down" << dendl;
3310 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3311
3312 mon.clog->info() << "osd." << target_osd << " failed ("
3313 << osdmap.crush->get_full_location_ordered_string(
3314 target_osd)
3315 << ") ("
3316 << (int)reporters_by_subtree.size()
3317 << " reporters from different "
3318 << reporter_subtree_level << " after "
3319 << failed_for << " >= grace " << grace << ")";
3320 return true;
3321 }
3322 return false;
3323 }
3324
3325 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3326 {
3327 // if it takes too long to either cancel the report to mark the osd down,
3328 // some reporters must have failed to cancel their reports. let's just
3329 // forget these reports.
3330 const utime_t failed_for = now - fi.get_failed_since();
3331 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3332 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3333 return failed_for >= (heartbeat_grace + heartbeat_stale);
3334 }
3335
3336 void OSDMonitor::force_failure(int target_osd, int by)
3337 {
3338 // already pending failure?
3339 if (pending_inc.new_state.count(target_osd) &&
3340 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3341 dout(10) << " already pending failure" << dendl;
3342 return;
3343 }
3344
3345 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3346 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3347 if (!pending_inc.new_xinfo.count(target_osd)) {
3348 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3349 }
3350 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3351
3352 mon.clog->info() << "osd." << target_osd << " failed ("
3353 << osdmap.crush->get_full_location_ordered_string(target_osd)
3354 << ") (connection refused reported by osd." << by << ")";
3355 return;
3356 }
3357
// Apply an OSD failure report (or its cancellation) to pending state.
// Returns true when the report pushed the target over the failure
// threshold (or was immediate) so a proposal is needed now, false when we
// only recorded/removed the report.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() is expected to have filtered out anything else
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: do not wait for more reporters
      mon.clog->debug() << "osd." << m->get_target_osd()
                        << " reported immediately failed by "
                        << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
                      << " failure report canceled by "
                      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3417
3418 void OSDMonitor::process_failures()
3419 {
3420 map<int,failure_info_t>::iterator p = failure_info.begin();
3421 while (p != failure_info.end()) {
3422 if (osdmap.is_up(p->first)) {
3423 ++p;
3424 } else {
3425 dout(10) << "process_failures osd." << p->first << dendl;
3426 list<MonOpRequestRef> ls;
3427 p->second.take_report_messages(ls);
3428 failure_info.erase(p++);
3429
3430 while (!ls.empty()) {
3431 MonOpRequestRef o = ls.front();
3432 if (o) {
3433 o->mark_event(__func__);
3434 MOSDFailure *m = o->get_req<MOSDFailure>();
3435 send_latest(o, m->get_epoch());
3436 mon.no_reply(o);
3437 }
3438 ls.pop_front();
3439 }
3440 }
3441 }
3442 }
3443
3444 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3445 {
3446 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3447
3448 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3449 p != failure_info.end();
3450 ++p) {
3451 p->second.take_report_messages(ls);
3452 }
3453 failure_info.clear();
3454 }
3455
3456 int OSDMonitor::get_grace_interval_threshold()
3457 {
3458 int halflife = g_conf()->mon_osd_laggy_halflife;
3459 // Scale the halflife period (default: 1_hr) by
3460 // a factor (48) to calculate the threshold.
3461 int grace_threshold_factor = 48;
3462 return halflife * grace_threshold_factor;
3463 }
3464
3465 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3466 {
3467 int grace_interval_threshold_secs = get_grace_interval_threshold();
3468 if (last_failed_interval > grace_interval_threshold_secs) {
3469 dout(1) << " last_failed_interval " << last_failed_interval
3470 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3471 << dendl;
3472 return true;
3473 }
3474 return false;
3475 }
3476
3477 void OSDMonitor::set_default_laggy_params(int target_osd)
3478 {
3479 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3480 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3481 }
3482 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3483 xi.down_stamp = pending_inc.modified;
3484 xi.laggy_probability = 0.0;
3485 xi.laggy_interval = 0;
3486 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3487 }
3488
3489
3490 // boot --
3491
// Fast checks on an OSD boot message (MOSDBoot).  Returns true when the
// message was handled here (ignored, or answered as a duplicate), false
// when prepare_boot() should update the pending map.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because the osd lacks CEPH_FEATURE_SERVER_OCTOPUS";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
      osdmap.require_osd_release < ceph_release_t::nautilus) {
    mon.clog->info() << "disallowing boot of pacific+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < nautilus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < octopus";
    goto ignore;
  }

  // stretch mode requires explicit feature support from every osd
  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // the id is already bound to a different daemon (different osd_fsid)?
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message that predates the osd's last up_from is stale
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3597
// Update the pending map to bring a booting osd up: mark a stale
// incarnation down first if needed, otherwise record addrs, uuid,
// metadata, clean-interval, laggy estimates, features, and possibly the
// 'in' weight.  The reply is deferred until the proposal commits.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
          << " sb " << m->sb
          << " client_addrs" << m->get_connection()->get_peer_addrs()
          << " cluster_addrs " << m->cluster_addrs
          << " hb_back_addrs " << m->hb_back_addrs
          << " hb_front_addrs " << m->hb_front_addrs
          << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
            << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective old state = committed state with any pending XOR delta applied
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
            << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
                  m->get_orig_source_addrs()) ||
                !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot message once the down proposal has committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
            << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
             << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        // a fresh osd (newest_map == 0) carries no data from its previous
        // incarnation, so record the data as lost too
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end
               << ") -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] =
        pair<epoch_t,epoch_t>(begin, end);
    }

    // update the exponentially-weighted laggy estimates in xinfo
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // no prior boot epoch: treat as not laggy; decay the estimates
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after being marked down: fold the observed down interval
      // (capped at mon_osd_laggy_max_interval) into the laggy estimates
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
          xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
            (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval = g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
          interval * g_conf()->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf()->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
         (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        if (xi.old_weight > 0) {
          // restore the weight the osd had before it was auto-marked out
          pending_inc.new_weight[from] = xi.old_weight;
          xi.old_weight = 0;
        } else {
          pending_inc.new_weight[from] = CEPH_OSD_IN;
        }
      } else {
        dout(7) << __func__ << " NOIN set, will not mark in "
                << m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3752
3753 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3754 {
3755 op->mark_osdmon_event(__func__);
3756 auto m = op->get_req<MOSDBoot>();
3757 dout(7) << "_booted " << m->get_orig_source_inst()
3758 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3759
3760 if (logit) {
3761 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3762 << " boot";
3763 }
3764
3765 send_latest(op, m->sb.current_epoch+1);
3766 }
3767
3768
3769 // -------------
3770 // full
3771
3772 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3773 {
3774 op->mark_osdmon_event(__func__);
3775 auto m = op->get_req<MOSDFull>();
3776 int from = m->get_orig_source().num();
3777 set<string> state;
3778 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3779
3780 // check permissions, ignore if failed
3781 MonSession *session = op->get_session();
3782 if (!session)
3783 goto ignore;
3784 if (!session->is_capable("osd", MON_CAP_X)) {
3785 dout(0) << "MOSDFull from entity with insufficient privileges:"
3786 << session->caps << dendl;
3787 goto ignore;
3788 }
3789
3790 // ignore a full message from the osd instance that already went down
3791 if (!osdmap.exists(from)) {
3792 dout(7) << __func__ << " ignoring full message from nonexistent "
3793 << m->get_orig_source_inst() << dendl;
3794 goto ignore;
3795 }
3796 if ((!osdmap.is_up(from) &&
3797 osdmap.get_most_recent_addrs(from).legacy_equals(
3798 m->get_orig_source_addrs())) ||
3799 (osdmap.is_up(from) &&
3800 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3801 dout(7) << __func__ << " ignoring full message from down "
3802 << m->get_orig_source_inst() << dendl;
3803 goto ignore;
3804 }
3805
3806 OSDMap::calc_state_set(osdmap.get_state(from), state);
3807
3808 if ((osdmap.get_state(from) & mask) == m->state) {
3809 dout(7) << __func__ << " state already " << state << " for osd." << from
3810 << " " << m->get_orig_source_inst() << dendl;
3811 _reply_map(op, m->version);
3812 goto ignore;
3813 }
3814
3815 dout(10) << __func__ << " want state " << state << " for osd." << from
3816 << " " << m->get_orig_source_inst() << dendl;
3817 return false;
3818
3819 ignore:
3820 return true;
3821 }
3822
// Apply a requested full/backfillfull/nearfull state change for an osd.
// pending_inc.new_state holds XOR deltas against the committed map, so we
// must compose the wanted bits with any change already pending.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state with pending XOR applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // clear any pending fullness bits; the delta is recomputed below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed vs wanted fullness bits is exactly the delta to flip
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3860
3861 // -------------
3862 // alive
3863
3864 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3865 {
3866 op->mark_osdmon_event(__func__);
3867 auto m = op->get_req<MOSDAlive>();
3868 int from = m->get_orig_source().num();
3869
3870 // check permissions, ignore if failed
3871 MonSession *session = op->get_session();
3872 if (!session)
3873 goto ignore;
3874 if (!session->is_capable("osd", MON_CAP_X)) {
3875 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3876 << session->caps << dendl;
3877 goto ignore;
3878 }
3879
3880 if (!osdmap.is_up(from) ||
3881 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3882 dout(7) << "preprocess_alive ignoring alive message from down "
3883 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3884 << dendl;
3885 goto ignore;
3886 }
3887
3888 if (osdmap.get_up_thru(from) >= m->want) {
3889 // yup.
3890 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3891 _reply_map(op, m->version);
3892 return true;
3893 }
3894
3895 dout(10) << "preprocess_alive want up_thru " << m->want
3896 << " from " << m->get_orig_source_inst() << dendl;
3897 return false;
3898
3899 ignore:
3900 return true;
3901 }
3902
3903 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3904 {
3905 op->mark_osdmon_event(__func__);
3906 auto m = op->get_req<MOSDAlive>();
3907 int from = m->get_orig_source().num();
3908
3909 if (0) { // we probably don't care much about these
3910 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3911 }
3912
3913 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3914 << " from " << m->get_orig_source_inst() << dendl;
3915
3916 update_up_thru(from, m->version); // set to the latest map the OSD has
3917 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3918 return true;
3919 }
3920
// Reply to `op` by sending the requester the latest osdmap(s) starting at
// epoch e (full map when e == 0; see send_latest()).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3929
3930 // pg_created
// Read-side handler for MOSDPGCreated acknowledgements. Returns true when
// the message is dropped here (no session / insufficient caps); returns
// false so the message is forwarded to the leader's prepare_pg_created().
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  // pg-created messages never get a direct reply
  mon.no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3950
3951 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3952 {
3953 op->mark_osdmon_event(__func__);
3954 auto m = op->get_req<MOSDPGCreated>();
3955 dout(10) << __func__ << " " << *m << dendl;
3956 auto src = m->get_orig_source();
3957 auto from = src.num();
3958 if (!src.is_osd() ||
3959 !mon.osdmon()->osdmap.is_up(from) ||
3960 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3961 m->get_orig_source_addrs())) {
3962 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3963 return false;
3964 }
3965 pending_created_pgs.push_back(m->pgid);
3966 return true;
3967 }
3968
// Read-side validation of MOSDPGReadyToMerge. Returns true when the message
// is dropped here (bad session/caps, missing pool, or stale pg-merge state);
// returns false to forward it to prepare_pg_ready_to_merge() on the leader.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // declared up front because the caps checks below jump past it via goto
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // merge already happened: pgid is no longer within the pool's pg_num
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the current merge source (the last pg, ps == pg_num - 1) may ask
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // no merge is actually pending for this pg
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon.no_reply(op);
  return true;
}
4008
// Leader-side commit handler for MOSDPGReadyToMerge: apply (or back off) a
// pending pg merge on the pool, then reply once the map change commits.
// Always returns true.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // work on the pending copy of the pool if one exists, else the committed one
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // re-validate against the pending state (preprocess only checked the
  // committed map); if pg_num[_pending] moved underneath us, retry after the
  // in-flight proposal lands
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // complete the merge: drop pg_num by one and record merge metadata
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: with probability mon_inject_pg_merge_bounce_probability,
  // immediately bounce pg_num back up via a self-injected "osd pool set"
  // command to exercise merge-cancellation paths
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon.monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
4066
4067
4068 // -------------
4069 // pg_temp changes
4070
// Read-side handler for MOSDPGTemp. Consumes the message (returns true) when
// nothing would change or it must be dropped; returns false to forward to
// prepare_pgtemp() when at least one mapping needs a map update.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  // placeholder printed for pgs that have no current pg_temp entry
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // only accept from an osd that is up at the same address
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced pg_temp bypasses the no-op filtering below entirely
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // everything requested is already in place: just reply with the map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon.no_reply(op);
  return true;
}
4164
4165 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4166 {
4167 epoch_t old_up_thru = osdmap.get_up_thru(from);
4168 auto ut = pending_inc.new_up_thru.find(from);
4169 if (ut != pending_inc.new_up_thru.end()) {
4170 old_up_thru = ut->second;
4171 }
4172 if (up_thru > old_up_thru) {
4173 // set up_thru too, so the osd doesn't have to ask again
4174 pending_inc.new_up_thru[from] = up_thru;
4175 }
4176 }
4177
4178 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4179 {
4180 op->mark_osdmon_event(__func__);
4181 auto m = op->get_req<MOSDPGTemp>();
4182 int from = m->get_orig_source().num();
4183 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4184 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4185 uint64_t pool = p->first.pool();
4186 if (pending_inc.old_pools.count(pool)) {
4187 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4188 << ": pool pending removal" << dendl;
4189 continue;
4190 }
4191 if (!osdmap.have_pg_pool(pool)) {
4192 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4193 << ": pool has been removed" << dendl;
4194 continue;
4195 }
4196 pending_inc.new_pg_temp[p->first] =
4197 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4198
4199 // unconditionally clear pg_primary (until this message can encode
4200 // a change for that, too.. at which point we need to also fix
4201 // preprocess_pg_temp)
4202 if (osdmap.primary_temp->count(p->first) ||
4203 pending_inc.new_primary_temp.count(p->first))
4204 pending_inc.new_primary_temp[p->first] = -1;
4205 }
4206
4207 // set up_thru too, so the osd doesn't have to ask again
4208 update_up_thru(from, m->map_epoch);
4209
4210 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4211 return true;
4212 }
4213
4214
4215 // ---
4216
// Read-side handler for MRemoveSnaps. Consumes the message (returns true)
// when every referenced snap is already removed or belongs to a nonexistent
// pool; returns false to forward to prepare_remove_snaps() when any snap
// still needs to be queued for removal.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon.no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
	       << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snapid past the pool's snap_seq, or one not yet marked removed,
      // still requires a map change -> go to prepare
      if (*p > pi->get_snap_seq() ||
	  !_is_removed_snap(q->first, *p)) {
	return false;
      }
    }
  }

  // nothing to do, but octopus+ senders still expect an explicit ack
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon.send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4266
// Leader-side commit handler for MRemoveSnaps: queue each not-yet-removed
// snap for removal in the pending incremental (and, pre-octopus, in the
// pool's removed_snaps interval set). Octopus+ senders get an ack once the
// proposal commits. Always returns true.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
	       << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map, already staged in
      // the pending pool's removed_snaps, or already queued for removal
      if (!_is_removed_snap(pool, s) &&
	  (!pending_inc.new_pools.count(pool) ||
	   !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
	  (!pending_inc.new_removed_snaps.count(pool) ||
	   !pending_inc.new_removed_snaps[pool].contains(s))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
	if (osdmap.require_osd_release < ceph_release_t::octopus) {
	  // legacy (pre-octopus) encoding tracks removed snaps in the pool
	  newpi->removed_snaps.insert(s);
	  dout(10) << " pool " << pool << " removed_snaps added " << s
		   << " (now " << newpi->removed_snaps << ")" << dendl;
	}
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	// keep snap_seq monotonic and at least as large as any removed snap
	if (s > newpi->get_snap_seq()) {
	  dout(10) << " pool " << pool << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << s << dendl;
	  newpi->set_snap_seq(s);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	dout(10) << " added pool " << pool << " snap " << s
		 << " to removed_snaps queue" << dendl;
	pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4315
4316 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4317 {
4318 op->mark_osdmon_event(__func__);
4319 auto m = op->get_req<MMonGetPurgedSnaps>();
4320 dout(7) << __func__ << " " << *m << dendl;
4321
4322 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4323
4324 string k = make_purged_snap_epoch_key(m->start);
4325 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
4326 it->upper_bound(k);
4327 unsigned long epoch = m->last;
4328 while (it->valid()) {
4329 if (it->key().find("purged_epoch_") != 0) {
4330 break;
4331 }
4332 string k = it->key();
4333 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4334 if (n != 1) {
4335 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4336 } else if (epoch > m->last) {
4337 break;
4338 } else {
4339 bufferlist bl = it->value();
4340 auto p = bl.cbegin();
4341 auto &v = r[epoch];
4342 try {
4343 ceph::decode(v, p);
4344 } catch (ceph::buffer::error& e) {
4345 derr << __func__ << " unable to parse value for key '" << it->key()
4346 << "': \n";
4347 bl.hexdump(*_dout);
4348 *_dout << dendl;
4349 }
4350 n += 4 + v.size() * 16;
4351 }
4352 if (n > 1048576) {
4353 // impose a semi-arbitrary limit to message size
4354 break;
4355 }
4356 it->next();
4357 }
4358
4359 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4360 reply->purged_snaps.swap(r);
4361 mon.send_reply(op, reply.detach());
4362
4363 return true;
4364 }
4365
4366 // osd beacon
4367 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4368 {
4369 op->mark_osdmon_event(__func__);
4370 // check caps
4371 auto session = op->get_session();
4372 mon.no_reply(op);
4373 if (!session) {
4374 dout(10) << __func__ << " no monitor session!" << dendl;
4375 return true;
4376 }
4377 if (!session->is_capable("osd", MON_CAP_X)) {
4378 derr << __func__ << " received from entity "
4379 << "with insufficient privileges " << session->caps << dendl;
4380 return true;
4381 }
4382 // Always forward the beacon to the leader, even if they are the same as
4383 // the old one. The leader will mark as down osds that haven't sent
4384 // beacon for a few minutes.
4385 return false;
4386 }
4387
// Leader-side handler for osd beacons: refresh liveness bookkeeping and
// per-pg min_last_epoch_clean data. Returns true (propose a map change)
// only when the beacon advances the osd's last_purged_snaps_scrub stamp.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // liveness bookkeeping: when we last heard from this osd, how often it
  // promises to report, and the newest map epoch it claims to have
  last_osd_report[from].first = ceph_clock_now();
  last_osd_report[from].second = beacon->osd_beacon_report_interval;
  osd_epochs[from] = beacon->version;

  // fold this beacon's min_last_epoch_clean into per-pool/pg bookkeeping
  for (const auto& pg : beacon->pgs) {
    if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
      unsigned pg_num = pool->get_pg_num();
      last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
    }
  }

  // persist a newer last_purged_snaps_scrub stamp in the osd's xinfo; this
  // is the only beacon field that requires a paxos proposal
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4432
4433 // ---------------
4434 // map helpers
4435
4436 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4437 {
4438 op->mark_osdmon_event(__func__);
4439 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4440 << " start " << start << dendl;
4441 if (start == 0)
4442 send_full(op);
4443 else
4444 send_incremental(op, start);
4445 }
4446
4447
4448 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4449 {
4450 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4451 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4452 r->oldest_map = get_first_committed();
4453 r->newest_map = osdmap.get_epoch();
4454 return r;
4455 }
4456
// Build an MOSDMap carrying incremental maps for epochs [from..to], encoded
// for `features`. When an incremental is unavailable for an epoch, fall back
// to the full map for that epoch; abort if neither exists (we must have every
// map between first_committed and the current epoch). Caller owns the result.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; the `e > 0` term also guards against epoch_t (unsigned)
  // wrap-around when from == 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      // no incremental stored for this epoch; send the full map instead
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
	//else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort(); // we should have all maps.
      }
    }
  }
  return m;
}
4490
// Reply to op with a single message carrying the latest full osdmap, encoded
// for the requester's connection features.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon.send_reply(op, build_latest_full(op->get_session()->con_features));
}
4497
4498 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4499 {
4500 op->mark_osdmon_event(__func__);
4501
4502 MonSession *s = op->get_session();
4503 ceph_assert(s);
4504
4505 if (s->proxy_con) {
4506 // oh, we can tell the other mon to do it
4507 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4508 << first << dendl;
4509 MRoute *r = new MRoute(s->proxy_tid, NULL);
4510 r->send_osdmap_first = first;
4511 s->proxy_con->send_message(r);
4512 op->mark_event("reply: send routed send_osdmap_first reply");
4513 } else {
4514 // do it ourselves
4515 send_incremental(first, s, false, op);
4516 }
4517 }
4518
// Send osdmaps in [first..current] to `session`, skipping epochs the session
// already has (session->osd_epoch). If `first` precedes our oldest committed
// map, seed the peer with our oldest full map. When `req` is given, exactly
// one reply message is sent (the requester will re-ask for more); otherwise
// messages are pushed on the session's connection until caught up, or just
// one batch when `onetime` is set.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range has been trimmed; start with our oldest full map
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // one reply per request; the requester asks again for the rest
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch at most osd_map_message_max epochs per message
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // in req/onetime mode we send exactly one batch and stop
    if (onetime || req)
      break;
  }
}
4581
// Fetch incremental map `ver` encoded with the quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon.get_quorum_con_features(), bl);
}
4586
// Re-encode an incremental map bufferlist in place for the given feature
// set, never exceeding the incremental's own canonical encode_features. Any
// embedded full map or crush map is re-encoded with the same (restricted)
// feature mask.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4614
4615 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4616 {
4617 OSDMap m;
4618 auto q = bl.cbegin();
4619 m.decode(q);
4620 // always encode with subset of osdmap's canonical features
4621 uint64_t f = features & m.get_encoding_features();
4622 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4623 << dendl;
4624 bl.clear();
4625 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4626 }
4627
// Fetch incremental map `ver`, re-encoded for `features` when those differ
// significantly from the quorum's. Results are memoized in inc_osd_cache,
// keyed by (ver, significant feature mask). Returns 0 or a PaxosService
// error (e.g. -ENOENT).
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4649
4650 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4651 {
4652 bufferlist inc_bl;
4653 int err = get_version(ver, inc_bl);
4654 ceph_assert(err == 0);
4655 ceph_assert(inc_bl.length());
4656
4657 auto p = inc_bl.cbegin();
4658 inc.decode(p);
4659 dout(10) << __func__ << " "
4660 << " epoch " << inc.epoch
4661 << " inc_crc " << inc.inc_crc
4662 << " full_crc " << inc.full_crc
4663 << " encode_features " << inc.encode_features << dendl;
4664 return 0;
4665 }
4666
// Reconstruct full osdmap `ver` when its stored full encoding has been
// trimmed: start from the closest pinned full map at or below `ver` (or from
// a newer cached reconstruction), replay incrementals up to `ver`, and encode
// the result into bl. Returns -ENOENT when no suitable pinned map exists,
// else 0.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // prefer the newest cached full map between closest_pinned and ver-1 as
  // the starting point, to minimize how many incrementals we replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
			      &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  // replay incrementals; remember the last incremental's encode_features so
  // the final encoding matches how the maps were originally encoded
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
	inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
	f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
	derr << __func__
	     << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
	     << ", expected " << inc.full_crc << ")" << dendl;
	ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4766
// Fetch full map `ver` encoded with the quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon.get_quorum_con_features(), bl);
}
4771
// Fetch full map `ver`, rebuilding it from a pinned map plus incrementals
// when the stored full map has been trimmed, and re-encoding for `features`
// when those differ significantly from the quorum's. Results are memoized in
// full_osd_cache keyed by (ver, significant feature mask).
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4798
4799 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4800 {
4801 dout(10) << "blocklist " << av << " until " << until << dendl;
4802 for (auto a : av.v) {
4803 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4804 a.set_type(entity_addr_t::TYPE_ANY);
4805 } else {
4806 a.set_type(entity_addr_t::TYPE_LEGACY);
4807 }
4808 pending_inc.new_blocklist[a] = until;
4809 }
4810 return pending_inc.epoch;
4811 }
4812
4813 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4814 {
4815 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4816 a.set_type(entity_addr_t::TYPE_ANY);
4817 } else {
4818 a.set_type(entity_addr_t::TYPE_LEGACY);
4819 }
4820 dout(10) << "blocklist " << a << " until " << until << dendl;
4821 pending_inc.new_blocklist[a] = until;
4822 return pending_inc.epoch;
4823 }
4824
4825
4826 void OSDMonitor::check_osdmap_subs()
4827 {
4828 dout(10) << __func__ << dendl;
4829 if (!osdmap.get_epoch()) {
4830 return;
4831 }
4832 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4833 if (osdmap_subs == mon.session_map.subs.end()) {
4834 return;
4835 }
4836 auto p = osdmap_subs->second->begin();
4837 while (!p.end()) {
4838 auto sub = *p;
4839 ++p;
4840 check_osdmap_sub(sub);
4841 }
4842 }
4843
// Service one "osdmap" subscription: if the subscriber is behind, send maps
// from sub->next to the current epoch (incrementals when next >= 1, else the
// latest full map), then drop a one-time sub or advance it past the current
// epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon.session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4859
4860 void OSDMonitor::check_pg_creates_subs()
4861 {
4862 if (!osdmap.get_num_up_osds()) {
4863 return;
4864 }
4865 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4866 mon.with_session_map([this](const MonSessionMap& session_map) {
4867 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4868 if (pg_creates_subs == session_map.subs.end()) {
4869 return;
4870 }
4871 for (auto sub : *pg_creates_subs->second) {
4872 check_pg_creates_sub(sub);
4873 }
4874 });
4875 }
4876
4877 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4878 {
4879 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4880 ceph_assert(sub->type == "osd_pg_creates");
4881 // only send these if the OSD is up. we will check_subs() when they do
4882 // come up so they will get the creates then.
4883 if (sub->session->name.is_osd() &&
4884 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4885 sub->next = send_pg_creates(sub->session->name.num(),
4886 sub->session->con.get(),
4887 sub->next);
4888 }
4889 }
4890
// Enable an application (and optionally one key/value of application
// metadata) on a pool, staging the change in pending_inc. With `force`, an
// existing app_key is overwritten; otherwise insert() leaves any existing
// entry untouched. Requires a plugged paxos and a writeable service
// (asserted), and a luminous+ map.
void OSDMonitor::do_application_enable(int64_t pool_id,
				       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value,
				       bool force)
{
  ceph_assert(paxos.is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
	   << dendl;

  // pool application metadata only exists on luminous+ maps
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the pending copy of the pool if it is already being modified
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // enable the application with no metadata; no-op if already enabled
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // non-forced: insert() does not clobber an existing application entry
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4924
4925 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4926 pool_opts_t::key_t opt,
4927 pool_opts_t::value_t val)
4928 {
4929 auto p = pending_inc.new_pools.try_emplace(
4930 pool_id, *osdmap.get_pg_pool(pool_id));
4931 p.first->second.opts.set(opt, val);
4932 }
4933
4934 unsigned OSDMonitor::scan_for_creating_pgs(
4935 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4936 const mempool::osdmap::set<int64_t>& removed_pools,
4937 utime_t modified,
4938 creating_pgs_t* creating_pgs) const
4939 {
4940 unsigned queued = 0;
4941 for (auto& p : pools) {
4942 int64_t poolid = p.first;
4943 if (creating_pgs->created_pools.count(poolid)) {
4944 dout(10) << __func__ << " already created " << poolid << dendl;
4945 continue;
4946 }
4947 const pg_pool_t& pool = p.second;
4948 int ruleno = pool.get_crush_rule();
4949 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4950 continue;
4951
4952 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4953 const auto created = pool.get_last_change();
4954 if (last_scan_epoch && created <= last_scan_epoch) {
4955 dout(10) << __func__ << " no change in pool " << poolid
4956 << " " << pool << dendl;
4957 continue;
4958 }
4959 if (removed_pools.count(poolid)) {
4960 dout(10) << __func__ << " pool is being removed: " << poolid
4961 << " " << pool << dendl;
4962 continue;
4963 }
4964 dout(10) << __func__ << " queueing pool create for " << poolid
4965 << " " << pool << dendl;
4966 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4967 created, modified);
4968 queued++;
4969 }
4970 return queued;
4971 }
4972
// Rebuild creating_pgs_by_osd_epoch from creating_pgs and the current
// OSDMapMapping: bucket every pg still being created under its acting
// primary and the epoch the create message should advertise.
//
// If a pg was previously queued for the same primary we keep the epoch we
// already advertised; if the primary changed (or the pg is new here) we
// restamp it with the current mapping epoch so the create is (re)sent.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  // creating_pgs / creating_pgs_by_osd_epoch are shared with other
  // threads (e.g. send_pg_creates); mutate them only under this lock
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default: the epoch the pg creation was recorded at
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch already advertised
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5020
5021 epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
5022 {
5023 dout(30) << __func__ << " osd." << osd << " next=" << next
5024 << " " << creating_pgs_by_osd_epoch << dendl;
5025 std::lock_guard<std::mutex> l(creating_pgs_lock);
5026 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
5027 dout(20) << __func__
5028 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
5029 // the subscribers will be updated when the mapping is completed anyway
5030 return next;
5031 }
5032 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
5033 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
5034 return next;
5035 ceph_assert(!creating_pgs_by_epoch->second.empty());
5036
5037 MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
5038 MOSDPGCreate2 *m = nullptr;
5039
5040 bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
5041
5042 epoch_t last = 0;
5043 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
5044 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
5045 auto epoch = epoch_pgs->first;
5046 auto& pgs = epoch_pgs->second;
5047 dout(20) << __func__ << " osd." << osd << " from " << next
5048 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
5049 last = epoch;
5050 for (auto& pg : pgs) {
5051 // Need the create time from the monitor using its clock to set
5052 // last_scrub_stamp upon pg creation.
5053 auto create = creating_pgs.pgs.find(pg.pgid);
5054 ceph_assert(create != creating_pgs.pgs.end());
5055 if (old) {
5056 if (!oldm) {
5057 oldm = new MOSDPGCreate(creating_pgs_epoch);
5058 }
5059 oldm->mkpg.emplace(pg.pgid,
5060 pg_create_t{create->second.create_epoch, pg.pgid, 0});
5061 oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
5062 } else {
5063 if (!m) {
5064 m = new MOSDPGCreate2(creating_pgs_epoch);
5065 }
5066 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
5067 create->second.create_stamp));
5068 if (create->second.history.epoch_created) {
5069 dout(20) << __func__ << " " << pg << " " << create->second.history
5070 << " " << create->second.past_intervals << dendl;
5071 m->pg_extra.emplace(pg, make_pair(create->second.history,
5072 create->second.past_intervals));
5073 }
5074 }
5075 dout(20) << __func__ << " will create " << pg
5076 << " at " << create->second.create_epoch << dendl;
5077 }
5078 }
5079 if (m) {
5080 con->send_message(m);
5081 } else if (oldm) {
5082 con->send_message(oldm);
5083 } else {
5084 dout(20) << __func__ << " osd." << osd << " from " << next
5085 << " has nothing to send" << dendl;
5086 return next;
5087 }
5088
5089 // sub is current through last + 1
5090 return last + 1;
5091 }
5092
5093 // TICK
5094
5095
// Periodic housekeeping for the osdmap service.
//
// Runs on every monitor: refresh the osdmap manifest and rebalance the
// priority cache manager.  On the leader only: time out silent osds,
// process failure reports, auto-mark long-down osds out, expire blocklist
// entries, prune purged snaps, refresh pool status, and propose a new map
// epoch if any of that staged pending changes.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which is the leader's job
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark beacon-silent osds down?
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance the iterator before the body so the erase(o) at the
      // bottom of the loop cannot invalidate it
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  // laggier osds get a proportionally longer grace period
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      // marked out (or no longer down+in): stop tracking this osd
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
    propose_pending();
}
5260
5261 void OSDMonitor::_set_new_cache_sizes()
5262 {
5263 uint64_t cache_size = 0;
5264 int64_t inc_alloc = 0;
5265 int64_t full_alloc = 0;
5266 int64_t kv_alloc = 0;
5267
5268 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5269 cache_size = pcm->get_tuned_mem();
5270 inc_alloc = inc_cache->get_committed_size();
5271 full_alloc = full_cache->get_committed_size();
5272 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5273 }
5274
5275 inc_osd_cache.set_bytes(inc_alloc);
5276 full_osd_cache.set_bytes(full_alloc);
5277
5278 dout(1) << __func__ << " cache_size:" << cache_size
5279 << " inc_alloc: " << inc_alloc
5280 << " full_alloc: " << full_alloc
5281 << " kv_alloc: " << kv_alloc
5282 << dendl;
5283 }
5284
5285 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5286 std::map<int, std::pair<utime_t, int>> &last_osd_report)
5287 {
5288 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5289 if (now - mon.get_leader_since() < timeo) {
5290 // We haven't been the leader for long enough to consider OSD timeouts
5291 return false;
5292 }
5293
5294 int max_osd = osdmap.get_max_osd();
5295 bool new_down = false;
5296
5297 for (int i=0; i < max_osd; ++i) {
5298 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5299 if (!osdmap.exists(i)) {
5300 last_osd_report.erase(i); // if any
5301 continue;
5302 }
5303 if (!osdmap.is_up(i))
5304 continue;
5305 const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
5306 if (t == last_osd_report.end()) {
5307 // it wasn't in the map; start the timer.
5308 last_osd_report[i].first = now;
5309 last_osd_report[i].second = 0;
5310 } else if (can_mark_down(i)) {
5311 utime_t diff = now - t->second.first;
5312 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5313 // to allow for the osd to miss a beacon.
5314 int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
5315 utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
5316 if (diff > max_timeout) {
5317 mon.clog->info() << "osd." << i << " marked down after no beacon for "
5318 << diff << " seconds";
5319 derr << "no beacon from osd." << i << " since " << t->second.first
5320 << ", " << diff << " seconds ago. marking down" << dendl;
5321 pending_inc.new_state[i] = CEPH_OSD_UP;
5322 new_down = true;
5323 }
5324 }
5325 }
5326 return new_down;
5327 }
5328
5329 static void dump_cpu_list(Formatter *f, const char *name,
5330 const string& strlist)
5331 {
5332 cpu_set_t cpu_set;
5333 size_t cpu_set_size;
5334 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5335 return;
5336 }
5337 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5338 f->open_array_section(name);
5339 for (auto cpu : cpus) {
5340 f->dump_int("cpu", cpu);
5341 }
5342 f->close_section();
5343 }
5344
// Dump monitor-side osdmap state for introspection: the osdmap itself,
// per-osd metadata, clean-epoch bookkeeping, committed version bounds,
// the crush map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      // errors are ignored here; absent metadata just yields an empty entry
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osdmap_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5393
namespace {
  // Keys accepted by "osd pool get"; the enum is used both to parse the
  // requested key and to enumerate "all".
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // Return the members of @c first that are not present in @c second
  // (plain set difference).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(choice);
      }
    }
    return result;
  }
}
5428
5429
5430 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5431 {
5432 op->mark_osdmon_event(__func__);
5433 auto m = op->get_req<MMonCommand>();
5434 int r = 0;
5435 bufferlist rdata;
5436 stringstream ss, ds;
5437
5438 cmdmap_t cmdmap;
5439 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5440 string rs = ss.str();
5441 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5442 return true;
5443 }
5444
5445 MonSession *session = op->get_session();
5446 if (!session) {
5447 derr << __func__ << " no session" << dendl;
5448 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5449 return true;
5450 }
5451
5452 string prefix;
5453 cmd_getval(cmdmap, "prefix", prefix);
5454
5455 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5456 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5457
5458 if (prefix == "osd stat") {
5459 if (f) {
5460 f->open_object_section("osdmap");
5461 osdmap.print_summary(f.get(), ds, "", true);
5462 f->close_section();
5463 f->flush(rdata);
5464 } else {
5465 osdmap.print_summary(nullptr, ds, "", true);
5466 rdata.append(ds);
5467 }
5468 }
5469 else if (prefix == "osd dump" ||
5470 prefix == "osd tree" ||
5471 prefix == "osd tree-from" ||
5472 prefix == "osd ls" ||
5473 prefix == "osd getmap" ||
5474 prefix == "osd getcrushmap" ||
5475 prefix == "osd ls-tree" ||
5476 prefix == "osd info") {
5477
5478 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5479 bufferlist osdmap_bl;
5480 int err = get_version_full(epoch, osdmap_bl);
5481 if (err == -ENOENT) {
5482 r = -ENOENT;
5483 ss << "there is no map for epoch " << epoch;
5484 goto reply;
5485 }
5486 ceph_assert(err == 0);
5487 ceph_assert(osdmap_bl.length());
5488
5489 OSDMap *p;
5490 if (epoch == osdmap.get_epoch()) {
5491 p = &osdmap;
5492 } else {
5493 p = new OSDMap;
5494 p->decode(osdmap_bl);
5495 }
5496
5497 auto sg = make_scope_guard([&] {
5498 if (p != &osdmap) {
5499 delete p;
5500 }
5501 });
5502
5503 if (prefix == "osd dump") {
5504 stringstream ds;
5505 if (f) {
5506 f->open_object_section("osdmap");
5507 p->dump(f.get());
5508 f->close_section();
5509 f->flush(ds);
5510 } else {
5511 p->print(ds);
5512 }
5513 rdata.append(ds);
5514 if (!f)
5515 ds << " ";
5516 } else if (prefix == "osd ls") {
5517 if (f) {
5518 f->open_array_section("osds");
5519 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5520 if (osdmap.exists(i)) {
5521 f->dump_int("osd", i);
5522 }
5523 }
5524 f->close_section();
5525 f->flush(ds);
5526 } else {
5527 bool first = true;
5528 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5529 if (osdmap.exists(i)) {
5530 if (!first)
5531 ds << "\n";
5532 first = false;
5533 ds << i;
5534 }
5535 }
5536 }
5537 rdata.append(ds);
5538 } else if (prefix == "osd info") {
5539 int64_t osd_id;
5540 bool do_single_osd = true;
5541 if (!cmd_getval(cmdmap, "id", osd_id)) {
5542 do_single_osd = false;
5543 }
5544
5545 if (do_single_osd && !osdmap.exists(osd_id)) {
5546 ss << "osd." << osd_id << " does not exist";
5547 r = -EINVAL;
5548 goto reply;
5549 }
5550
5551 if (f) {
5552 if (do_single_osd) {
5553 osdmap.dump_osd(osd_id, f.get());
5554 } else {
5555 osdmap.dump_osds(f.get());
5556 }
5557 f->flush(ds);
5558 } else {
5559 if (do_single_osd) {
5560 osdmap.print_osd(osd_id, ds);
5561 } else {
5562 osdmap.print_osds(ds);
5563 }
5564 }
5565 rdata.append(ds);
5566 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5567 string bucket;
5568 if (prefix == "osd tree-from") {
5569 cmd_getval(cmdmap, "bucket", bucket);
5570 if (!osdmap.crush->name_exists(bucket)) {
5571 ss << "bucket '" << bucket << "' does not exist";
5572 r = -ENOENT;
5573 goto reply;
5574 }
5575 int id = osdmap.crush->get_item_id(bucket);
5576 if (id >= 0) {
5577 ss << "\"" << bucket << "\" is not a bucket";
5578 r = -EINVAL;
5579 goto reply;
5580 }
5581 }
5582
5583 vector<string> states;
5584 cmd_getval(cmdmap, "states", states);
5585 unsigned filter = 0;
5586 for (auto& s : states) {
5587 if (s == "up") {
5588 filter |= OSDMap::DUMP_UP;
5589 } else if (s == "down") {
5590 filter |= OSDMap::DUMP_DOWN;
5591 } else if (s == "in") {
5592 filter |= OSDMap::DUMP_IN;
5593 } else if (s == "out") {
5594 filter |= OSDMap::DUMP_OUT;
5595 } else if (s == "destroyed") {
5596 filter |= OSDMap::DUMP_DESTROYED;
5597 } else {
5598 ss << "unrecognized state '" << s << "'";
5599 r = -EINVAL;
5600 goto reply;
5601 }
5602 }
5603 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5604 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5605 ss << "cannot specify both 'in' and 'out'";
5606 r = -EINVAL;
5607 goto reply;
5608 }
5609 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5610 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5611 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5612 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5613 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5614 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5615 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5616 r = -EINVAL;
5617 goto reply;
5618 }
5619 if (f) {
5620 f->open_object_section("tree");
5621 p->print_tree(f.get(), NULL, filter, bucket);
5622 f->close_section();
5623 f->flush(ds);
5624 } else {
5625 p->print_tree(NULL, &ds, filter, bucket);
5626 }
5627 rdata.append(ds);
5628 } else if (prefix == "osd getmap") {
5629 rdata.append(osdmap_bl);
5630 ss << "got osdmap epoch " << p->get_epoch();
5631 } else if (prefix == "osd getcrushmap") {
5632 p->crush->encode(rdata, mon.get_quorum_con_features());
5633 ss << p->get_crush_version();
5634 } else if (prefix == "osd ls-tree") {
5635 string bucket_name;
5636 cmd_getval(cmdmap, "name", bucket_name);
5637 set<int> osds;
5638 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5639 if (r == -ENOENT) {
5640 ss << "\"" << bucket_name << "\" does not exist";
5641 goto reply;
5642 } else if (r < 0) {
5643 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5644 goto reply;
5645 }
5646
5647 if (f) {
5648 f->open_array_section("osds");
5649 for (auto &i : osds) {
5650 if (osdmap.exists(i)) {
5651 f->dump_int("osd", i);
5652 }
5653 }
5654 f->close_section();
5655 f->flush(ds);
5656 } else {
5657 bool first = true;
5658 for (auto &i : osds) {
5659 if (osdmap.exists(i)) {
5660 if (!first)
5661 ds << "\n";
5662 first = false;
5663 ds << i;
5664 }
5665 }
5666 }
5667
5668 rdata.append(ds);
5669 }
5670 } else if (prefix == "osd getmaxosd") {
5671 if (f) {
5672 f->open_object_section("getmaxosd");
5673 f->dump_unsigned("epoch", osdmap.get_epoch());
5674 f->dump_int("max_osd", osdmap.get_max_osd());
5675 f->close_section();
5676 f->flush(rdata);
5677 } else {
5678 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5679 rdata.append(ds);
5680 }
5681 } else if (prefix == "osd utilization") {
5682 string out;
5683 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5684 if (f)
5685 f->flush(rdata);
5686 else
5687 rdata.append(out);
5688 r = 0;
5689 goto reply;
5690 } else if (prefix == "osd find") {
5691 int64_t osd;
5692 if (!cmd_getval(cmdmap, "id", osd)) {
5693 ss << "unable to parse osd id value '"
5694 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5695 r = -EINVAL;
5696 goto reply;
5697 }
5698 if (!osdmap.exists(osd)) {
5699 ss << "osd." << osd << " does not exist";
5700 r = -ENOENT;
5701 goto reply;
5702 }
5703 string format;
5704 cmd_getval(cmdmap, "format", format);
5705 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5706 f->open_object_section("osd_location");
5707 f->dump_int("osd", osd);
5708 f->dump_object("addrs", osdmap.get_addrs(osd));
5709 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5710
5711 // try to identify host, pod/container name, etc.
5712 map<string,string> m;
5713 load_metadata(osd, m, nullptr);
5714 if (auto p = m.find("hostname"); p != m.end()) {
5715 f->dump_string("host", p->second);
5716 }
5717 for (auto& k : {
5718 "pod_name", "pod_namespace", // set by rook
5719 "container_name" // set by cephadm, ceph-ansible
5720 }) {
5721 if (auto p = m.find(k); p != m.end()) {
5722 f->dump_string(k, p->second);
5723 }
5724 }
5725
5726 // crush is helpful too
5727 f->open_object_section("crush_location");
5728 map<string,string> loc = osdmap.crush->get_full_location(osd);
5729 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5730 f->dump_string(p->first.c_str(), p->second);
5731 f->close_section();
5732 f->close_section();
5733 f->flush(rdata);
5734 } else if (prefix == "osd metadata") {
5735 int64_t osd = -1;
5736 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5737 !cmd_getval(cmdmap, "id", osd)) {
5738 ss << "unable to parse osd id value '"
5739 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5740 r = -EINVAL;
5741 goto reply;
5742 }
5743 if (osd >= 0 && !osdmap.exists(osd)) {
5744 ss << "osd." << osd << " does not exist";
5745 r = -ENOENT;
5746 goto reply;
5747 }
5748 string format;
5749 cmd_getval(cmdmap, "format", format);
5750 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5751 if (osd >= 0) {
5752 f->open_object_section("osd_metadata");
5753 f->dump_unsigned("id", osd);
5754 r = dump_osd_metadata(osd, f.get(), &ss);
5755 if (r < 0)
5756 goto reply;
5757 f->close_section();
5758 } else {
5759 r = 0;
5760 f->open_array_section("osd_metadata");
5761 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5762 if (osdmap.exists(i)) {
5763 f->open_object_section("osd");
5764 f->dump_unsigned("id", i);
5765 r = dump_osd_metadata(i, f.get(), NULL);
5766 if (r == -EINVAL || r == -ENOENT) {
5767 // Drop error, continue to get other daemons' metadata
5768 dout(4) << "No metadata for osd." << i << dendl;
5769 r = 0;
5770 } else if (r < 0) {
5771 // Unexpected error
5772 goto reply;
5773 }
5774 f->close_section();
5775 }
5776 }
5777 f->close_section();
5778 }
5779 f->flush(rdata);
5780 } else if (prefix == "osd versions") {
5781 if (!f)
5782 f.reset(Formatter::create("json-pretty"));
5783 count_metadata("ceph_version", f.get());
5784 f->flush(rdata);
5785 r = 0;
5786 } else if (prefix == "osd count-metadata") {
5787 if (!f)
5788 f.reset(Formatter::create("json-pretty"));
5789 string field;
5790 cmd_getval(cmdmap, "property", field);
5791 count_metadata(field, f.get());
5792 f->flush(rdata);
5793 r = 0;
5794 } else if (prefix == "osd numa-status") {
5795 TextTable tbl;
5796 if (f) {
5797 f->open_array_section("osds");
5798 } else {
5799 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5800 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5801 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5802 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5803 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5804 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5805 }
5806 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5807 if (osdmap.exists(i)) {
5808 map<string,string> m;
5809 ostringstream err;
5810 if (load_metadata(i, m, &err) < 0) {
5811 continue;
5812 }
5813 string host;
5814 auto p = m.find("hostname");
5815 if (p != m.end()) {
5816 host = p->second;
5817 }
5818 if (f) {
5819 f->open_object_section("osd");
5820 f->dump_int("osd", i);
5821 f->dump_string("host", host);
5822 for (auto n : { "network_numa_node", "objectstore_numa_node",
5823 "numa_node" }) {
5824 p = m.find(n);
5825 if (p != m.end()) {
5826 f->dump_int(n, atoi(p->second.c_str()));
5827 }
5828 }
5829 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5830 p = m.find(n);
5831 if (p != m.end()) {
5832 list<string> ls = get_str_list(p->second, ",");
5833 f->open_array_section(n);
5834 for (auto node : ls) {
5835 f->dump_int("node", atoi(node.c_str()));
5836 }
5837 f->close_section();
5838 }
5839 }
5840 for (auto n : { "numa_node_cpus" }) {
5841 p = m.find(n);
5842 if (p != m.end()) {
5843 dump_cpu_list(f.get(), n, p->second);
5844 }
5845 }
5846 f->close_section();
5847 } else {
5848 tbl << i;
5849 tbl << host;
5850 p = m.find("network_numa_nodes");
5851 if (p != m.end()) {
5852 tbl << p->second;
5853 } else {
5854 tbl << "-";
5855 }
5856 p = m.find("objectstore_numa_nodes");
5857 if (p != m.end()) {
5858 tbl << p->second;
5859 } else {
5860 tbl << "-";
5861 }
5862 p = m.find("numa_node");
5863 auto q = m.find("numa_node_cpus");
5864 if (p != m.end() && q != m.end()) {
5865 tbl << p->second;
5866 tbl << q->second;
5867 } else {
5868 tbl << "-";
5869 tbl << "-";
5870 }
5871 tbl << TextTable::endrow;
5872 }
5873 }
5874 }
5875 if (f) {
5876 f->close_section();
5877 f->flush(rdata);
5878 } else {
5879 rdata.append(stringify(tbl));
5880 }
5881 } else if (prefix == "osd map") {
5882 string poolstr, objstr, namespacestr;
5883 cmd_getval(cmdmap, "pool", poolstr);
5884 cmd_getval(cmdmap, "object", objstr);
5885 cmd_getval(cmdmap, "nspace", namespacestr);
5886
5887 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5888 if (pool < 0) {
5889 ss << "pool " << poolstr << " does not exist";
5890 r = -ENOENT;
5891 goto reply;
5892 }
5893 object_locator_t oloc(pool, namespacestr);
5894 object_t oid(objstr);
5895 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5896 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5897 vector<int> up, acting;
5898 int up_p, acting_p;
5899 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5900
5901 string fullobjname;
5902 if (!namespacestr.empty())
5903 fullobjname = namespacestr + string("/") + oid.name;
5904 else
5905 fullobjname = oid.name;
5906 if (f) {
5907 f->open_object_section("osd_map");
5908 f->dump_unsigned("epoch", osdmap.get_epoch());
5909 f->dump_string("pool", poolstr);
5910 f->dump_int("pool_id", pool);
5911 f->dump_stream("objname") << fullobjname;
5912 f->dump_stream("raw_pgid") << pgid;
5913 f->dump_stream("pgid") << mpgid;
5914 f->open_array_section("up");
5915 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5916 f->dump_int("osd", *p);
5917 f->close_section();
5918 f->dump_int("up_primary", up_p);
5919 f->open_array_section("acting");
5920 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5921 f->dump_int("osd", *p);
5922 f->close_section();
5923 f->dump_int("acting_primary", acting_p);
5924 f->close_section(); // osd_map
5925 f->flush(rdata);
5926 } else {
5927 ds << "osdmap e" << osdmap.get_epoch()
5928 << " pool '" << poolstr << "' (" << pool << ")"
5929 << " object '" << fullobjname << "' ->"
5930 << " pg " << pgid << " (" << mpgid << ")"
5931 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5932 << pg_vector_string(acting) << ", p" << acting_p << ")";
5933 rdata.append(ds);
5934 }
5935
5936 } else if (prefix == "pg map") {
5937 pg_t pgid;
5938 string pgidstr;
5939 cmd_getval(cmdmap, "pgid", pgidstr);
5940 if (!pgid.parse(pgidstr.c_str())) {
5941 ss << "invalid pgid '" << pgidstr << "'";
5942 r = -EINVAL;
5943 goto reply;
5944 }
5945 vector<int> up, acting;
5946 if (!osdmap.have_pg_pool(pgid.pool())) {
5947 ss << "pg '" << pgidstr << "' does not exist";
5948 r = -ENOENT;
5949 goto reply;
5950 }
5951 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5952 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5953 if (f) {
5954 f->open_object_section("pg_map");
5955 f->dump_unsigned("epoch", osdmap.get_epoch());
5956 f->dump_stream("raw_pgid") << pgid;
5957 f->dump_stream("pgid") << mpgid;
5958 f->open_array_section("up");
5959 for (auto osd : up) {
5960 f->dump_int("up_osd", osd);
5961 }
5962 f->close_section();
5963 f->open_array_section("acting");
5964 for (auto osd : acting) {
5965 f->dump_int("acting_osd", osd);
5966 }
5967 f->close_section();
5968 f->close_section();
5969 f->flush(rdata);
5970 } else {
5971 ds << "osdmap e" << osdmap.get_epoch()
5972 << " pg " << pgid << " (" << mpgid << ")"
5973 << " -> up " << up << " acting " << acting;
5974 rdata.append(ds);
5975 }
5976 goto reply;
5977
5978 } else if (prefix == "osd lspools") {
5979 if (f)
5980 f->open_array_section("pools");
5981 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5982 p != osdmap.pools.end();
5983 ++p) {
5984 if (f) {
5985 f->open_object_section("pool");
5986 f->dump_int("poolnum", p->first);
5987 f->dump_string("poolname", osdmap.pool_name[p->first]);
5988 f->close_section();
5989 } else {
5990 ds << p->first << ' ' << osdmap.pool_name[p->first];
5991 if (next(p) != osdmap.pools.end()) {
5992 ds << '\n';
5993 }
5994 }
5995 }
5996 if (f) {
5997 f->close_section();
5998 f->flush(ds);
5999 }
6000 rdata.append(ds);
6001 } else if (prefix == "osd blocklist ls" ||
6002 prefix == "osd blacklist ls") {
6003 if (f)
6004 f->open_array_section("blocklist");
6005
6006 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6007 p != osdmap.blocklist.end();
6008 ++p) {
6009 if (f) {
6010 f->open_object_section("entry");
6011 f->dump_string("addr", p->first.get_legacy_str());
6012 f->dump_stream("until") << p->second;
6013 f->close_section();
6014 } else {
6015 stringstream ss;
6016 string s;
6017 ss << p->first << " " << p->second;
6018 getline(ss, s);
6019 s += "\n";
6020 rdata.append(s);
6021 }
6022 }
6023 if (f) {
6024 f->close_section();
6025 f->flush(rdata);
6026 }
6027 ss << "listed " << osdmap.blocklist.size() << " entries";
6028
6029 } else if (prefix == "osd pool ls") {
6030 string detail;
6031 cmd_getval(cmdmap, "detail", detail);
6032 if (!f && detail == "detail") {
6033 ostringstream ss;
6034 osdmap.print_pools(ss);
6035 rdata.append(ss.str());
6036 } else {
6037 if (f)
6038 f->open_array_section("pools");
6039 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
6040 it != osdmap.get_pools().end();
6041 ++it) {
6042 if (f) {
6043 if (detail == "detail") {
6044 f->open_object_section("pool");
6045 f->dump_int("pool_id", it->first);
6046 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6047 it->second.dump(f.get());
6048 f->close_section();
6049 } else {
6050 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6051 }
6052 } else {
6053 rdata.append(osdmap.get_pool_name(it->first) + "\n");
6054 }
6055 }
6056 if (f) {
6057 f->close_section();
6058 f->flush(rdata);
6059 }
6060 }
6061
6062 } else if (prefix == "osd crush get-tunable") {
6063 string tunable;
6064 cmd_getval(cmdmap, "tunable", tunable);
6065 ostringstream rss;
6066 if (f)
6067 f->open_object_section("tunable");
6068 if (tunable == "straw_calc_version") {
6069 if (f)
6070 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6071 else
6072 rss << osdmap.crush->get_straw_calc_version() << "\n";
6073 } else {
6074 r = -EINVAL;
6075 goto reply;
6076 }
6077 if (f) {
6078 f->close_section();
6079 f->flush(rdata);
6080 } else {
6081 rdata.append(rss.str());
6082 }
6083 r = 0;
6084
6085 } else if (prefix == "osd pool get") {
6086 string poolstr;
6087 cmd_getval(cmdmap, "pool", poolstr);
6088 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6089 if (pool < 0) {
6090 ss << "unrecognized pool '" << poolstr << "'";
6091 r = -ENOENT;
6092 goto reply;
6093 }
6094
6095 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6096 string var;
6097 cmd_getval(cmdmap, "var", var);
6098
6099 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6100 const choices_map_t ALL_CHOICES = {
6101 {"size", SIZE},
6102 {"min_size", MIN_SIZE},
6103 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6104 {"crush_rule", CRUSH_RULE},
6105 {"hashpspool", HASHPSPOOL},
6106 {"eio", POOL_EIO},
6107 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6108 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6109 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6110 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6111 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6112 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6113 {"use_gmt_hitset", USE_GMT_HITSET},
6114 {"target_max_objects", TARGET_MAX_OBJECTS},
6115 {"target_max_bytes", TARGET_MAX_BYTES},
6116 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6117 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6118 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6119 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6120 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6121 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6122 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6123 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6124 {"fast_read", FAST_READ},
6125 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6126 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6127 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6128 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6129 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6130 {"recovery_priority", RECOVERY_PRIORITY},
6131 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6132 {"scrub_priority", SCRUB_PRIORITY},
6133 {"compression_mode", COMPRESSION_MODE},
6134 {"compression_algorithm", COMPRESSION_ALGORITHM},
6135 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6136 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6137 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6138 {"csum_type", CSUM_TYPE},
6139 {"csum_max_block", CSUM_MAX_BLOCK},
6140 {"csum_min_block", CSUM_MIN_BLOCK},
6141 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6142 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6143 {"pg_num_min", PG_NUM_MIN},
6144 {"pg_num_max", PG_NUM_MAX},
6145 {"target_size_bytes", TARGET_SIZE_BYTES},
6146 {"target_size_ratio", TARGET_SIZE_RATIO},
6147 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6148 {"dedup_tier", DEDUP_TIER},
6149 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6150 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
6151 {"bulk", BULK}
6152 };
6153
6154 typedef std::set<osd_pool_get_choices> choices_set_t;
6155
6156 const choices_set_t ONLY_TIER_CHOICES = {
6157 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6158 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6159 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6160 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6161 MIN_READ_RECENCY_FOR_PROMOTE,
6162 MIN_WRITE_RECENCY_FOR_PROMOTE,
6163 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6164 };
6165 const choices_set_t ONLY_ERASURE_CHOICES = {
6166 EC_OVERWRITES, ERASURE_CODE_PROFILE
6167 };
6168
6169 choices_set_t selected_choices;
6170 if (var == "all") {
6171 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6172 it != ALL_CHOICES.end(); ++it) {
6173 selected_choices.insert(it->second);
6174 }
6175
6176 if(!p->is_tier()) {
6177 selected_choices = subtract_second_from_first(selected_choices,
6178 ONLY_TIER_CHOICES);
6179 }
6180
6181 if(!p->is_erasure()) {
6182 selected_choices = subtract_second_from_first(selected_choices,
6183 ONLY_ERASURE_CHOICES);
6184 }
6185 } else /* var != "all" */ {
6186 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6187 if (found == ALL_CHOICES.end()) {
6188 ss << "pool '" << poolstr
6189 << "': invalid variable: '" << var << "'";
6190 r = -EINVAL;
6191 goto reply;
6192 }
6193
6194 osd_pool_get_choices selected = found->second;
6195
6196 if (!p->is_tier() &&
6197 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6198 ss << "pool '" << poolstr
6199 << "' is not a tier pool: variable not applicable";
6200 r = -EACCES;
6201 goto reply;
6202 }
6203
6204 if (!p->is_erasure() &&
6205 ONLY_ERASURE_CHOICES.find(selected)
6206 != ONLY_ERASURE_CHOICES.end()) {
6207 ss << "pool '" << poolstr
6208 << "' is not a erasure pool: variable not applicable";
6209 r = -EACCES;
6210 goto reply;
6211 }
6212
6213 if (pool_opts_t::is_opt_name(var) &&
6214 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6215 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6216 r = -ENOENT;
6217 goto reply;
6218 }
6219
6220 selected_choices.insert(selected);
6221 }
6222
6223 if (f) {
6224 f->open_object_section("pool");
6225 f->dump_string("pool", poolstr);
6226 f->dump_int("pool_id", pool);
6227 for(choices_set_t::const_iterator it = selected_choices.begin();
6228 it != selected_choices.end(); ++it) {
6229 choices_map_t::const_iterator i;
6230 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6231 if (i->second == *it) {
6232 break;
6233 }
6234 }
6235 ceph_assert(i != ALL_CHOICES.end());
6236 switch(*it) {
6237 case PG_NUM:
6238 f->dump_int("pg_num", p->get_pg_num());
6239 break;
6240 case PGP_NUM:
6241 f->dump_int("pgp_num", p->get_pgp_num());
6242 break;
6243 case SIZE:
6244 f->dump_int("size", p->get_size());
6245 break;
6246 case MIN_SIZE:
6247 f->dump_int("min_size", p->get_min_size());
6248 break;
6249 case CRUSH_RULE:
6250 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6251 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6252 p->get_crush_rule()));
6253 } else {
6254 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6255 }
6256 break;
6257 case EC_OVERWRITES:
6258 f->dump_bool("allow_ec_overwrites",
6259 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6260 break;
6261 case PG_AUTOSCALE_MODE:
6262 f->dump_string("pg_autoscale_mode",
6263 pg_pool_t::get_pg_autoscale_mode_name(
6264 p->pg_autoscale_mode));
6265 break;
6266 case HASHPSPOOL:
6267 case POOL_EIO:
6268 case NODELETE:
6269 case BULK:
6270 case NOPGCHANGE:
6271 case NOSIZECHANGE:
6272 case WRITE_FADVISE_DONTNEED:
6273 case NOSCRUB:
6274 case NODEEP_SCRUB:
6275 f->dump_bool(i->first.c_str(),
6276 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6277 break;
6278 case HIT_SET_PERIOD:
6279 f->dump_int("hit_set_period", p->hit_set_period);
6280 break;
6281 case HIT_SET_COUNT:
6282 f->dump_int("hit_set_count", p->hit_set_count);
6283 break;
6284 case HIT_SET_TYPE:
6285 f->dump_string("hit_set_type",
6286 HitSet::get_type_name(p->hit_set_params.get_type()));
6287 break;
6288 case HIT_SET_FPP:
6289 {
6290 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6291 BloomHitSet::Params *bloomp =
6292 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6293 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6294 } else if(var != "all") {
6295 f->close_section();
6296 ss << "hit set is not of type Bloom; " <<
6297 "invalid to get a false positive rate!";
6298 r = -EINVAL;
6299 goto reply;
6300 }
6301 }
6302 break;
6303 case USE_GMT_HITSET:
6304 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6305 break;
6306 case TARGET_MAX_OBJECTS:
6307 f->dump_unsigned("target_max_objects", p->target_max_objects);
6308 break;
6309 case TARGET_MAX_BYTES:
6310 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6311 break;
6312 case CACHE_TARGET_DIRTY_RATIO:
6313 f->dump_unsigned("cache_target_dirty_ratio_micro",
6314 p->cache_target_dirty_ratio_micro);
6315 f->dump_float("cache_target_dirty_ratio",
6316 ((float)p->cache_target_dirty_ratio_micro/1000000));
6317 break;
6318 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6319 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6320 p->cache_target_dirty_high_ratio_micro);
6321 f->dump_float("cache_target_dirty_high_ratio",
6322 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6323 break;
6324 case CACHE_TARGET_FULL_RATIO:
6325 f->dump_unsigned("cache_target_full_ratio_micro",
6326 p->cache_target_full_ratio_micro);
6327 f->dump_float("cache_target_full_ratio",
6328 ((float)p->cache_target_full_ratio_micro/1000000));
6329 break;
6330 case CACHE_MIN_FLUSH_AGE:
6331 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6332 break;
6333 case CACHE_MIN_EVICT_AGE:
6334 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6335 break;
6336 case ERASURE_CODE_PROFILE:
6337 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6338 break;
6339 case MIN_READ_RECENCY_FOR_PROMOTE:
6340 f->dump_int("min_read_recency_for_promote",
6341 p->min_read_recency_for_promote);
6342 break;
6343 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6344 f->dump_int("min_write_recency_for_promote",
6345 p->min_write_recency_for_promote);
6346 break;
6347 case FAST_READ:
6348 f->dump_int("fast_read", p->fast_read);
6349 break;
6350 case HIT_SET_GRADE_DECAY_RATE:
6351 f->dump_int("hit_set_grade_decay_rate",
6352 p->hit_set_grade_decay_rate);
6353 break;
6354 case HIT_SET_SEARCH_LAST_N:
6355 f->dump_int("hit_set_search_last_n",
6356 p->hit_set_search_last_n);
6357 break;
6358 case SCRUB_MIN_INTERVAL:
6359 case SCRUB_MAX_INTERVAL:
6360 case DEEP_SCRUB_INTERVAL:
6361 case RECOVERY_PRIORITY:
6362 case RECOVERY_OP_PRIORITY:
6363 case SCRUB_PRIORITY:
6364 case COMPRESSION_MODE:
6365 case COMPRESSION_ALGORITHM:
6366 case COMPRESSION_REQUIRED_RATIO:
6367 case COMPRESSION_MAX_BLOB_SIZE:
6368 case COMPRESSION_MIN_BLOB_SIZE:
6369 case CSUM_TYPE:
6370 case CSUM_MAX_BLOCK:
6371 case CSUM_MIN_BLOCK:
6372 case FINGERPRINT_ALGORITHM:
6373 case PG_NUM_MIN:
6374 case PG_NUM_MAX:
6375 case TARGET_SIZE_BYTES:
6376 case TARGET_SIZE_RATIO:
6377 case PG_AUTOSCALE_BIAS:
6378 case DEDUP_TIER:
6379 case DEDUP_CHUNK_ALGORITHM:
6380 case DEDUP_CDC_CHUNK_SIZE:
6381 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6382 if (p->opts.is_set(key)) {
6383 if(*it == CSUM_TYPE) {
6384 int64_t val;
6385 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6386 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6387 } else {
6388 p->opts.dump(i->first, f.get());
6389 }
6390 }
6391 break;
6392 }
6393 }
6394 f->close_section();
6395 f->flush(rdata);
6396 } else /* !f */ {
6397 for(choices_set_t::const_iterator it = selected_choices.begin();
6398 it != selected_choices.end(); ++it) {
6399 choices_map_t::const_iterator i;
6400 switch(*it) {
6401 case PG_NUM:
6402 ss << "pg_num: " << p->get_pg_num() << "\n";
6403 break;
6404 case PGP_NUM:
6405 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6406 break;
6407 case SIZE:
6408 ss << "size: " << p->get_size() << "\n";
6409 break;
6410 case MIN_SIZE:
6411 ss << "min_size: " << p->get_min_size() << "\n";
6412 break;
6413 case CRUSH_RULE:
6414 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6415 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6416 p->get_crush_rule()) << "\n";
6417 } else {
6418 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6419 }
6420 break;
6421 case PG_AUTOSCALE_MODE:
6422 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6423 p->pg_autoscale_mode) <<"\n";
6424 break;
6425 case HIT_SET_PERIOD:
6426 ss << "hit_set_period: " << p->hit_set_period << "\n";
6427 break;
6428 case HIT_SET_COUNT:
6429 ss << "hit_set_count: " << p->hit_set_count << "\n";
6430 break;
6431 case HIT_SET_TYPE:
6432 ss << "hit_set_type: " <<
6433 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6434 break;
6435 case HIT_SET_FPP:
6436 {
6437 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6438 BloomHitSet::Params *bloomp =
6439 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6440 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6441 } else if(var != "all") {
6442 ss << "hit set is not of type Bloom; " <<
6443 "invalid to get a false positive rate!";
6444 r = -EINVAL;
6445 goto reply;
6446 }
6447 }
6448 break;
6449 case USE_GMT_HITSET:
6450 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6451 break;
6452 case TARGET_MAX_OBJECTS:
6453 ss << "target_max_objects: " << p->target_max_objects << "\n";
6454 break;
6455 case TARGET_MAX_BYTES:
6456 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6457 break;
6458 case CACHE_TARGET_DIRTY_RATIO:
6459 ss << "cache_target_dirty_ratio: "
6460 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6461 break;
6462 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6463 ss << "cache_target_dirty_high_ratio: "
6464 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6465 break;
6466 case CACHE_TARGET_FULL_RATIO:
6467 ss << "cache_target_full_ratio: "
6468 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6469 break;
6470 case CACHE_MIN_FLUSH_AGE:
6471 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6472 break;
6473 case CACHE_MIN_EVICT_AGE:
6474 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6475 break;
6476 case ERASURE_CODE_PROFILE:
6477 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6478 break;
6479 case MIN_READ_RECENCY_FOR_PROMOTE:
6480 ss << "min_read_recency_for_promote: " <<
6481 p->min_read_recency_for_promote << "\n";
6482 break;
6483 case HIT_SET_GRADE_DECAY_RATE:
6484 ss << "hit_set_grade_decay_rate: " <<
6485 p->hit_set_grade_decay_rate << "\n";
6486 break;
6487 case HIT_SET_SEARCH_LAST_N:
6488 ss << "hit_set_search_last_n: " <<
6489 p->hit_set_search_last_n << "\n";
6490 break;
6491 case EC_OVERWRITES:
6492 ss << "allow_ec_overwrites: " <<
6493 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6494 "\n";
6495 break;
6496 case HASHPSPOOL:
6497 case POOL_EIO:
6498 case NODELETE:
6499 case BULK:
6500 case NOPGCHANGE:
6501 case NOSIZECHANGE:
6502 case WRITE_FADVISE_DONTNEED:
6503 case NOSCRUB:
6504 case NODEEP_SCRUB:
6505 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6506 if (i->second == *it)
6507 break;
6508 }
6509 ceph_assert(i != ALL_CHOICES.end());
6510 ss << i->first << ": " <<
6511 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6512 "true" : "false") << "\n";
6513 break;
6514 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6515 ss << "min_write_recency_for_promote: " <<
6516 p->min_write_recency_for_promote << "\n";
6517 break;
6518 case FAST_READ:
6519 ss << "fast_read: " << p->fast_read << "\n";
6520 break;
6521 case SCRUB_MIN_INTERVAL:
6522 case SCRUB_MAX_INTERVAL:
6523 case DEEP_SCRUB_INTERVAL:
6524 case RECOVERY_PRIORITY:
6525 case RECOVERY_OP_PRIORITY:
6526 case SCRUB_PRIORITY:
6527 case COMPRESSION_MODE:
6528 case COMPRESSION_ALGORITHM:
6529 case COMPRESSION_REQUIRED_RATIO:
6530 case COMPRESSION_MAX_BLOB_SIZE:
6531 case COMPRESSION_MIN_BLOB_SIZE:
6532 case CSUM_TYPE:
6533 case CSUM_MAX_BLOCK:
6534 case CSUM_MIN_BLOCK:
6535 case FINGERPRINT_ALGORITHM:
6536 case PG_NUM_MIN:
6537 case PG_NUM_MAX:
6538 case TARGET_SIZE_BYTES:
6539 case TARGET_SIZE_RATIO:
6540 case PG_AUTOSCALE_BIAS:
6541 case DEDUP_TIER:
6542 case DEDUP_CHUNK_ALGORITHM:
6543 case DEDUP_CDC_CHUNK_SIZE:
6544 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6545 if (i->second == *it)
6546 break;
6547 }
6548 ceph_assert(i != ALL_CHOICES.end());
6549 {
6550 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6551 if (p->opts.is_set(key)) {
6552 if(key == pool_opts_t::CSUM_TYPE) {
6553 int64_t val;
6554 p->opts.get(key, &val);
6555 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6556 } else {
6557 ss << i->first << ": " << p->opts.get(key) << "\n";
6558 }
6559 }
6560 }
6561 break;
6562 }
6563 rdata.append(ss.str());
6564 ss.str("");
6565 }
6566 }
6567 r = 0;
6568 } else if (prefix == "osd pool get-quota") {
6569 string pool_name;
6570 cmd_getval(cmdmap, "pool", pool_name);
6571
6572 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6573 if (poolid < 0) {
6574 ceph_assert(poolid == -ENOENT);
6575 ss << "unrecognized pool '" << pool_name << "'";
6576 r = -ENOENT;
6577 goto reply;
6578 }
6579 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6580 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
6581 if (!pstat) {
6582 ss << "no stats for pool '" << pool_name << "'";
6583 r = -ENOENT;
6584 goto reply;
6585 }
6586 const object_stat_sum_t& sum = pstat->stats.sum;
6587 if (f) {
6588 f->open_object_section("pool_quotas");
6589 f->dump_string("pool_name", pool_name);
6590 f->dump_unsigned("pool_id", poolid);
6591 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6592 f->dump_int("current_num_objects", sum.num_objects);
6593 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6594 f->dump_int("current_num_bytes", sum.num_bytes);
6595 f->close_section();
6596 f->flush(rdata);
6597 } else {
6598 stringstream rs;
6599 rs << "quotas for pool '" << pool_name << "':\n"
6600 << " max objects: ";
6601 if (p->quota_max_objects == 0)
6602 rs << "N/A";
6603 else {
6604 rs << si_u_t(p->quota_max_objects) << " objects";
6605 rs << " (current num objects: " << sum.num_objects << " objects)";
6606 }
6607 rs << "\n"
6608 << " max bytes : ";
6609 if (p->quota_max_bytes == 0)
6610 rs << "N/A";
6611 else {
6612 rs << byte_u_t(p->quota_max_bytes);
6613 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6614 }
6615 rdata.append(rs.str());
6616 }
6617 rdata.append("\n");
6618 r = 0;
6619 } else if (prefix == "osd crush rule list" ||
6620 prefix == "osd crush rule ls") {
6621 if (f) {
6622 f->open_array_section("rules");
6623 osdmap.crush->list_rules(f.get());
6624 f->close_section();
6625 f->flush(rdata);
6626 } else {
6627 ostringstream ss;
6628 osdmap.crush->list_rules(&ss);
6629 rdata.append(ss.str());
6630 }
6631 } else if (prefix == "osd crush rule ls-by-class") {
6632 string class_name;
6633 cmd_getval(cmdmap, "class", class_name);
6634 if (class_name.empty()) {
6635 ss << "no class specified";
6636 r = -EINVAL;
6637 goto reply;
6638 }
6639 set<int> rules;
6640 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6641 if (r < 0) {
6642 ss << "failed to get rules by class '" << class_name << "'";
6643 goto reply;
6644 }
6645 if (f) {
6646 f->open_array_section("rules");
6647 for (auto &rule: rules) {
6648 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6649 }
6650 f->close_section();
6651 f->flush(rdata);
6652 } else {
6653 ostringstream rs;
6654 for (auto &rule: rules) {
6655 rs << osdmap.crush->get_rule_name(rule) << "\n";
6656 }
6657 rdata.append(rs.str());
6658 }
6659 } else if (prefix == "osd crush rule dump") {
6660 string name;
6661 cmd_getval(cmdmap, "name", name);
6662 string format;
6663 cmd_getval(cmdmap, "format", format);
6664 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6665 if (name == "") {
6666 f->open_array_section("rules");
6667 osdmap.crush->dump_rules(f.get());
6668 f->close_section();
6669 } else {
6670 int ruleno = osdmap.crush->get_rule_id(name);
6671 if (ruleno < 0) {
6672 ss << "unknown crush rule '" << name << "'";
6673 r = ruleno;
6674 goto reply;
6675 }
6676 osdmap.crush->dump_rule(ruleno, f.get());
6677 }
6678 ostringstream rs;
6679 f->flush(rs);
6680 rs << "\n";
6681 rdata.append(rs.str());
6682 } else if (prefix == "osd crush dump") {
6683 string format;
6684 cmd_getval(cmdmap, "format", format);
6685 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6686 f->open_object_section("crush_map");
6687 osdmap.crush->dump(f.get());
6688 f->close_section();
6689 ostringstream rs;
6690 f->flush(rs);
6691 rs << "\n";
6692 rdata.append(rs.str());
6693 } else if (prefix == "osd crush show-tunables") {
6694 string format;
6695 cmd_getval(cmdmap, "format", format);
6696 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6697 f->open_object_section("crush_map_tunables");
6698 osdmap.crush->dump_tunables(f.get());
6699 f->close_section();
6700 ostringstream rs;
6701 f->flush(rs);
6702 rs << "\n";
6703 rdata.append(rs.str());
6704 } else if (prefix == "osd crush tree") {
6705 bool show_shadow = false;
6706 if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
6707 std::string shadow;
6708 if (cmd_getval(cmdmap, "shadow", shadow) &&
6709 shadow == "--show-shadow") {
6710 show_shadow = true;
6711 }
6712 }
6713 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6714 if (f) {
6715 f->open_object_section("crush_tree");
6716 osdmap.crush->dump_tree(nullptr,
6717 f.get(),
6718 osdmap.get_pool_names(),
6719 show_shadow);
6720 f->close_section();
6721 f->flush(rdata);
6722 } else {
6723 ostringstream ss;
6724 osdmap.crush->dump_tree(&ss,
6725 nullptr,
6726 osdmap.get_pool_names(),
6727 show_shadow);
6728 rdata.append(ss.str());
6729 }
6730 } else if (prefix == "osd crush ls") {
6731 string name;
6732 if (!cmd_getval(cmdmap, "node", name)) {
6733 ss << "no node specified";
6734 r = -EINVAL;
6735 goto reply;
6736 }
6737 if (!osdmap.crush->name_exists(name)) {
6738 ss << "node '" << name << "' does not exist";
6739 r = -ENOENT;
6740 goto reply;
6741 }
6742 int id = osdmap.crush->get_item_id(name);
6743 list<int> result;
6744 if (id >= 0) {
6745 result.push_back(id);
6746 } else {
6747 int num = osdmap.crush->get_bucket_size(id);
6748 for (int i = 0; i < num; ++i) {
6749 result.push_back(osdmap.crush->get_bucket_item(id, i));
6750 }
6751 }
6752 if (f) {
6753 f->open_array_section("items");
6754 for (auto i : result) {
6755 f->dump_string("item", osdmap.crush->get_item_name(i));
6756 }
6757 f->close_section();
6758 f->flush(rdata);
6759 } else {
6760 ostringstream ss;
6761 for (auto i : result) {
6762 ss << osdmap.crush->get_item_name(i) << "\n";
6763 }
6764 rdata.append(ss.str());
6765 }
6766 r = 0;
6767 } else if (prefix == "osd crush class ls") {
6768 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6769 f->open_array_section("crush_classes");
6770 for (auto i : osdmap.crush->class_name)
6771 f->dump_string("class", i.second);
6772 f->close_section();
6773 f->flush(rdata);
6774 } else if (prefix == "osd crush class ls-osd") {
6775 string name;
6776 cmd_getval(cmdmap, "class", name);
6777 set<int> osds;
6778 osdmap.crush->get_devices_by_class(name, &osds);
6779 if (f) {
6780 f->open_array_section("osds");
6781 for (auto &osd: osds)
6782 f->dump_int("osd", osd);
6783 f->close_section();
6784 f->flush(rdata);
6785 } else {
6786 bool first = true;
6787 for (auto &osd : osds) {
6788 if (!first)
6789 ds << "\n";
6790 first = false;
6791 ds << osd;
6792 }
6793 rdata.append(ds);
6794 }
6795 } else if (prefix == "osd crush get-device-class") {
6796 vector<string> idvec;
6797 cmd_getval(cmdmap, "ids", idvec);
6798 map<int, string> class_by_osd;
6799 for (auto& id : idvec) {
6800 ostringstream ts;
6801 long osd = parse_osd_id(id.c_str(), &ts);
6802 if (osd < 0) {
6803 ss << "unable to parse osd id:'" << id << "'";
6804 r = -EINVAL;
6805 goto reply;
6806 }
6807 auto device_class = osdmap.crush->get_item_class(osd);
6808 if (device_class)
6809 class_by_osd[osd] = device_class;
6810 else
6811 class_by_osd[osd] = ""; // no class
6812 }
6813 if (f) {
6814 f->open_array_section("osd_device_classes");
6815 for (auto& i : class_by_osd) {
6816 f->open_object_section("osd_device_class");
6817 f->dump_int("osd", i.first);
6818 f->dump_string("device_class", i.second);
6819 f->close_section();
6820 }
6821 f->close_section();
6822 f->flush(rdata);
6823 } else {
6824 if (class_by_osd.size() == 1) {
6825 // for single input, make a clean output
6826 ds << class_by_osd.begin()->second;
6827 } else {
6828 // note that we do not group osds by class here
6829 for (auto it = class_by_osd.begin();
6830 it != class_by_osd.end();
6831 it++) {
6832 ds << "osd." << it->first << ' ' << it->second;
6833 if (next(it) != class_by_osd.end())
6834 ds << '\n';
6835 }
6836 }
6837 rdata.append(ds);
6838 }
6839 } else if (prefix == "osd erasure-code-profile ls") {
6840 const auto &profiles = osdmap.get_erasure_code_profiles();
6841 if (f)
6842 f->open_array_section("erasure-code-profiles");
6843 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6844 if (f)
6845 f->dump_string("profile", i->first.c_str());
6846 else
6847 rdata.append(i->first + "\n");
6848 }
6849 if (f) {
6850 f->close_section();
6851 ostringstream rs;
6852 f->flush(rs);
6853 rs << "\n";
6854 rdata.append(rs.str());
6855 }
6856 } else if (prefix == "osd crush weight-set ls") {
6857 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6858 if (f) {
6859 f->open_array_section("weight_sets");
6860 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6861 f->dump_string("pool", "(compat)");
6862 }
6863 for (auto& i : osdmap.crush->choose_args) {
6864 if (i.first >= 0) {
6865 f->dump_string("pool", osdmap.get_pool_name(i.first));
6866 }
6867 }
6868 f->close_section();
6869 f->flush(rdata);
6870 } else {
6871 ostringstream rs;
6872 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6873 rs << "(compat)\n";
6874 }
6875 for (auto& i : osdmap.crush->choose_args) {
6876 if (i.first >= 0) {
6877 rs << osdmap.get_pool_name(i.first) << "\n";
6878 }
6879 }
6880 rdata.append(rs.str());
6881 }
6882 } else if (prefix == "osd crush weight-set dump") {
6883 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6884 "json-pretty"));
6885 osdmap.crush->dump_choose_args(f.get());
6886 f->flush(rdata);
6887 } else if (prefix == "osd erasure-code-profile get") {
6888 string name;
6889 cmd_getval(cmdmap, "name", name);
6890 if (!osdmap.has_erasure_code_profile(name)) {
6891 ss << "unknown erasure code profile '" << name << "'";
6892 r = -ENOENT;
6893 goto reply;
6894 }
6895 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6896 if (f)
6897 f->open_object_section("profile");
6898 for (map<string,string>::const_iterator i = profile.begin();
6899 i != profile.end();
6900 ++i) {
6901 if (f)
6902 f->dump_string(i->first.c_str(), i->second.c_str());
6903 else
6904 rdata.append(i->first + "=" + i->second + "\n");
6905 }
6906 if (f) {
6907 f->close_section();
6908 ostringstream rs;
6909 f->flush(rs);
6910 rs << "\n";
6911 rdata.append(rs.str());
6912 }
6913 } else if (prefix == "osd pool application get") {
6914 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6915 "json-pretty"));
6916 string pool_name;
6917 cmd_getval(cmdmap, "pool", pool_name);
6918 string app;
6919 cmd_getval(cmdmap, "app", app);
6920 string key;
6921 cmd_getval(cmdmap, "key", key);
6922
6923 if (pool_name.empty()) {
6924 // all
6925 f->open_object_section("pools");
6926 for (const auto &pool : osdmap.pools) {
6927 std::string name("<unknown>");
6928 const auto &pni = osdmap.pool_name.find(pool.first);
6929 if (pni != osdmap.pool_name.end())
6930 name = pni->second;
6931 f->open_object_section(name.c_str());
6932 for (auto &app_pair : pool.second.application_metadata) {
6933 f->open_object_section(app_pair.first.c_str());
6934 for (auto &kv_pair : app_pair.second) {
6935 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6936 }
6937 f->close_section();
6938 }
6939 f->close_section(); // name
6940 }
6941 f->close_section(); // pools
6942 f->flush(rdata);
6943 } else {
6944 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6945 if (pool < 0) {
6946 ss << "unrecognized pool '" << pool_name << "'";
6947 r = -ENOENT;
6948 goto reply;
6949 }
6950 auto p = osdmap.get_pg_pool(pool);
6951 // filter by pool
6952 if (app.empty()) {
6953 f->open_object_section(pool_name.c_str());
6954 for (auto &app_pair : p->application_metadata) {
6955 f->open_object_section(app_pair.first.c_str());
6956 for (auto &kv_pair : app_pair.second) {
6957 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6958 }
6959 f->close_section(); // application
6960 }
6961 f->close_section(); // pool_name
6962 f->flush(rdata);
6963 goto reply;
6964 }
6965
6966 auto app_it = p->application_metadata.find(app);
6967 if (app_it == p->application_metadata.end()) {
6968 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6969 r = -ENOENT;
6970 goto reply;
6971 }
6972 // filter by pool + app
6973 if (key.empty()) {
6974 f->open_object_section(app_it->first.c_str());
6975 for (auto &kv_pair : app_it->second) {
6976 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6977 }
6978 f->close_section(); // application
6979 f->flush(rdata);
6980 goto reply;
6981 }
6982 // filter by pool + app + key
6983 auto key_it = app_it->second.find(key);
6984 if (key_it == app_it->second.end()) {
6985 ss << "application '" << app << "' on pool '" << pool_name
6986 << "' does not have key '" << key << "'";
6987 r = -ENOENT;
6988 goto reply;
6989 }
6990 ss << key_it->second << "\n";
6991 rdata.append(ss.str());
6992 ss.str("");
6993 }
6994 } else if (prefix == "osd get-require-min-compat-client") {
6995 ss << osdmap.require_min_compat_client << std::endl;
6996 rdata.append(ss.str());
6997 ss.str("");
6998 goto reply;
6999 } else if (prefix == "osd pool application enable" ||
7000 prefix == "osd pool application disable" ||
7001 prefix == "osd pool application set" ||
7002 prefix == "osd pool application rm") {
7003 bool changed = false;
7004 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7005 if (r != 0) {
7006 // Error, reply.
7007 goto reply;
7008 } else if (changed) {
7009 // Valid mutation, proceed to prepare phase
7010 return false;
7011 } else {
7012 // Idempotent case, reply
7013 goto reply;
7014 }
7015 } else {
7016 // try prepare update
7017 return false;
7018 }
7019
7020 reply:
7021 string rs;
7022 getline(ss, rs);
7023 mon.reply_command(op, r, rs, rdata, get_last_committed());
7024 return true;
7025 }
7026
7027 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7028 {
7029 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7030 osdmap.get_pg_pool(pool_id));
7031 ceph_assert(pool);
7032 pool->set_flag(flags);
7033 }
7034
7035 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7036 {
7037 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7038 osdmap.get_pg_pool(pool_id));
7039 ceph_assert(pool);
7040 pool->unset_flag(flags);
7041 }
7042
7043 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7044 {
7045 char k[80];
7046 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7047 return k;
7048 }
7049
7050 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7051 {
7052 char k[80];
7053 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7054 (unsigned long long)pool, (unsigned long long)snap);
7055 return k;
7056 }
7057
// Encode the value blob for a purged-snap interval [snap, snap+num) and
// return the KV key it should be stored under.
//
// The key embeds the *last* snapid of the interval (snap + num - 1), not
// the first, so lookup_purged_snap() can locate the interval containing
// an arbitrary snapid with a single forward lower_bound scan.
//
// @param pool   pool the snaps belong to
// @param snap   first snapid of the purged interval
// @param num    number of snapids in the interval
// @param epoch  epoch recorded alongside the interval
// @param v      out: encoded value (begin snapid, end snapid, epoch)
// @return the key under which *v should be stored
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
7069
7070
// Look up the purged-snap interval containing `snap` for `pool` in the
// monitor's OSD_SNAP_PREFIX keyspace.
//
// Keys embed the interval's *last* snapid (see
// make_purged_snap_key_value), so a lower_bound on the key built from
// `snap` lands on the one record whose interval could contain it.
//
// @param pool   pool to search
// @param snap   snapid to locate
// @param begin  out: first snapid of the containing interval
// @param end    out: one past the last snapid of the interval
// @return 0 if an interval containing `snap` was found, -ENOENT otherwise
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // ran off the end of the keyspace: no record at or after `k`
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on some other key type stored under the same prefix
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // landed on a record belonging to a different pool
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value is (begin, end, epoch); we only need the interval bounds here
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // record exists but its [begin,end) interval does not cover snap
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7120
// Record that snaps [start,end) of `pool` were purged, coalescing with
// any adjacent purged intervals already stored in OSD_SNAP_PREFIX.
//
// Because each record's key embeds the interval's *last* snapid, merging
// with the earlier neighbor requires erasing its old key (keyed by
// before_end - 1), while merging with the later neighbor can simply
// overwrite that neighbor's key in place.
//
// NOTE(review): the `epoch` parameter is unused; pending_inc.epoch is
// recorded in the merged records instead — confirm this is intended.
//
// @param pool   pool whose snaps were purged
// @param start  first purged snapid
// @param end    one past the last purged snapid
// @param epoch  epoch of the purge (currently not recorded; see NOTE)
// @param t      transaction to stage the keyspace updates in
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // does an existing interval touch our left edge?  our right edge?
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends an earlier interval to the right; its key changes, so
    // erase the old record before writing the merged one
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends a later interval to the left; the last snapid (and hence
    // the key) is unchanged, so a plain overwrite suffices
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no adjacent intervals: write a fresh record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7176
// Prune snap intervals that the OSDs (via the mgr stat digest) report as
// fully purged, moving them into pending_inc.new_purged_snaps.  Work per
// epoch is bounded by mon_max_snap_prune_per_epoch.
//
// @return true if anything was staged into the pending incremental
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon.mgrstatmon()->is_readable()) {
    // no usable mgr stats digest yet
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // 0 means "no limit"; substitute a large cap
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	// per-epoch budget exhausted for this pool's candidates
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      // overall per-epoch budget exhausted; stop scanning pools
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7255
// Re-evaluate every pool's quota usage against mgr stats and toggle the
// FLAG_FULL_QUOTA / FLAG_FULL pending flags accordingly.
//
// @return true if any pool's pending flags were changed
bool OSDMonitor::update_pools_status()
{
  // need a readable mgr stat digest to know per-pool usage
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is quota-full when either byte or object quota is reached
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // pool is currently flagged; clear the flags once it drops below quota
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // pool is not flagged yet; flag it if it has now hit a quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7316
7317 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7318 {
7319 op->mark_osdmon_event(__func__);
7320 auto m = op->get_req<MPoolOp>();
7321 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7322 MonSession *session = op->get_session();
7323 if (!session)
7324 return -EPERM;
7325 string erasure_code_profile;
7326 stringstream ss;
7327 string rule_name;
7328 bool bulk = false;
7329 int ret = 0;
7330 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7331 0, 0, 0, 0, 0, 0, 0.0,
7332 erasure_code_profile,
7333 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7334 &ss);
7335
7336 if (ret < 0) {
7337 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7338 }
7339 return ret;
7340 }
7341
7342 int OSDMonitor::crush_rename_bucket(const string& srcname,
7343 const string& dstname,
7344 ostream *ss)
7345 {
7346 int ret;
7347 //
7348 // Avoid creating a pending crush if it does not already exists and
7349 // the rename would fail.
7350 //
7351 if (!_have_pending_crush()) {
7352 ret = _get_stable_crush().can_rename_bucket(srcname,
7353 dstname,
7354 ss);
7355 if (ret)
7356 return ret;
7357 }
7358
7359 CrushWrapper newcrush = _get_pending_crush();
7360
7361 ret = newcrush.rename_bucket(srcname,
7362 dstname,
7363 ss);
7364 if (ret)
7365 return ret;
7366
7367 pending_inc.crush.clear();
7368 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7369 *ss << "renamed bucket " << srcname << " into " << dstname;
7370 return 0;
7371 }
7372
7373 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7374 {
7375 string replacement = "";
7376
7377 if (plugin == "jerasure_generic" ||
7378 plugin == "jerasure_sse3" ||
7379 plugin == "jerasure_sse4" ||
7380 plugin == "jerasure_neon") {
7381 replacement = "jerasure";
7382 } else if (plugin == "shec_generic" ||
7383 plugin == "shec_sse3" ||
7384 plugin == "shec_sse4" ||
7385 plugin == "shec_neon") {
7386 replacement = "shec";
7387 }
7388
7389 if (replacement != "") {
7390 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7391 << plugin << " that has been deprecated. Please use "
7392 << replacement << " instead." << dendl;
7393 }
7394 }
7395
7396 int OSDMonitor::normalize_profile(const string& profilename,
7397 ErasureCodeProfile &profile,
7398 bool force,
7399 ostream *ss)
7400 {
7401 ErasureCodeInterfaceRef erasure_code;
7402 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7403 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7404 check_legacy_ec_plugin(plugin->second, profilename);
7405 int err = instance.factory(plugin->second,
7406 g_conf().get_val<std::string>("erasure_code_dir"),
7407 profile, &erasure_code, ss);
7408 if (err) {
7409 return err;
7410 }
7411
7412 err = erasure_code->init(profile, ss);
7413 if (err) {
7414 return err;
7415 }
7416
7417 auto it = profile.find("stripe_unit");
7418 if (it != profile.end()) {
7419 string err_str;
7420 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7421 if (!err_str.empty()) {
7422 *ss << "could not parse stripe_unit '" << it->second
7423 << "': " << err_str << std::endl;
7424 return -EINVAL;
7425 }
7426 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7427 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7428 if (chunk_size != stripe_unit) {
7429 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7430 << "alignment. Would be padded to " << chunk_size
7431 << std::endl;
7432 return -EINVAL;
7433 }
7434 if ((stripe_unit % 4096) != 0 && !force) {
7435 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7436 << "use --force to override this check" << std::endl;
7437 return -EINVAL;
7438 }
7439 }
7440 return 0;
7441 }
7442
// Find or create a crush rule named `name` for an erasure-code profile.
//
// Return-code protocol (callers depend on these distinctions):
//   -EEXIST   rule already exists in the committed map; *rule is set
//   -EALREADY rule exists only in the pending map; caller should wait
//   0         rule was created in the pending map; *rule is set
//   <0        other error (plugin load or rule creation failure)
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  // already committed?
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = ruleid;
    return -EEXIST;
  }

  CrushWrapper newcrush = _get_pending_crush();

  // already staged but not yet committed?
  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = ruleid;
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // create_rule returns the new rule id on success, <0 on error
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    // stage the modified crush map into the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    return 0;
  }
}
7478
7479 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7480 ErasureCodeInterfaceRef *erasure_code,
7481 ostream *ss) const
7482 {
7483 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7484 return -EAGAIN;
7485 ErasureCodeProfile profile =
7486 osdmap.get_erasure_code_profile(erasure_code_profile);
7487 ErasureCodeProfile::const_iterator plugin =
7488 profile.find("plugin");
7489 if (plugin == profile.end()) {
7490 *ss << "cannot determine the erasure code plugin"
7491 << " because there is no 'plugin' entry in the erasure_code_profile "
7492 << profile << std::endl;
7493 return -EINVAL;
7494 }
7495 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7496 auto& instance = ErasureCodePluginRegistry::instance();
7497 return instance.factory(plugin->second,
7498 g_conf().get_val<std::string>("erasure_code_dir"),
7499 profile, erasure_code, ss);
7500 }
7501
7502 int OSDMonitor::check_cluster_features(uint64_t features,
7503 stringstream &ss)
7504 {
7505 stringstream unsupported_ss;
7506 int unsupported_count = 0;
7507 if ((mon.get_quorum_con_features() & features) != features) {
7508 unsupported_ss << "the monitor cluster";
7509 ++unsupported_count;
7510 }
7511
7512 set<int32_t> up_osds;
7513 osdmap.get_up_osds(up_osds);
7514 for (set<int32_t>::iterator it = up_osds.begin();
7515 it != up_osds.end(); ++it) {
7516 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7517 if ((xi.features & features) != features) {
7518 if (unsupported_count > 0)
7519 unsupported_ss << ", ";
7520 unsupported_ss << "osd." << *it;
7521 unsupported_count ++;
7522 }
7523 }
7524
7525 if (unsupported_count > 0) {
7526 ss << "features " << features << " unsupported by: "
7527 << unsupported_ss.str();
7528 return -ENOTSUP;
7529 }
7530
7531 // check pending osd state, too!
7532 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7533 pending_inc.new_xinfo.begin();
7534 p != pending_inc.new_xinfo.end(); ++p) {
7535 const osd_xinfo_t &xi = p->second;
7536 if ((xi.features & features) != features) {
7537 dout(10) << __func__ << " pending osd." << p->first
7538 << " features are insufficient; retry" << dendl;
7539 return -EAGAIN;
7540 }
7541 }
7542
7543 return 0;
7544 }
7545
// Check that adopting `newcrush` would not require features beyond what
// the configured require_min_compat_client, the monitor quorum, and the
// up OSDs can support.  Works on a throwaway copy of the osdmap with the
// new crush applied; nothing is committed here.
//
// @param newcrush  candidate crush map
// @param ss        out: human-readable reason when returning false
// @return true if the new crush map is acceptable
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // build a scratch map: current pending incremental + the new crush
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7579
7580 bool OSDMonitor::erasure_code_profile_in_use(
7581 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7582 const string &profile,
7583 ostream *ss)
7584 {
7585 bool found = false;
7586 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7587 p != pools.end();
7588 ++p) {
7589 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7590 *ss << osdmap.pool_name[p->first] << " ";
7591 found = true;
7592 }
7593 }
7594 if (found) {
7595 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7596 }
7597 return found;
7598 }
7599
7600 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7601 map<string,string> *erasure_code_profile_map,
7602 ostream *ss)
7603 {
7604 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7605 get_json_str_map,
7606 *ss,
7607 erasure_code_profile_map,
7608 true);
7609 if (r)
7610 return r;
7611 ceph_assert((*erasure_code_profile_map).count("plugin"));
7612 string default_plugin = (*erasure_code_profile_map)["plugin"];
7613 map<string,string> user_map;
7614 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7615 i != erasure_code_profile.end();
7616 ++i) {
7617 size_t equal = i->find('=');
7618 if (equal == string::npos) {
7619 user_map[*i] = string();
7620 (*erasure_code_profile_map)[*i] = string();
7621 } else {
7622 const string key = i->substr(0, equal);
7623 equal++;
7624 const string value = i->substr(equal);
7625 if (key.find("ruleset-") == 0) {
7626 *ss << "property '" << key << "' is no longer supported; try "
7627 << "'crush-" << key.substr(8) << "' instead";
7628 return -EINVAL;
7629 }
7630 user_map[key] = value;
7631 (*erasure_code_profile_map)[key] = value;
7632 }
7633 }
7634
7635 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7636 (*erasure_code_profile_map) = user_map;
7637
7638 return 0;
7639 }
7640
7641 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7642 const string &erasure_code_profile,
7643 uint8_t repl_size,
7644 unsigned *size, unsigned *min_size,
7645 ostream *ss)
7646 {
7647 int err = 0;
7648 bool set_min_size = false;
7649 switch (pool_type) {
7650 case pg_pool_t::TYPE_REPLICATED:
7651 if (osdmap.stretch_mode_enabled) {
7652 if (repl_size == 0)
7653 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7654 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7655 *ss << "prepare_pool_size: we are in stretch mode but size "
7656 << repl_size << " does not match!";
7657 return -EINVAL;
7658 }
7659 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7660 set_min_size = true;
7661 }
7662 if (repl_size == 0) {
7663 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7664 }
7665 *size = repl_size;
7666 if (!set_min_size)
7667 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7668 break;
7669 case pg_pool_t::TYPE_ERASURE:
7670 {
7671 if (osdmap.stretch_mode_enabled) {
7672 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7673 return -EINVAL;
7674 }
7675 ErasureCodeInterfaceRef erasure_code;
7676 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7677 if (err == 0) {
7678 *size = erasure_code->get_chunk_count();
7679 *min_size =
7680 erasure_code->get_data_chunk_count() +
7681 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7682 assert(*min_size <= *size);
7683 assert(*min_size >= erasure_code->get_data_chunk_count());
7684 }
7685 }
7686 break;
7687 default:
7688 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7689 err = -EINVAL;
7690 break;
7691 }
7692 return err;
7693 }
7694
// Compute the stripe width for a new pool.  Only meaningful for erasure
// pools: width = data_chunks * chunk_size(stripe_unit * data_chunks),
// where stripe_unit comes from the profile if set, else from config.
// For replicated pools *stripe_width is left untouched.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile to consult for EC pools
// @param stripe_width          out: computed stripe width (EC only)
// @param ss                    out: human-readable error on failure
// @return 0 on success, negative error code otherwise
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      // profile's stripe_unit overrides the configured default
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	// profile was validated at creation time (normalize_profile)
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7733
7734 int OSDMonitor::get_replicated_stretch_crush_rule()
7735 {
7736 /* we don't write down the stretch rule anywhere, so
7737 * we have to guess it. How? Look at all the pools
7738 * and count up how many times a given rule is used
7739 * on stretch pools and then return the one with
7740 * the most users!
7741 */
7742 map<int,int> rule_counts;
7743 for (const auto& pooli : osdmap.pools) {
7744 const pg_pool_t& p = pooli.second;
7745 if (p.is_replicated() && p.is_stretch_pool()) {
7746 if (!rule_counts.count(p.crush_rule)) {
7747 rule_counts[p.crush_rule] = 1;
7748 } else {
7749 ++rule_counts[p.crush_rule];
7750 }
7751 }
7752 }
7753
7754 if (rule_counts.empty()) {
7755 return -ENOENT;
7756 }
7757
7758 int most_used_count = 0;
7759 int most_used_rule = -1;
7760 for (auto i : rule_counts) {
7761 if (i.second > most_used_count) {
7762 most_used_rule = i.first;
7763 most_used_count = i.second;
7764 }
7765 }
7766 ceph_assert(most_used_count > 0);
7767 ceph_assert(most_used_rule >= 0);
7768 return most_used_rule;
7769 }
7770
// Resolve the crush rule for a new pool.
//
// If *crush_rule >= 0 it is taken as an explicit rule id and merely
// validated.  Otherwise the rule is derived: for replicated pools from
// rule_name (or the default / stretch-mode rule when empty); for erasure
// pools by finding or creating a rule from the EC profile.
//
// Note the EC error-code translation: crush_rule_create_erasure()'s
// 0 and -EALREADY both become -EAGAIN (the new/pending rule must commit
// before the pool can be created), while -EEXIST becomes success.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile used to create EC rules
// @param rule_name             explicit rule name, "" for default
// @param crush_rule            in/out: rule id (<0 on input = derive it)
// @param ss                    out: human-readable error on failure
// @return 0 on success, -EAGAIN to retry after commit, else negative error
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // guess the stretch rule from existing stretch pools
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule already committed: usable right away
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // explicit rule id supplied; just validate it
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7835
7836 int OSDMonitor::get_crush_rule(const string &rule_name,
7837 int *crush_rule,
7838 ostream *ss)
7839 {
7840 int ret;
7841 ret = osdmap.crush->get_rule_id(rule_name);
7842 if (ret != -ENOENT) {
7843 // found it, use it
7844 *crush_rule = ret;
7845 } else {
7846 CrushWrapper newcrush = _get_pending_crush();
7847
7848 ret = newcrush.get_rule_id(rule_name);
7849 if (ret != -ENOENT) {
7850 // found it, wait for it to be proposed
7851 dout(20) << __func__ << ": rule " << rule_name
7852 << " try again" << dendl;
7853 return -EAGAIN;
7854 } else {
7855 // Cannot find it , return error
7856 *ss << "specified rule " << rule_name << " doesn't exist";
7857 return ret;
7858 }
7859 }
7860 return 0;
7861 }
7862
// Verify that creating a pool (pool < 0) or resizing an existing pool's
// pg_num would not push the projected cluster-wide PG count past
// mon_max_pg_per_osd * number-of-OSDs.
//
// When the PG mapping is current (mapping epoch >= osdmap epoch), the
// projection counts actual acting PGs on the OSDs reachable from
// crush_rule; otherwise it falls back to summing every pool's
// pg_num_target * size.
//
// @param pool        pool id being resized, or <0 for a new pool
// @param pg_num      requested pg_num
// @param size        pool size (replica / chunk count)
// @param crush_rule  rule whose OSDs the pool will map to
// @param ss          out: human-readable error on failure
// @return 0 if within bounds, -ERANGE otherwise
int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, int crush_rule, ostream *ss)
{
  auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
  uint64_t projected = 0;
  unsigned osd_num = 0;
  // assume min cluster size 3
  auto num_osds = std::max(osdmap.get_num_in_osds(), 3u);
  if (pool < 0) {
    // a new pool
    projected += pg_num * size;
  }
  if (mapping.get_epoch() >= osdmap.get_epoch()) {
    // PG mapping is up to date: count real acting PGs on the OSDs
    // reachable from this crush rule
    set<int> roots;
    CrushWrapper newcrush = _get_pending_crush();
    newcrush.find_takes_by_rule(crush_rule, &roots);
    int max_osd = osdmap.get_max_osd();
    for (auto root : roots) {
      const char *rootname = newcrush.get_item_name(root);
      set<int> osd_ids;
      newcrush.get_leaves(rootname, &osd_ids);
      unsigned out_osd = 0;
      for (auto id : osd_ids) {
	if (id > max_osd) {
	  // crush item beyond the osdmap's max osd id: not a usable OSD
	  out_osd++;
	  continue;
	}
	projected += mapping.get_osd_acting_pgs(id).size();
      }
      osd_num += osd_ids.size() - out_osd;
    }
    if (pool >= 0) {
      // update an existing pool's pg num
      // the crush-map walk above already counted this pool's current
      // PGs, so swap them out for the requested pg_num * size
      const auto& pg_info = osdmap.get_pools().at(pool);
      projected += pg_num * size;
      projected -= pg_info.get_pg_num_target() * pg_info.get_size();
    }
    num_osds = std::max(osd_num, 3u); // assume min cluster size 3
  } else {
    // use pg_num target for evaluating the projected pg num
    for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
      if (pool_id == pool) {
	projected += pg_num * size;
      } else {
	projected += pool_info.get_pg_num_target() * pool_info.get_size();
      }
    }
  }
  auto max_pgs = max_pgs_per_osd * num_osds;
  if (projected > max_pgs) {
    if (pool >= 0) {
      *ss << "pool id " << pool;
    }
    *ss << " pg_num " << pg_num << " size " << size
	<< " would mean " << projected
	<< " total pgs, which exceeds max " << max_pgs
	<< " (mon_max_pg_per_osd " << max_pgs_per_osd
	<< " * num_in_osds " << num_osds << ")";
    return -ERANGE;
  }
  return 0;
}
7926
/**
 * Build the pending incremental that creates a new pool.
 *
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min min pg_num
 * @param pg_num_max max pg_num
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes Target pool size for the pg autoscaler, or 0 to leave unset
 * @param target_size_ratio Target capacity ratio for the pg autoscaler, or <=0 to leave unset
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode pg autoscale mode; empty means use the configured default
 * @param bulk if true, flag the pool as bulk for the autoscaler
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 bool bulk,
				 ostream *ss)
{
  if (name.length() == 0)
    return -EINVAL;
  // Choose a default pg_num.  With the autoscaler "on" we deliberately
  // start at a single PG and let the autoscaler grow the pool; otherwise
  // fall back to the configured cluster default.
  if (pg_num == 0) {
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
      return mode == "on" ? 1 : pg_num;
    };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  // A zero default for pgp_num means "track pg_num".
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read only makes sense for erasure-coded pools.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  int r;
  // Resolve (or create) the crush rule for this pool type/profile.
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Derive size/min_size from repl_size or the EC profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Smoke-test the rule: try mapping a handful of inputs (x in [0,50))
  // in a forked child, bounded by the mon lease so a bad rule cannot
  // stall the monitor.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  // Make sure the new pool's PGs do not push us over mon_max_pg_per_osd
  // (-1 == "a new pool" for check_pg_num's accounting).
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the tri-state fast_read into a concrete bool (only relevant
  // for EC pools; replicated pools were rejected above if it was ON).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
        fread = false;
        break;
      case FAST_READ_ON:
        fread = true;
        break;
      case FAST_READ_DEFAULT:
        fread = g_conf()->osd_pool_default_ec_fast_read;
        break;
      default:
        *ss << "invalid fast_read setting: " << fast_read;
        return -EINVAL;
    }
  }

  // Idempotency: if a pool with this name is already pending creation in
  // this proposal, report success instead of creating a duplicate.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate the next pool id off the pending incremental.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  // FLAG_BULK comes from the explicit request or the cluster default.
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
      pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // Mark the pool as still creating; cleared once initial PGs exist.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // In stretch mode, constrain peering across the stretch buckets; in
  // degraded stretch mode halve size/min_size since only 2 zones are
  // supported.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Start from the cluster-default autoscale mode; may be overridden by
  // the per-pool request below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Cap the number of PGs actually created up front; pg_num_target keeps
  // the requested value and the mgr grows pg_num toward it later.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  // PG_NUM_MIN needs nautilus+, PG_NUM_MAX needs quincy+ (older OSDs do
  // not understand these pool opts).
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (stored as micro-units, hence * 1000000).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8181
8182 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8183 {
8184 op->mark_osdmon_event(__func__);
8185 ostringstream ss;
8186 if (pending_inc.new_flags < 0)
8187 pending_inc.new_flags = osdmap.get_flags();
8188 pending_inc.new_flags |= flag;
8189 ss << OSDMap::get_flag_string(flag) << " is set";
8190 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8191 get_last_committed() + 1));
8192 return true;
8193 }
8194
8195 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8196 {
8197 op->mark_osdmon_event(__func__);
8198 ostringstream ss;
8199 if (pending_inc.new_flags < 0)
8200 pending_inc.new_flags = osdmap.get_flags();
8201 pending_inc.new_flags &= ~flag;
8202 ss << OSDMap::get_flag_string(flag) << " is unset";
8203 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8204 get_last_committed() + 1));
8205 return true;
8206 }
8207
8208 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8209 stringstream& ss)
8210 {
8211 string poolstr;
8212 cmd_getval(cmdmap, "pool", poolstr);
8213 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8214 if (pool < 0) {
8215 ss << "unrecognized pool '" << poolstr << "'";
8216 return -ENOENT;
8217 }
8218 string var;
8219 cmd_getval(cmdmap, "var", var);
8220
8221 pg_pool_t p = *osdmap.get_pg_pool(pool);
8222 if (pending_inc.new_pools.count(pool))
8223 p = pending_inc.new_pools[pool];
8224
8225 // accept val as a json string in the normal case (current
8226 // generation monitor). parse out int or float values from the
8227 // string as needed. however, if it is not a string, try to pull
8228 // out an int, in case an older monitor with an older json schema is
8229 // forwarding a request.
8230 string val;
8231 string interr, floaterr;
8232 int64_t n = 0;
8233 double f = 0;
8234 int64_t uf = 0; // micro-f
8235 cmd_getval(cmdmap, "val", val);
8236
8237 auto si_options = {
8238 "target_max_objects"
8239 };
8240 auto iec_options = {
8241 "target_max_bytes",
8242 "target_size_bytes",
8243 "compression_max_blob_size",
8244 "compression_min_blob_size",
8245 "csum_max_block",
8246 "csum_min_block",
8247 };
8248 if (count(begin(si_options), end(si_options), var)) {
8249 n = strict_si_cast<int64_t>(val, &interr);
8250 } else if (count(begin(iec_options), end(iec_options), var)) {
8251 n = strict_iec_cast<int64_t>(val, &interr);
8252 } else {
8253 // parse string as both int and float; different fields use different types.
8254 n = strict_strtoll(val.c_str(), 10, &interr);
8255 f = strict_strtod(val.c_str(), &floaterr);
8256 uf = llrintl(f * (double)1000000.0);
8257 }
8258
8259 if (!p.is_tier() &&
8260 (var == "hit_set_type" || var == "hit_set_period" ||
8261 var == "hit_set_count" || var == "hit_set_fpp" ||
8262 var == "target_max_objects" || var == "target_max_bytes" ||
8263 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8264 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8265 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8266 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8267 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8268 return -EACCES;
8269 }
8270
8271 if (var == "size") {
8272 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8273 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8274 return -EPERM;
8275 }
8276 if (p.type == pg_pool_t::TYPE_ERASURE) {
8277 ss << "can not change the size of an erasure-coded pool";
8278 return -ENOTSUP;
8279 }
8280 if (interr.length()) {
8281 ss << "error parsing integer value '" << val << "': " << interr;
8282 return -EINVAL;
8283 }
8284 if (n <= 0 || n > 10) {
8285 ss << "pool size must be between 1 and 10";
8286 return -EINVAL;
8287 }
8288 if (n == 1) {
8289 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8290 ss << "configuring pool size as 1 is disabled by default.";
8291 return -EPERM;
8292 }
8293 bool sure = false;
8294 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8295 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8296 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8297 "pass the flag --yes-i-really-mean-it.";
8298 return -EPERM;
8299 }
8300 }
8301 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8302 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8303 return -EINVAL;
8304 }
8305 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8306 if (r < 0) {
8307 return r;
8308 }
8309 p.size = n;
8310 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8311 } else if (var == "min_size") {
8312 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8313 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8314 return -EPERM;
8315 }
8316 if (interr.length()) {
8317 ss << "error parsing integer value '" << val << "': " << interr;
8318 return -EINVAL;
8319 }
8320
8321 if (p.type != pg_pool_t::TYPE_ERASURE) {
8322 if (n < 1 || n > p.size) {
8323 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8324 return -EINVAL;
8325 }
8326 } else {
8327 ErasureCodeInterfaceRef erasure_code;
8328 int k;
8329 stringstream tmp;
8330 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8331 if (err == 0) {
8332 k = erasure_code->get_data_chunk_count();
8333 } else {
8334 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8335 return err;
8336 }
8337
8338 if (n < k || n > p.size) {
8339 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8340 return -EINVAL;
8341 }
8342 }
8343 p.min_size = n;
8344 } else if (var == "pg_num_actual") {
8345 if (interr.length()) {
8346 ss << "error parsing integer value '" << val << "': " << interr;
8347 return -EINVAL;
8348 }
8349 if (n == (int)p.get_pg_num()) {
8350 return 0;
8351 }
8352 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8353 ss << "'pg_num' must be greater than 0 and less than or equal to "
8354 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8355 << " (you may adjust 'mon max pool pg num' for higher values)";
8356 return -ERANGE;
8357 }
8358 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8359 ss << "cannot adjust pg_num while initial PGs are being created";
8360 return -EBUSY;
8361 }
8362 if (n > (int)p.get_pg_num()) {
8363 if (p.get_pg_num() != p.get_pg_num_pending()) {
8364 // force pre-nautilus clients to resend their ops, since they
8365 // don't understand pg_num_pending changes form a new interval
8366 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8367 }
8368 p.set_pg_num(n);
8369 } else {
8370 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8371 ss << "nautilus OSDs are required to adjust pg_num_pending";
8372 return -EPERM;
8373 }
8374 if (n < (int)p.get_pgp_num()) {
8375 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8376 return -EINVAL;
8377 }
8378 if (n < (int)p.get_pg_num() - 1) {
8379 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8380 << ") - 1; only single pg decrease is currently supported";
8381 return -EINVAL;
8382 }
8383 p.set_pg_num_pending(n);
8384 // force pre-nautilus clients to resend their ops, since they
8385 // don't understand pg_num_pending changes form a new interval
8386 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8387 }
8388 // force pre-luminous clients to resend their ops, since they
8389 // don't understand that split PGs now form a new interval.
8390 p.last_force_op_resend_preluminous = pending_inc.epoch;
8391 } else if (var == "pg_num") {
8392 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8393 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8394 return -EPERM;
8395 }
8396 if (interr.length()) {
8397 ss << "error parsing integer value '" << val << "': " << interr;
8398 return -EINVAL;
8399 }
8400 if (n == (int)p.get_pg_num_target()) {
8401 return 0;
8402 }
8403 if (n <= 0 || static_cast<uint64_t>(n) >
8404 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8405 ss << "'pg_num' must be greater than 0 and less than or equal to "
8406 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8407 << " (you may adjust 'mon max pool pg num' for higher values)";
8408 return -ERANGE;
8409 }
8410 if (n > (int)p.get_pg_num_target()) {
8411 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8412 if (r) {
8413 return r;
8414 }
8415 bool force = false;
8416 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8417 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8418 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8419 return -EPERM;
8420 }
8421 } else {
8422 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8423 ss << "nautilus OSDs are required to decrease pg_num";
8424 return -EPERM;
8425 }
8426 }
8427 int64_t pg_min = 0, pg_max = 0;
8428 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8429 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8430 if (pg_min && n < pg_min) {
8431 ss << "specified pg_num " << n
8432 << " < pg_num_min " << pg_min;
8433 return -EINVAL;
8434 }
8435 if (pg_max && n > pg_max) {
8436 ss << "specified pg_num " << n
8437 << " < pg_num_max " << pg_max;
8438 return -EINVAL;
8439 }
8440 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8441 // pre-nautilus osdmap format; increase pg_num directly
8442 assert(n > (int)p.get_pg_num());
8443 // force pre-nautilus clients to resend their ops, since they
8444 // don't understand pg_num_target changes form a new interval
8445 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8446 // force pre-luminous clients to resend their ops, since they
8447 // don't understand that split PGs now form a new interval.
8448 p.last_force_op_resend_preluminous = pending_inc.epoch;
8449 p.set_pg_num(n);
8450 } else {
8451 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8452 // make pgp_num track pg_num if it already matches. if it is set
8453 // differently, leave it different and let the user control it
8454 // manually.
8455 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8456 p.set_pgp_num_target(n);
8457 }
8458 p.set_pg_num_target(n);
8459 }
8460 } else if (var == "pgp_num_actual") {
8461 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8462 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8463 return -EPERM;
8464 }
8465 if (interr.length()) {
8466 ss << "error parsing integer value '" << val << "': " << interr;
8467 return -EINVAL;
8468 }
8469 if (n <= 0) {
8470 ss << "specified pgp_num must > 0, but you set to " << n;
8471 return -EINVAL;
8472 }
8473 if (n > (int)p.get_pg_num()) {
8474 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8475 return -EINVAL;
8476 }
8477 if (n > (int)p.get_pg_num_pending()) {
8478 ss << "specified pgp_num " << n
8479 << " > pg_num_pending " << p.get_pg_num_pending();
8480 return -EINVAL;
8481 }
8482 p.set_pgp_num(n);
8483 } else if (var == "pgp_num") {
8484 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8485 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8486 return -EPERM;
8487 }
8488 if (interr.length()) {
8489 ss << "error parsing integer value '" << val << "': " << interr;
8490 return -EINVAL;
8491 }
8492 if (n <= 0) {
8493 ss << "specified pgp_num must > 0, but you set to " << n;
8494 return -EINVAL;
8495 }
8496 if (n > (int)p.get_pg_num_target()) {
8497 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8498 return -EINVAL;
8499 }
8500 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8501 // pre-nautilus osdmap format; increase pgp_num directly
8502 p.set_pgp_num(n);
8503 } else {
8504 p.set_pgp_num_target(n);
8505 }
8506 } else if (var == "pg_autoscale_mode") {
8507 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8508 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8509 ss << "specified invalid mode " << val;
8510 return -EINVAL;
8511 }
8512 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8513 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8514 return -EINVAL;
8515 }
8516 p.pg_autoscale_mode = m;
8517 } else if (var == "crush_rule") {
8518 int id = osdmap.crush->get_rule_id(val);
8519 if (id == -ENOENT) {
8520 ss << "crush rule " << val << " does not exist";
8521 return -ENOENT;
8522 }
8523 if (id < 0) {
8524 ss << cpp_strerror(id);
8525 return -ENOENT;
8526 }
8527 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8528 ss << "crush rule " << id << " type does not match pool";
8529 return -EINVAL;
8530 }
8531 p.crush_rule = id;
8532 } else if (var == "nodelete" || var == "nopgchange" ||
8533 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8534 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8535 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8536 // make sure we only compare against 'n' if we didn't receive a string
8537 if (val == "true" || (interr.empty() && n == 1)) {
8538 p.set_flag(flag);
8539 } else if (val == "false" || (interr.empty() && n == 0)) {
8540 p.unset_flag(flag);
8541 } else {
8542 ss << "expecting value 'true', 'false', '0', or '1'";
8543 return -EINVAL;
8544 }
8545 } else if (var == "eio") {
8546 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8547
8548 // make sure we only compare against 'n' if we didn't receive a string
8549 if (val == "true" || (interr.empty() && n == 1)) {
8550 p.set_flag(flag);
8551 } else if (val == "false" || (interr.empty() && n == 0)) {
8552 p.unset_flag(flag);
8553 } else {
8554 ss << "expecting value 'true', 'false', '0', or '1'";
8555 return -EINVAL;
8556 }
8557 } else if (var == "hashpspool") {
8558 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8559 bool force = false;
8560 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8561
8562 if (!force) {
8563 ss << "are you SURE? this will remap all placement groups in this pool,"
8564 " this triggers large data movement,"
8565 " pass --yes-i-really-mean-it if you really do.";
8566 return -EPERM;
8567 }
8568 // make sure we only compare against 'n' if we didn't receive a string
8569 if (val == "true" || (interr.empty() && n == 1)) {
8570 p.set_flag(flag);
8571 } else if (val == "false" || (interr.empty() && n == 0)) {
8572 p.unset_flag(flag);
8573 } else {
8574 ss << "expecting value 'true', 'false', '0', or '1'";
8575 return -EINVAL;
8576 }
8577 } else if (var == "hit_set_type") {
8578 if (val == "none")
8579 p.hit_set_params = HitSet::Params();
8580 else {
8581 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8582 if (err)
8583 return err;
8584 if (val == "bloom") {
8585 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8586 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8587 p.hit_set_params = HitSet::Params(bsp);
8588 } else if (val == "explicit_hash")
8589 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8590 else if (val == "explicit_object")
8591 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8592 else {
8593 ss << "unrecognized hit_set type '" << val << "'";
8594 return -EINVAL;
8595 }
8596 }
8597 } else if (var == "hit_set_period") {
8598 if (interr.length()) {
8599 ss << "error parsing integer value '" << val << "': " << interr;
8600 return -EINVAL;
8601 } else if (n < 0) {
8602 ss << "hit_set_period should be non-negative";
8603 return -EINVAL;
8604 }
8605 p.hit_set_period = n;
8606 } else if (var == "hit_set_count") {
8607 if (interr.length()) {
8608 ss << "error parsing integer value '" << val << "': " << interr;
8609 return -EINVAL;
8610 } else if (n < 0) {
8611 ss << "hit_set_count should be non-negative";
8612 return -EINVAL;
8613 }
8614 p.hit_set_count = n;
8615 } else if (var == "hit_set_fpp") {
8616 if (floaterr.length()) {
8617 ss << "error parsing floating point value '" << val << "': " << floaterr;
8618 return -EINVAL;
8619 } else if (f < 0 || f > 1.0) {
8620 ss << "hit_set_fpp should be in the range 0..1";
8621 return -EINVAL;
8622 }
8623 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8624 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8625 return -EINVAL;
8626 }
8627 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8628 bloomp->set_fpp(f);
8629 } else if (var == "use_gmt_hitset") {
8630 if (val == "true" || (interr.empty() && n == 1)) {
8631 p.use_gmt_hitset = true;
8632 } else {
8633 ss << "expecting value 'true' or '1'";
8634 return -EINVAL;
8635 }
8636 } else if (var == "allow_ec_overwrites") {
8637 if (!p.is_erasure()) {
8638 ss << "ec overwrites can only be enabled for an erasure coded pool";
8639 return -EINVAL;
8640 }
8641 stringstream err;
8642 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8643 !is_pool_currently_all_bluestore(pool, p, &err)) {
8644 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8645 return -EINVAL;
8646 }
8647 if (val == "true" || (interr.empty() && n == 1)) {
8648 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8649 } else if (val == "false" || (interr.empty() && n == 0)) {
8650 ss << "ec overwrites cannot be disabled once enabled";
8651 return -EINVAL;
8652 } else {
8653 ss << "expecting value 'true', 'false', '0', or '1'";
8654 return -EINVAL;
8655 }
8656 } else if (var == "target_max_objects") {
8657 if (interr.length()) {
8658 ss << "error parsing int '" << val << "': " << interr;
8659 return -EINVAL;
8660 }
8661 p.target_max_objects = n;
8662 } else if (var == "target_max_bytes") {
8663 if (interr.length()) {
8664 ss << "error parsing int '" << val << "': " << interr;
8665 return -EINVAL;
8666 }
8667 p.target_max_bytes = n;
8668 } else if (var == "cache_target_dirty_ratio") {
8669 if (floaterr.length()) {
8670 ss << "error parsing float '" << val << "': " << floaterr;
8671 return -EINVAL;
8672 }
8673 if (f < 0 || f > 1.0) {
8674 ss << "value must be in the range 0..1";
8675 return -ERANGE;
8676 }
8677 p.cache_target_dirty_ratio_micro = uf;
8678 } else if (var == "cache_target_dirty_high_ratio") {
8679 if (floaterr.length()) {
8680 ss << "error parsing float '" << val << "': " << floaterr;
8681 return -EINVAL;
8682 }
8683 if (f < 0 || f > 1.0) {
8684 ss << "value must be in the range 0..1";
8685 return -ERANGE;
8686 }
8687 p.cache_target_dirty_high_ratio_micro = uf;
8688 } else if (var == "cache_target_full_ratio") {
8689 if (floaterr.length()) {
8690 ss << "error parsing float '" << val << "': " << floaterr;
8691 return -EINVAL;
8692 }
8693 if (f < 0 || f > 1.0) {
8694 ss << "value must be in the range 0..1";
8695 return -ERANGE;
8696 }
8697 p.cache_target_full_ratio_micro = uf;
8698 } else if (var == "cache_min_flush_age") {
8699 if (interr.length()) {
8700 ss << "error parsing int '" << val << "': " << interr;
8701 return -EINVAL;
8702 }
8703 p.cache_min_flush_age = n;
8704 } else if (var == "cache_min_evict_age") {
8705 if (interr.length()) {
8706 ss << "error parsing int '" << val << "': " << interr;
8707 return -EINVAL;
8708 }
8709 p.cache_min_evict_age = n;
8710 } else if (var == "min_read_recency_for_promote") {
8711 if (interr.length()) {
8712 ss << "error parsing integer value '" << val << "': " << interr;
8713 return -EINVAL;
8714 }
8715 p.min_read_recency_for_promote = n;
8716 } else if (var == "hit_set_grade_decay_rate") {
8717 if (interr.length()) {
8718 ss << "error parsing integer value '" << val << "': " << interr;
8719 return -EINVAL;
8720 }
8721 if (n > 100 || n < 0) {
8722 ss << "value out of range,valid range is 0 - 100";
8723 return -EINVAL;
8724 }
8725 p.hit_set_grade_decay_rate = n;
8726 } else if (var == "hit_set_search_last_n") {
8727 if (interr.length()) {
8728 ss << "error parsing integer value '" << val << "': " << interr;
8729 return -EINVAL;
8730 }
8731 if (n > p.hit_set_count || n < 0) {
8732 ss << "value out of range,valid range is 0 - hit_set_count";
8733 return -EINVAL;
8734 }
8735 p.hit_set_search_last_n = n;
8736 } else if (var == "min_write_recency_for_promote") {
8737 if (interr.length()) {
8738 ss << "error parsing integer value '" << val << "': " << interr;
8739 return -EINVAL;
8740 }
8741 p.min_write_recency_for_promote = n;
8742 } else if (var == "fast_read") {
8743 if (p.is_replicated()) {
8744 ss << "fast read is not supported in replication pool";
8745 return -EINVAL;
8746 }
8747 if (val == "true" || (interr.empty() && n == 1)) {
8748 p.fast_read = true;
8749 } else if (val == "false" || (interr.empty() && n == 0)) {
8750 p.fast_read = false;
8751 } else {
8752 ss << "expecting value 'true', 'false', '0', or '1'";
8753 return -EINVAL;
8754 }
8755 } else if (pool_opts_t::is_opt_name(var)) {
8756 bool unset = val == "unset";
8757 if (var == "compression_mode") {
8758 if (!unset) {
8759 auto cmode = Compressor::get_comp_mode_type(val);
8760 if (!cmode) {
8761 ss << "unrecognized compression mode '" << val << "'";
8762 return -EINVAL;
8763 }
8764 }
8765 } else if (var == "compression_algorithm") {
8766 if (!unset) {
8767 auto alg = Compressor::get_comp_alg_type(val);
8768 if (!alg) {
8769 ss << "unrecognized compression_algorithm '" << val << "'";
8770 return -EINVAL;
8771 }
8772 }
8773 } else if (var == "compression_required_ratio") {
8774 if (floaterr.length()) {
8775 ss << "error parsing float value '" << val << "': " << floaterr;
8776 return -EINVAL;
8777 }
8778 if (f < 0 || f > 1) {
8779 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8780 return -EINVAL;
8781 }
8782 } else if (var == "csum_type") {
8783 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8784 if (t < 0 ) {
8785 ss << "unrecognized csum_type '" << val << "'";
8786 return -EINVAL;
8787 }
8788 //preserve csum_type numeric value
8789 n = t;
8790 interr.clear();
8791 } else if (var == "compression_max_blob_size" ||
8792 var == "compression_min_blob_size" ||
8793 var == "csum_max_block" ||
8794 var == "csum_min_block") {
8795 if (interr.length()) {
8796 ss << "error parsing int value '" << val << "': " << interr;
8797 return -EINVAL;
8798 }
8799 } else if (var == "fingerprint_algorithm") {
8800 if (!unset) {
8801 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8802 if (!alg) {
8803 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8804 return -EINVAL;
8805 }
8806 }
8807 } else if (var == "target_size_bytes") {
8808 if (interr.length()) {
8809 ss << "error parsing unit value '" << val << "': " << interr;
8810 return -EINVAL;
8811 }
8812 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8813 ss << "must set require_osd_release to nautilus or "
8814 << "later before setting target_size_bytes";
8815 return -EINVAL;
8816 }
8817 } else if (var == "pg_num_min") {
8818 if (interr.length()) {
8819 ss << "error parsing int value '" << val << "': " << interr;
8820 return -EINVAL;
8821 }
8822 if (n > (int)p.get_pg_num_target()) {
8823 ss << "specified pg_num_min " << n
8824 << " > pg_num " << p.get_pg_num_target();
8825 return -EINVAL;
8826 }
8827 } else if (var == "pg_num_max") {
8828 if (interr.length()) {
8829 ss << "error parsing int value '" << val << "': " << interr;
8830 return -EINVAL;
8831 }
8832 if (n && n < (int)p.get_pg_num_target()) {
8833 ss << "specified pg_num_max " << n
8834 << " < pg_num " << p.get_pg_num_target();
8835 return -EINVAL;
8836 }
8837 } else if (var == "recovery_priority") {
8838 if (interr.length()) {
8839 ss << "error parsing int value '" << val << "': " << interr;
8840 return -EINVAL;
8841 }
8842 if (!g_conf()->debug_allow_any_pool_priority) {
8843 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8844 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8845 << " and " << OSD_POOL_PRIORITY_MAX;
8846 return -EINVAL;
8847 }
8848 }
8849 } else if (var == "pg_autoscale_bias") {
8850 if (f < 0.0 || f > 1000.0) {
8851 ss << "pg_autoscale_bias must be between 0 and 1000";
8852 return -EINVAL;
8853 }
8854 } else if (var == "dedup_tier") {
8855 if (interr.empty()) {
8856 ss << "expecting value 'pool name'";
8857 return -EINVAL;
8858 }
8859 // Current base tier in dedup does not support ec pool
8860 if (p.is_erasure()) {
8861 ss << "pool '" << poolstr
8862 << "' is an ec pool, which cannot be a base tier";
8863 return -ENOTSUP;
8864 }
8865 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8866 if (lowtierpool_id < 0) {
8867 ss << "unrecognized pool '" << val << "'";
8868 return -ENOENT;
8869 }
8870 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8871 ceph_assert(tp);
8872 n = lowtierpool_id;
8873 // The original input is string (pool name), but we convert it to int64_t.
8874 // So, clear interr
8875 interr.clear();
8876 } else if (var == "dedup_chunk_algorithm") {
8877 if (!unset) {
8878 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8879 if (!alg) {
8880 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8881 return -EINVAL;
8882 }
8883 }
8884 } else if (var == "dedup_cdc_chunk_size") {
8885 if (interr.length()) {
8886 ss << "error parsing int value '" << val << "': " << interr;
8887 return -EINVAL;
8888 }
8889 }
8890
8891 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8892 switch (desc.type) {
8893 case pool_opts_t::STR:
8894 if (unset) {
8895 p.opts.unset(desc.key);
8896 } else {
8897 p.opts.set(desc.key, static_cast<std::string>(val));
8898 }
8899 break;
8900 case pool_opts_t::INT:
8901 if (interr.length()) {
8902 ss << "error parsing integer value '" << val << "': " << interr;
8903 return -EINVAL;
8904 }
8905 if (n == 0) {
8906 p.opts.unset(desc.key);
8907 } else {
8908 p.opts.set(desc.key, static_cast<int64_t>(n));
8909 }
8910 break;
8911 case pool_opts_t::DOUBLE:
8912 if (floaterr.length()) {
8913 ss << "error parsing floating point value '" << val << "': " << floaterr;
8914 return -EINVAL;
8915 }
8916 if (f == 0) {
8917 p.opts.unset(desc.key);
8918 } else {
8919 p.opts.set(desc.key, static_cast<double>(f));
8920 }
8921 break;
8922 default:
8923 ceph_assert(!"unknown type");
8924 }
8925 } else {
8926 ss << "unrecognized variable '" << var << "'";
8927 return -EINVAL;
8928 }
8929 if (val != "unset") {
8930 ss << "set pool " << pool << " " << var << " to " << val;
8931 } else {
8932 ss << "unset pool " << pool << " " << var;
8933 }
8934 p.last_change = pending_inc.epoch;
8935 pending_inc.new_pools[pool] = p;
8936 return 0;
8937 }
8938
8939 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8940 const cmdmap_t& cmdmap,
8941 stringstream& ss)
8942 {
8943 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8944 }
8945
8946 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8947 const cmdmap_t& cmdmap,
8948 stringstream& ss,
8949 bool *modified)
8950 {
8951 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8952 }
8953
8954
8955 /**
8956 * Common logic for preprocess and prepare phases of pool application
8957 * tag commands. In preprocess mode we're only detecting invalid
8958 * commands, and determining whether it was a modification or a no-op.
8959 * In prepare mode we're actually updating the pending state.
8960 */
8961 int OSDMonitor::_command_pool_application(const string &prefix,
8962 const cmdmap_t& cmdmap,
8963 stringstream& ss,
8964 bool *modified,
8965 bool preparing)
8966 {
8967 string pool_name;
8968 cmd_getval(cmdmap, "pool", pool_name);
8969 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8970 if (pool < 0) {
8971 ss << "unrecognized pool '" << pool_name << "'";
8972 return -ENOENT;
8973 }
8974
8975 pg_pool_t p = *osdmap.get_pg_pool(pool);
8976 if (preparing) {
8977 if (pending_inc.new_pools.count(pool)) {
8978 p = pending_inc.new_pools[pool];
8979 }
8980 }
8981
8982 string app;
8983 cmd_getval(cmdmap, "app", app);
8984 bool app_exists = (p.application_metadata.count(app) > 0);
8985
8986 string key;
8987 cmd_getval(cmdmap, "key", key);
8988 if (key == "all") {
8989 ss << "key cannot be 'all'";
8990 return -EINVAL;
8991 }
8992
8993 string value;
8994 cmd_getval(cmdmap, "value", value);
8995 if (value == "all") {
8996 ss << "value cannot be 'all'";
8997 return -EINVAL;
8998 }
8999
9000 if (boost::algorithm::ends_with(prefix, "enable")) {
9001 if (app.empty()) {
9002 ss << "application name must be provided";
9003 return -EINVAL;
9004 }
9005
9006 if (p.is_tier()) {
9007 ss << "application must be enabled on base tier";
9008 return -EINVAL;
9009 }
9010
9011 bool force = false;
9012 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9013
9014 if (!app_exists && !p.application_metadata.empty() && !force) {
9015 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9016 << "application; pass --yes-i-really-mean-it to proceed anyway";
9017 return -EPERM;
9018 }
9019
9020 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9021 ss << "too many enabled applications on pool '" << pool_name << "'; "
9022 << "max " << MAX_POOL_APPLICATIONS;
9023 return -EINVAL;
9024 }
9025
9026 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9027 ss << "application name '" << app << "' too long; max length "
9028 << MAX_POOL_APPLICATION_LENGTH;
9029 return -EINVAL;
9030 }
9031
9032 if (!app_exists) {
9033 p.application_metadata[app] = {};
9034 }
9035 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9036
9037 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9038 bool force = false;
9039 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9040
9041 if (!force) {
9042 ss << "Are you SURE? Disabling an application within a pool might result "
9043 << "in loss of application functionality; pass "
9044 << "--yes-i-really-mean-it to proceed anyway";
9045 return -EPERM;
9046 }
9047
9048 if (!app_exists) {
9049 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9050 << "'";
9051 return 0; // idempotent
9052 }
9053
9054 p.application_metadata.erase(app);
9055 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9056
9057 } else if (boost::algorithm::ends_with(prefix, "set")) {
9058 if (p.is_tier()) {
9059 ss << "application metadata must be set on base tier";
9060 return -EINVAL;
9061 }
9062
9063 if (!app_exists) {
9064 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9065 << "'";
9066 return -ENOENT;
9067 }
9068
9069 string key;
9070 cmd_getval(cmdmap, "key", key);
9071
9072 if (key.empty()) {
9073 ss << "key must be provided";
9074 return -EINVAL;
9075 }
9076
9077 auto &app_keys = p.application_metadata[app];
9078 if (app_keys.count(key) == 0 &&
9079 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9080 ss << "too many keys set for application '" << app << "' on pool '"
9081 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9082 return -EINVAL;
9083 }
9084
9085 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9086 ss << "key '" << app << "' too long; max length "
9087 << MAX_POOL_APPLICATION_LENGTH;
9088 return -EINVAL;
9089 }
9090
9091 string value;
9092 cmd_getval(cmdmap, "value", value);
9093 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9094 ss << "value '" << value << "' too long; max length "
9095 << MAX_POOL_APPLICATION_LENGTH;
9096 return -EINVAL;
9097 }
9098
9099 p.application_metadata[app][key] = value;
9100 ss << "set application '" << app << "' key '" << key << "' to '"
9101 << value << "' on pool '" << pool_name << "'";
9102 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9103 if (!app_exists) {
9104 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9105 << "'";
9106 return -ENOENT;
9107 }
9108
9109 string key;
9110 cmd_getval(cmdmap, "key", key);
9111 auto it = p.application_metadata[app].find(key);
9112 if (it == p.application_metadata[app].end()) {
9113 ss << "application '" << app << "' on pool '" << pool_name
9114 << "' does not have key '" << key << "'";
9115 return 0; // idempotent
9116 }
9117
9118 p.application_metadata[app].erase(it);
9119 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9120 << pool_name << "'";
9121 } else {
9122 ceph_abort();
9123 }
9124
9125 if (preparing) {
9126 p.last_change = pending_inc.epoch;
9127 pending_inc.new_pools[pool] = p;
9128 }
9129
9130 // Because we fell through this far, we didn't hit no-op cases,
9131 // so pool was definitely modified
9132 if (modified != nullptr) {
9133 *modified = true;
9134 }
9135
9136 return 0;
9137 }
9138
9139 int OSDMonitor::_prepare_command_osd_crush_remove(
9140 CrushWrapper &newcrush,
9141 int32_t id,
9142 int32_t ancestor,
9143 bool has_ancestor,
9144 bool unlink_only)
9145 {
9146 int err = 0;
9147
9148 if (has_ancestor) {
9149 err = newcrush.remove_item_under(cct, id, ancestor,
9150 unlink_only);
9151 } else {
9152 err = newcrush.remove_item(cct, id, unlink_only);
9153 }
9154 return err;
9155 }
9156
9157 void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
9158 {
9159 pending_inc.crush.clear();
9160 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
9161 }
9162
9163 int OSDMonitor::prepare_command_osd_crush_remove(
9164 CrushWrapper &newcrush,
9165 int32_t id,
9166 int32_t ancestor,
9167 bool has_ancestor,
9168 bool unlink_only)
9169 {
9170 int err = _prepare_command_osd_crush_remove(
9171 newcrush, id, ancestor,
9172 has_ancestor, unlink_only);
9173
9174 if (err < 0)
9175 return err;
9176
9177 ceph_assert(err == 0);
9178 do_osd_crush_remove(newcrush);
9179
9180 return 0;
9181 }
9182
9183 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9184 {
9185 if (osdmap.is_up(id)) {
9186 return -EBUSY;
9187 }
9188
9189 pending_inc.new_state[id] = osdmap.get_state(id);
9190 pending_inc.new_uuid[id] = uuid_d();
9191 pending_metadata_rm.insert(id);
9192 pending_metadata.erase(id);
9193
9194 return 0;
9195 }
9196
9197 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9198 {
9199 ceph_assert(existing_id);
9200 *existing_id = -1;
9201
9202 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9203 if (!osdmap.exists(i) &&
9204 pending_inc.new_up_client.count(i) == 0 &&
9205 (pending_inc.new_state.count(i) == 0 ||
9206 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9207 *existing_id = i;
9208 return -1;
9209 }
9210 }
9211
9212 if (pending_inc.new_max_osd < 0) {
9213 return osdmap.get_max_osd();
9214 }
9215 return pending_inc.new_max_osd;
9216 }
9217
/**
 * Register a new (or matching existing) osd in the pending map and
 * optionally assign its crush device class.
 *
 * Validation is presumed to have been performed by the caller (see
 * validate_osd_create()); invariant violations assert.
 *
 * @param id            desired osd id, or -1 to pick/allocate one
 * @param uuid          osd uuid; may be zero (legacy `osd create`)
 * @param device_class  crush device class to assign, or empty for none
 * @param new_id        out: the id actually used (must be non-null)
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; reuse that id (it must match any
      // explicitly requested id)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
	   << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // a free id below max_osd was found; reuse it
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage a crush update assigning the requested device class
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // grow max_osd if neither the committed map nor the pending update
  // already covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9307
/**
 * Validate whether a new osd can be created with the given id/uuid.
 *
 * Note the mixed return convention:
 * @return 0        creation may proceed (or there was nothing to validate);
 *         EEXIST   (positive!) osd already exists and matches -- an
 *                  idempotent case; *existing_id is set to its id;
 *         -EEXIST  uuid already in use by a different id;
 *         -EINVAL  id already in use, with a different uuid;
 *         -EAGAIN  a matching osd is pending in the current proposal.
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
	   << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
	 << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9378
9379 int OSDMonitor::prepare_command_osd_create(
9380 const int32_t id,
9381 const uuid_d& uuid,
9382 int32_t* existing_id,
9383 stringstream& ss)
9384 {
9385 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9386 ceph_assert(existing_id);
9387 if (osdmap.is_destroyed(id)) {
9388 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9389 "instead.";
9390 return -EINVAL;
9391 }
9392
9393 if (uuid.is_zero()) {
9394 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9395 }
9396
9397 return validate_osd_create(id, uuid, true, existing_id, ss);
9398 }
9399
/**
 * Handle `osd new`: create a brand new osd, or recreate a previously
 * destroyed one, optionally registering cephx/lockbox secrets and a
 * dm-crypt key supplied via @p params (e.g. from `-i secrets.json`).
 *
 * Requires paxos to be plugged by the caller, since the auth and
 * config-key (kv) services may also be updated here.
 *
 * @param op      originating monitor command request
 * @param cmdmap  parsed command args: `uuid` required, `id` optional
 * @param params  optional secrets / crush_device_class key-value pairs
 * @param ss      plain-text output (the osd id on success)
 * @param f       optional formatter for structured output
 * @return 0 on success, positive EEXIST for an idempotent no-op,
 *         negative errno on error
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
				&existing_id, ss);

  // note: validate_osd_create() returns *positive* EEXIST when the
  // uuid (and id, if given) already match an existing osd.
  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
	ceph_assert(existing_id >= 0);
	id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory whenever any secret is supplied
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
	     << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the two was supplied; they must come together
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
					  cephx_secret,
					  lockbox_secret,
					  cephx_entity,
					  lockbox_entity,
					  ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
	return err;
      } else if (may_be_idempotent && err != EEXIST) {
	ceph_assert(id >= 0);
	ss << "osd." << id << " exists but dm-crypt key does not match.";
	return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
		(!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
				    lockbox_entity,
				    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    // new_state bits are XOR-ed into the map on apply, so recording
    // DESTROYED here clears the destroyed flag
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9663
9664 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9665 {
9666 op->mark_osdmon_event(__func__);
9667 auto m = op->get_req<MMonCommand>();
9668 stringstream ss;
9669 cmdmap_t cmdmap;
9670 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9671 string rs = ss.str();
9672 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9673 return true;
9674 }
9675
9676 MonSession *session = op->get_session();
9677 if (!session) {
9678 derr << __func__ << " no session" << dendl;
9679 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9680 return true;
9681 }
9682
9683 return prepare_command_impl(op, cmdmap);
9684 }
9685
9686 static int parse_reweights(CephContext *cct,
9687 const cmdmap_t& cmdmap,
9688 const OSDMap& osdmap,
9689 map<int32_t, uint32_t>* weights)
9690 {
9691 string weights_str;
9692 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9693 return -EINVAL;
9694 }
9695 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9696 json_spirit::mValue json_value;
9697 if (!json_spirit::read(weights_str, json_value)) {
9698 return -EINVAL;
9699 }
9700 if (json_value.type() != json_spirit::obj_type) {
9701 return -EINVAL;
9702 }
9703 const auto obj = json_value.get_obj();
9704 try {
9705 for (auto& osd_weight : obj) {
9706 auto osd_id = std::stoi(osd_weight.first);
9707 if (!osdmap.exists(osd_id)) {
9708 return -ENOENT;
9709 }
9710 if (osd_weight.second.type() != json_spirit::str_type) {
9711 return -EINVAL;
9712 }
9713 auto weight = std::stoul(osd_weight.second.get_str());
9714 weights->insert({osd_id, weight});
9715 }
9716 } catch (const std::logic_error& e) {
9717 return -EINVAL;
9718 }
9719 return 0;
9720 }
9721
/**
 * Mark osd `id` as destroyed: remove its auth entities and config-key
 * (lockbox) data and flag it DESTROYED in the pending map, while keeping
 * the id allocated so it can later be reused by `osd new`.
 *
 * Requires paxos to be plugged; proposing the pending changes is left to
 * the caller (see note at the end).
 *
 * @return 0 on success (idempotent if already destroyed),
 *         -ENOENT if the osd does not exist,
 *         other negative errno from auth validation.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
	   << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon.authmon()->validate_osd_destroy(id, uuid,
						cephx_entity,
						lockbox_entity,
						ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities are already gone; nothing to remove there
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // the kv service only reports -ENOENT here: keys already gone
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9793
/**
 * Fully purge osd `id`: remove it from crush, destroy its auth/config-key
 * data, and delete it from the osd map.  The osd must not be up; paxos
 * must be plugged, and the caller is responsible for proposing.
 *
 * @return 0 on success, -ENOENT if the osd never existed (idempotent),
 *         other negative errno on failure before any update was made.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal on a local copy of the map
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: no point destroying the osd again if it has already been
  // marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
	     << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: queue removal from the osd map
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: stage the crush update we validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
9861
9862 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9863 const cmdmap_t& cmdmap)
9864 {
9865 op->mark_osdmon_event(__func__);
9866 auto m = op->get_req<MMonCommand>();
9867 bool ret = false;
9868 stringstream ss;
9869 string rs;
9870 bufferlist rdata;
9871 int err = 0;
9872
9873 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
9874 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9875
9876 string prefix;
9877 cmd_getval(cmdmap, "prefix", prefix);
9878
9879 int64_t osdid;
9880 string osd_name;
9881 bool osdid_present = false;
9882 if (prefix != "osd pg-temp" &&
9883 prefix != "osd pg-upmap" &&
9884 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9885 osdid_present = cmd_getval(cmdmap, "id", osdid);
9886 }
9887 if (osdid_present) {
9888 ostringstream oss;
9889 oss << "osd." << osdid;
9890 osd_name = oss.str();
9891 }
9892
9893 // Even if there's a pending state with changes that could affect
9894 // a command, considering that said state isn't yet committed, we
9895 // just don't care about those changes if the command currently being
9896 // handled acts as a no-op against the current committed state.
9897 // In a nutshell, we assume this command happens *before*.
9898 //
9899 // Let me make this clearer:
9900 //
9901 // - If we have only one client, and that client issues some
9902 // operation that would conflict with this operation but is
9903 // still on the pending state, then we would be sure that said
9904 // operation wouldn't have returned yet, so the client wouldn't
9905 // issue this operation (unless the client didn't wait for the
9906 // operation to finish, and that would be the client's own fault).
9907 //
9908 // - If we have more than one client, each client will observe
9909 // whatever is the state at the moment of the commit. So, if we
9910 // have two clients, one issuing an unlink and another issuing a
9911 // link, and if the link happens while the unlink is still on the
9912 // pending state, from the link's point-of-view this is a no-op.
9913 // If different clients are issuing conflicting operations and
9914 // they care about that, then the clients should make sure they
9915 // enforce some kind of concurrency mechanism -- from our
9916 // perspective that's what Douglas Adams would call an SEP.
9917 //
9918 // This should be used as a general guideline for most commands handled
9919 // in this function. Adapt as you see fit, but please bear in mind that
9920 // this is the expected behavior.
9921
9922
9923 if (prefix == "osd setcrushmap" ||
9924 (prefix == "osd crush set" && !osdid_present)) {
9925 if (pending_inc.crush.length()) {
9926 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9927 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9928 return true;
9929 }
9930 dout(10) << "prepare_command setting new crush map" << dendl;
9931 bufferlist data(m->get_data());
9932 CrushWrapper crush;
9933 try {
9934 auto bl = data.cbegin();
9935 crush.decode(bl);
9936 }
9937 catch (const std::exception &e) {
9938 err = -EINVAL;
9939 ss << "Failed to parse crushmap: " << e.what();
9940 goto reply;
9941 }
9942
9943 int64_t prior_version = 0;
9944 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9945 if (prior_version == osdmap.get_crush_version() - 1) {
9946 // see if we are a resend of the last update. this is imperfect
9947 // (multiple racing updaters may not both get reliable success)
9948 // but we expect crush updaters (via this interface) to be rare-ish.
9949 bufferlist current, proposed;
9950 osdmap.crush->encode(current, mon.get_quorum_con_features());
9951 crush.encode(proposed, mon.get_quorum_con_features());
9952 if (current.contents_equal(proposed)) {
9953 dout(10) << __func__
9954 << " proposed matches current and version equals previous"
9955 << dendl;
9956 err = 0;
9957 ss << osdmap.get_crush_version();
9958 goto reply;
9959 }
9960 }
9961 if (prior_version != osdmap.get_crush_version()) {
9962 err = -EPERM;
9963 ss << "prior_version " << prior_version << " != crush version "
9964 << osdmap.get_crush_version();
9965 goto reply;
9966 }
9967 }
9968
9969 if (!validate_crush_against_features(&crush, ss)) {
9970 err = -EINVAL;
9971 goto reply;
9972 }
9973
9974 err = osdmap.validate_crush_rules(&crush, &ss);
9975 if (err < 0) {
9976 goto reply;
9977 }
9978
9979 if (g_conf()->mon_osd_crush_smoke_test) {
9980 // sanity check: test some inputs to make sure this map isn't
9981 // totally broken
9982 dout(10) << " testing map" << dendl;
9983 stringstream ess;
9984 CrushTester tester(crush, ess);
9985 tester.set_min_x(0);
9986 tester.set_max_x(50);
9987 tester.set_num_rep(3); // arbitrary
9988 auto start = ceph::coarse_mono_clock::now();
9989 int r = tester.test_with_fork(g_conf()->mon_lease);
9990 auto duration = ceph::coarse_mono_clock::now() - start;
9991 if (r < 0) {
9992 dout(10) << " tester.test_with_fork returns " << r
9993 << ": " << ess.str() << dendl;
9994 ss << "crush smoke test failed with " << r << ": " << ess.str();
9995 err = r;
9996 goto reply;
9997 }
9998 dout(10) << __func__ << " crush somke test duration: "
9999 << duration << ", result: " << ess.str() << dendl;
10000 }
10001
10002 pending_inc.crush = data;
10003 ss << osdmap.get_crush_version() + 1;
10004 goto update;
10005
10006 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10007 CrushWrapper newcrush = _get_pending_crush();
10008 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10009 int bid = -1 - b;
10010 if (newcrush.bucket_exists(bid) &&
10011 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10012 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10013 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10014 }
10015 }
10016 if (!validate_crush_against_features(&newcrush, ss)) {
10017 err = -EINVAL;
10018 goto reply;
10019 }
10020 pending_inc.crush.clear();
10021 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10023 get_last_committed() + 1));
10024 return true;
10025 } else if (prefix == "osd crush set-device-class") {
10026 string device_class;
10027 if (!cmd_getval(cmdmap, "class", device_class)) {
10028 err = -EINVAL; // no value!
10029 goto reply;
10030 }
10031
10032 bool stop = false;
10033 vector<string> idvec;
10034 cmd_getval(cmdmap, "ids", idvec);
10035 CrushWrapper newcrush = _get_pending_crush();
10036 set<int> updated;
10037 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10038 set<int> osds;
10039 // wildcard?
10040 if (j == 0 &&
10041 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10042 osdmap.get_all_osds(osds);
10043 stop = true;
10044 } else {
10045 // try traditional single osd way
10046 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10047 if (osd < 0) {
10048 // ss has reason for failure
10049 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10050 err = -EINVAL;
10051 continue;
10052 }
10053 osds.insert(osd);
10054 }
10055
10056 for (auto &osd : osds) {
10057 if (!osdmap.exists(osd)) {
10058 ss << "osd." << osd << " does not exist. ";
10059 continue;
10060 }
10061
10062 ostringstream oss;
10063 oss << "osd." << osd;
10064 string name = oss.str();
10065
10066 if (newcrush.get_max_devices() < osd + 1) {
10067 newcrush.set_max_devices(osd + 1);
10068 }
10069 string action;
10070 if (newcrush.item_exists(osd)) {
10071 action = "updating";
10072 } else {
10073 action = "creating";
10074 newcrush.set_item_name(osd, name);
10075 }
10076
10077 dout(5) << action << " crush item id " << osd << " name '" << name
10078 << "' device_class '" << device_class << "'"
10079 << dendl;
10080 err = newcrush.update_device_class(osd, device_class, name, &ss);
10081 if (err < 0) {
10082 goto reply;
10083 }
10084 if (err == 0 && !_have_pending_crush()) {
10085 if (!stop) {
10086 // for single osd only, wildcard makes too much noise
10087 ss << "set-device-class item id " << osd << " name '" << name
10088 << "' device_class '" << device_class << "': no change. ";
10089 }
10090 } else {
10091 updated.insert(osd);
10092 }
10093 }
10094 }
10095
10096 pending_inc.crush.clear();
10097 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10098 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10099 getline(ss, rs);
10100 wait_for_finished_proposal(
10101 op,
10102 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10103 return true;
10104 } else if (prefix == "osd crush rm-device-class") {
10105 bool stop = false;
10106 vector<string> idvec;
10107 cmd_getval(cmdmap, "ids", idvec);
10108 CrushWrapper newcrush = _get_pending_crush();
10109 set<int> updated;
10110
10111 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10112 set<int> osds;
10113
10114 // wildcard?
10115 if (j == 0 &&
10116 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10117 osdmap.get_all_osds(osds);
10118 stop = true;
10119 } else {
10120 // try traditional single osd way
10121 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10122 if (osd < 0) {
10123 // ss has reason for failure
10124 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10125 err = -EINVAL;
10126 goto reply;
10127 }
10128 osds.insert(osd);
10129 }
10130
10131 for (auto &osd : osds) {
10132 if (!osdmap.exists(osd)) {
10133 ss << "osd." << osd << " does not exist. ";
10134 continue;
10135 }
10136
10137 auto class_name = newcrush.get_item_class(osd);
10138 if (!class_name) {
10139 ss << "osd." << osd << " belongs to no class, ";
10140 continue;
10141 }
10142 // note that we do not verify if class_is_in_use here
10143 // in case the device is misclassified and user wants
10144 // to overridely reset...
10145
10146 err = newcrush.remove_device_class(cct, osd, &ss);
10147 if (err < 0) {
10148 // ss has reason for failure
10149 goto reply;
10150 }
10151 updated.insert(osd);
10152 }
10153 }
10154
10155 pending_inc.crush.clear();
10156 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10157 ss << "done removing class of osd(s): " << updated;
10158 getline(ss, rs);
10159 wait_for_finished_proposal(
10160 op,
10161 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10162 return true;
10163 } else if (prefix == "osd crush class create") {
10164 string device_class;
10165 if (!cmd_getval(cmdmap, "class", device_class)) {
10166 err = -EINVAL; // no value!
10167 goto reply;
10168 }
10169 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10170 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10171 << "luminous' before using crush device classes";
10172 err = -EPERM;
10173 goto reply;
10174 }
10175 if (!_have_pending_crush() &&
10176 _get_stable_crush().class_exists(device_class)) {
10177 ss << "class '" << device_class << "' already exists";
10178 goto reply;
10179 }
10180 CrushWrapper newcrush = _get_pending_crush();
10181 if (newcrush.class_exists(device_class)) {
10182 ss << "class '" << device_class << "' already exists";
10183 goto update;
10184 }
10185 int class_id = newcrush.get_or_create_class_id(device_class);
10186 pending_inc.crush.clear();
10187 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10188 ss << "created class " << device_class << " with id " << class_id
10189 << " to crush map";
10190 goto update;
10191 } else if (prefix == "osd crush class rm") {
10192 string device_class;
10193 if (!cmd_getval(cmdmap, "class", device_class)) {
10194 err = -EINVAL; // no value!
10195 goto reply;
10196 }
10197 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10198 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10199 << "luminous' before using crush device classes";
10200 err = -EPERM;
10201 goto reply;
10202 }
10203
10204 if (!osdmap.crush->class_exists(device_class)) {
10205 err = 0;
10206 goto reply;
10207 }
10208
10209 CrushWrapper newcrush = _get_pending_crush();
10210 if (!newcrush.class_exists(device_class)) {
10211 err = 0; // make command idempotent
10212 goto wait;
10213 }
10214 int class_id = newcrush.get_class_id(device_class);
10215 stringstream ts;
10216 if (newcrush.class_is_in_use(class_id, &ts)) {
10217 err = -EBUSY;
10218 ss << "class '" << device_class << "' " << ts.str();
10219 goto reply;
10220 }
10221
10222 // check if class is used by any erasure-code-profiles
10223 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10224 osdmap.get_erasure_code_profiles();
10225 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10226 #ifdef HAVE_STDLIB_MAP_SPLICING
10227 ec_profiles.merge(old_ec_profiles);
10228 #else
10229 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10230 make_move_iterator(end(old_ec_profiles)));
10231 #endif
10232 list<string> referenced_by;
10233 for (auto &i: ec_profiles) {
10234 for (auto &j: i.second) {
10235 if ("crush-device-class" == j.first && device_class == j.second) {
10236 referenced_by.push_back(i.first);
10237 }
10238 }
10239 }
10240 if (!referenced_by.empty()) {
10241 err = -EBUSY;
10242 ss << "class '" << device_class
10243 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10244 goto reply;
10245 }
10246
10247 set<int> osds;
10248 newcrush.get_devices_by_class(device_class, &osds);
10249 for (auto& p: osds) {
10250 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
10251 if (err < 0) {
10252 // ss has reason for failure
10253 goto reply;
10254 }
10255 }
10256
10257 if (osds.empty()) {
10258 // empty class, remove directly
10259 err = newcrush.remove_class_name(device_class);
10260 if (err < 0) {
10261 ss << "class '" << device_class << "' cannot be removed '"
10262 << cpp_strerror(err) << "'";
10263 goto reply;
10264 }
10265 }
10266
10267 pending_inc.crush.clear();
10268 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10269 ss << "removed class " << device_class << " with id " << class_id
10270 << " from crush map";
10271 goto update;
10272 } else if (prefix == "osd crush class rename") {
10273 string srcname, dstname;
10274 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10275 err = -EINVAL;
10276 goto reply;
10277 }
10278 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10279 err = -EINVAL;
10280 goto reply;
10281 }
10282
10283 CrushWrapper newcrush = _get_pending_crush();
10284 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10285 // suppose this is a replay and return success
10286 // so command is idempotent
10287 ss << "already renamed to '" << dstname << "'";
10288 err = 0;
10289 goto reply;
10290 }
10291
10292 err = newcrush.rename_class(srcname, dstname);
10293 if (err < 0) {
10294 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10295 << cpp_strerror(err);
10296 goto reply;
10297 }
10298
10299 pending_inc.crush.clear();
10300 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10301 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10302 goto update;
10303 } else if (prefix == "osd crush add-bucket") {
10304 // os crush add-bucket <name> <type>
10305 string name, typestr;
10306 vector<string> argvec;
10307 cmd_getval(cmdmap, "name", name);
10308 cmd_getval(cmdmap, "type", typestr);
10309 cmd_getval(cmdmap, "args", argvec);
10310 map<string,string> loc;
10311 if (!argvec.empty()) {
10312 CrushWrapper::parse_loc_map(argvec, &loc);
10313 dout(0) << "will create and move bucket '" << name
10314 << "' to location " << loc << dendl;
10315 }
10316
10317 if (!_have_pending_crush() &&
10318 _get_stable_crush().name_exists(name)) {
10319 ss << "bucket '" << name << "' already exists";
10320 goto reply;
10321 }
10322
10323 CrushWrapper newcrush = _get_pending_crush();
10324
10325 if (newcrush.name_exists(name)) {
10326 ss << "bucket '" << name << "' already exists";
10327 goto update;
10328 }
10329 int type = newcrush.get_type_id(typestr);
10330 if (type < 0) {
10331 ss << "type '" << typestr << "' does not exist";
10332 err = -EINVAL;
10333 goto reply;
10334 }
10335 if (type == 0) {
10336 ss << "type '" << typestr << "' is for devices, not buckets";
10337 err = -EINVAL;
10338 goto reply;
10339 }
10340 int bucketno;
10341 err = newcrush.add_bucket(0, 0,
10342 CRUSH_HASH_DEFAULT, type, 0, NULL,
10343 NULL, &bucketno);
10344 if (err < 0) {
10345 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10346 goto reply;
10347 }
10348 err = newcrush.set_item_name(bucketno, name);
10349 if (err < 0) {
10350 ss << "error setting bucket name to '" << name << "'";
10351 goto reply;
10352 }
10353
10354 if (!loc.empty()) {
10355 if (!newcrush.check_item_loc(cct, bucketno, loc,
10356 (int *)NULL)) {
10357 err = newcrush.move_bucket(cct, bucketno, loc);
10358 if (err < 0) {
10359 ss << "error moving bucket '" << name << "' to location " << loc;
10360 goto reply;
10361 }
10362 } else {
10363 ss << "no need to move item id " << bucketno << " name '" << name
10364 << "' to location " << loc << " in crush map";
10365 }
10366 }
10367
10368 pending_inc.crush.clear();
10369 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10370 if (loc.empty()) {
10371 ss << "added bucket " << name << " type " << typestr
10372 << " to crush map";
10373 } else {
10374 ss << "added bucket " << name << " type " << typestr
10375 << " to location " << loc;
10376 }
10377 goto update;
10378 } else if (prefix == "osd crush rename-bucket") {
10379 string srcname, dstname;
10380 cmd_getval(cmdmap, "srcname", srcname);
10381 cmd_getval(cmdmap, "dstname", dstname);
10382
10383 err = crush_rename_bucket(srcname, dstname, &ss);
10384 if (err == -EALREADY) // equivalent to success for idempotency
10385 err = 0;
10386 if (err)
10387 goto reply;
10388 else
10389 goto update;
10390 } else if (prefix == "osd crush weight-set create" ||
10391 prefix == "osd crush weight-set create-compat") {
10392 if (_have_pending_crush()) {
10393 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10394 goto wait;
10395 }
10396 CrushWrapper newcrush = _get_pending_crush();
10397 int64_t pool;
10398 int positions;
10399 if (newcrush.has_non_straw2_buckets()) {
10400 ss << "crush map contains one or more bucket(s) that are not straw2";
10401 err = -EPERM;
10402 goto reply;
10403 }
10404 if (prefix == "osd crush weight-set create") {
10405 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10406 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10407 ss << "require_min_compat_client "
10408 << osdmap.require_min_compat_client
10409 << " < luminous, which is required for per-pool weight-sets. "
10410 << "Try 'ceph osd set-require-min-compat-client luminous' "
10411 << "before using the new interface";
10412 err = -EPERM;
10413 goto reply;
10414 }
10415 string poolname, mode;
10416 cmd_getval(cmdmap, "pool", poolname);
10417 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10418 if (pool < 0) {
10419 ss << "pool '" << poolname << "' not found";
10420 err = -ENOENT;
10421 goto reply;
10422 }
10423 cmd_getval(cmdmap, "mode", mode);
10424 if (mode != "flat" && mode != "positional") {
10425 ss << "unrecognized weight-set mode '" << mode << "'";
10426 err = -EINVAL;
10427 goto reply;
10428 }
10429 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10430 } else {
10431 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10432 positions = 1;
10433 }
10434 if (!newcrush.create_choose_args(pool, positions)) {
10435 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10436 ss << "compat weight-set already created";
10437 } else {
10438 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10439 << "' already created";
10440 }
10441 goto reply;
10442 }
10443 pending_inc.crush.clear();
10444 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10445 goto update;
10446
10447 } else if (prefix == "osd crush weight-set rm" ||
10448 prefix == "osd crush weight-set rm-compat") {
10449 CrushWrapper newcrush = _get_pending_crush();
10450 int64_t pool;
10451 if (prefix == "osd crush weight-set rm") {
10452 string poolname;
10453 cmd_getval(cmdmap, "pool", poolname);
10454 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10455 if (pool < 0) {
10456 ss << "pool '" << poolname << "' not found";
10457 err = -ENOENT;
10458 goto reply;
10459 }
10460 } else {
10461 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10462 }
10463 newcrush.rm_choose_args(pool);
10464 pending_inc.crush.clear();
10465 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10466 goto update;
10467
10468 } else if (prefix == "osd crush weight-set reweight" ||
10469 prefix == "osd crush weight-set reweight-compat") {
10470 string poolname, item;
10471 vector<double> weight;
10472 cmd_getval(cmdmap, "pool", poolname);
10473 cmd_getval(cmdmap, "item", item);
10474 cmd_getval(cmdmap, "weight", weight);
10475 CrushWrapper newcrush = _get_pending_crush();
10476 int64_t pool;
10477 if (prefix == "osd crush weight-set reweight") {
10478 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10479 if (pool < 0) {
10480 ss << "pool '" << poolname << "' not found";
10481 err = -ENOENT;
10482 goto reply;
10483 }
10484 if (!newcrush.have_choose_args(pool)) {
10485 ss << "no weight-set for pool '" << poolname << "'";
10486 err = -ENOENT;
10487 goto reply;
10488 }
10489 auto arg_map = newcrush.choose_args_get(pool);
10490 int positions = newcrush.get_choose_args_positions(arg_map);
10491 if (weight.size() != (size_t)positions) {
10492 ss << "must specify exact " << positions << " weight values";
10493 err = -EINVAL;
10494 goto reply;
10495 }
10496 } else {
10497 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10498 if (!newcrush.have_choose_args(pool)) {
10499 ss << "no backward-compatible weight-set";
10500 err = -ENOENT;
10501 goto reply;
10502 }
10503 }
10504 if (!newcrush.name_exists(item)) {
10505 ss << "item '" << item << "' does not exist";
10506 err = -ENOENT;
10507 goto reply;
10508 }
10509 err = newcrush.choose_args_adjust_item_weightf(
10510 cct,
10511 newcrush.choose_args_get(pool),
10512 newcrush.get_item_id(item),
10513 weight,
10514 &ss);
10515 if (err < 0) {
10516 goto reply;
10517 }
10518 err = 0;
10519 pending_inc.crush.clear();
10520 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10521 goto update;
10522 } else if (osdid_present &&
10523 (prefix == "osd crush set" || prefix == "osd crush add")) {
10524 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10525 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10526 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10527
10528 if (!osdmap.exists(osdid)) {
10529 err = -ENOENT;
10530 ss << osd_name
10531 << " does not exist. Create it before updating the crush map";
10532 goto reply;
10533 }
10534
10535 double weight;
10536 if (!cmd_getval(cmdmap, "weight", weight)) {
10537 ss << "unable to parse weight value '"
10538 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10539 err = -EINVAL;
10540 goto reply;
10541 }
10542
10543 string args;
10544 vector<string> argvec;
10545 cmd_getval(cmdmap, "args", argvec);
10546 map<string,string> loc;
10547 CrushWrapper::parse_loc_map(argvec, &loc);
10548
10549 if (prefix == "osd crush set"
10550 && !_get_stable_crush().item_exists(osdid)) {
10551 err = -ENOENT;
10552 ss << "unable to set item id " << osdid << " name '" << osd_name
10553 << "' weight " << weight << " at location " << loc
10554 << ": does not exist";
10555 goto reply;
10556 }
10557
10558 dout(5) << "adding/updating crush item id " << osdid << " name '"
10559 << osd_name << "' weight " << weight << " at location "
10560 << loc << dendl;
10561 CrushWrapper newcrush = _get_pending_crush();
10562
10563 string action;
10564 if (prefix == "osd crush set" ||
10565 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10566 action = "set";
10567 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10568 } else {
10569 action = "add";
10570 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10571 if (err == 0)
10572 err = 1;
10573 }
10574
10575 if (err < 0)
10576 goto reply;
10577
10578 if (err == 0 && !_have_pending_crush()) {
10579 ss << action << " item id " << osdid << " name '" << osd_name
10580 << "' weight " << weight << " at location " << loc << ": no change";
10581 goto reply;
10582 }
10583
10584 pending_inc.crush.clear();
10585 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10586 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10587 << weight << " at location " << loc << " to crush map";
10588 getline(ss, rs);
10589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10590 get_last_committed() + 1));
10591 return true;
10592
10593 } else if (prefix == "osd crush create-or-move") {
10594 do {
10595 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10596 if (!osdmap.exists(osdid)) {
10597 err = -ENOENT;
10598 ss << osd_name
10599 << " does not exist. create it before updating the crush map";
10600 goto reply;
10601 }
10602
10603 double weight;
10604 if (!cmd_getval(cmdmap, "weight", weight)) {
10605 ss << "unable to parse weight value '"
10606 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10607 err = -EINVAL;
10608 goto reply;
10609 }
10610
10611 string args;
10612 vector<string> argvec;
10613 cmd_getval(cmdmap, "args", argvec);
10614 map<string,string> loc;
10615 CrushWrapper::parse_loc_map(argvec, &loc);
10616
10617 dout(0) << "create-or-move crush item name '" << osd_name
10618 << "' initial_weight " << weight << " at location " << loc
10619 << dendl;
10620
10621 CrushWrapper newcrush = _get_pending_crush();
10622
10623 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10624 g_conf()->osd_crush_update_weight_set);
10625 if (err == 0) {
10626 ss << "create-or-move updated item name '" << osd_name
10627 << "' weight " << weight
10628 << " at location " << loc << " to crush map";
10629 break;
10630 }
10631 if (err > 0) {
10632 pending_inc.crush.clear();
10633 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10634 ss << "create-or-move updating item name '" << osd_name
10635 << "' weight " << weight
10636 << " at location " << loc << " to crush map";
10637 getline(ss, rs);
10638 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10639 get_last_committed() + 1));
10640 return true;
10641 }
10642 } while (false);
10643
10644 } else if (prefix == "osd crush move") {
10645 do {
10646 // osd crush move <name> <loc1> [<loc2> ...]
10647 string name;
10648 vector<string> argvec;
10649 cmd_getval(cmdmap, "name", name);
10650 cmd_getval(cmdmap, "args", argvec);
10651 map<string,string> loc;
10652 CrushWrapper::parse_loc_map(argvec, &loc);
10653
10654 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10655 CrushWrapper newcrush = _get_pending_crush();
10656
10657 if (!newcrush.name_exists(name)) {
10658 err = -ENOENT;
10659 ss << "item " << name << " does not exist";
10660 break;
10661 }
10662 int id = newcrush.get_item_id(name);
10663
10664 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10665 if (id >= 0) {
10666 err = newcrush.create_or_move_item(
10667 cct, id, 0, name, loc,
10668 g_conf()->osd_crush_update_weight_set);
10669 } else {
10670 err = newcrush.move_bucket(cct, id, loc);
10671 }
10672 if (err >= 0) {
10673 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10674 pending_inc.crush.clear();
10675 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10676 getline(ss, rs);
10677 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10678 get_last_committed() + 1));
10679 return true;
10680 }
10681 } else {
10682 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10683 err = 0;
10684 }
10685 } while (false);
10686 } else if (prefix == "osd crush swap-bucket") {
10687 string source, dest;
10688 cmd_getval(cmdmap, "source", source);
10689 cmd_getval(cmdmap, "dest", dest);
10690
10691 bool force = false;
10692 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10693
10694 CrushWrapper newcrush = _get_pending_crush();
10695 if (!newcrush.name_exists(source)) {
10696 ss << "source item " << source << " does not exist";
10697 err = -ENOENT;
10698 goto reply;
10699 }
10700 if (!newcrush.name_exists(dest)) {
10701 ss << "dest item " << dest << " does not exist";
10702 err = -ENOENT;
10703 goto reply;
10704 }
10705 int sid = newcrush.get_item_id(source);
10706 int did = newcrush.get_item_id(dest);
10707 int sparent;
10708 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10709 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10710 err = -EPERM;
10711 goto reply;
10712 }
10713 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10714 !force) {
10715 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10716 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10717 << "; pass --yes-i-really-mean-it to proceed anyway";
10718 err = -EPERM;
10719 goto reply;
10720 }
10721 int r = newcrush.swap_bucket(cct, sid, did);
10722 if (r < 0) {
10723 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10724 err = r;
10725 goto reply;
10726 }
10727 ss << "swapped bucket of " << source << " to " << dest;
10728 pending_inc.crush.clear();
10729 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10730 wait_for_finished_proposal(op,
10731 new Monitor::C_Command(mon, op, err, ss.str(),
10732 get_last_committed() + 1));
10733 return true;
10734 } else if (prefix == "osd crush link") {
10735 // osd crush link <name> <loc1> [<loc2> ...]
10736 string name;
10737 cmd_getval(cmdmap, "name", name);
10738 vector<string> argvec;
10739 cmd_getval(cmdmap, "args", argvec);
10740 map<string,string> loc;
10741 CrushWrapper::parse_loc_map(argvec, &loc);
10742
10743 // Need an explicit check for name_exists because get_item_id returns
10744 // 0 on unfound.
10745 int id = osdmap.crush->get_item_id(name);
10746 if (!osdmap.crush->name_exists(name)) {
10747 err = -ENOENT;
10748 ss << "item " << name << " does not exist";
10749 goto reply;
10750 } else {
10751 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10752 }
10753 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10754 ss << "no need to move item id " << id << " name '" << name
10755 << "' to location " << loc << " in crush map";
10756 err = 0;
10757 goto reply;
10758 }
10759
10760 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10761 CrushWrapper newcrush = _get_pending_crush();
10762
10763 if (!newcrush.name_exists(name)) {
10764 err = -ENOENT;
10765 ss << "item " << name << " does not exist";
10766 goto reply;
10767 } else {
10768 int id = newcrush.get_item_id(name);
10769 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10770 err = newcrush.link_bucket(cct, id, loc);
10771 if (err >= 0) {
10772 ss << "linked item id " << id << " name '" << name
10773 << "' to location " << loc << " in crush map";
10774 pending_inc.crush.clear();
10775 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10776 } else {
10777 ss << "cannot link item id " << id << " name '" << name
10778 << "' to location " << loc;
10779 goto reply;
10780 }
10781 } else {
10782 ss << "no need to move item id " << id << " name '" << name
10783 << "' to location " << loc << " in crush map";
10784 err = 0;
10785 }
10786 }
10787 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10788 get_last_committed() + 1));
10789 return true;
10790 } else if (prefix == "osd crush rm" ||
10791 prefix == "osd crush remove" ||
10792 prefix == "osd crush unlink") {
10793 do {
10794 // osd crush rm <id> [ancestor]
10795 CrushWrapper newcrush = _get_pending_crush();
10796
10797 string name;
10798 cmd_getval(cmdmap, "name", name);
10799
10800 if (!osdmap.crush->name_exists(name)) {
10801 err = 0;
10802 ss << "device '" << name << "' does not appear in the crush map";
10803 break;
10804 }
10805 if (!newcrush.name_exists(name)) {
10806 err = 0;
10807 ss << "device '" << name << "' does not appear in the crush map";
10808 getline(ss, rs);
10809 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10810 get_last_committed() + 1));
10811 return true;
10812 }
10813 int id = newcrush.get_item_id(name);
10814 int ancestor = 0;
10815
10816 bool unlink_only = prefix == "osd crush unlink";
10817 string ancestor_str;
10818 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10819 if (!newcrush.name_exists(ancestor_str)) {
10820 err = -ENOENT;
10821 ss << "ancestor item '" << ancestor_str
10822 << "' does not appear in the crush map";
10823 break;
10824 }
10825 ancestor = newcrush.get_item_id(ancestor_str);
10826 }
10827
10828 err = prepare_command_osd_crush_remove(
10829 newcrush,
10830 id, ancestor,
10831 (ancestor < 0), unlink_only);
10832
10833 if (err == -ENOENT) {
10834 ss << "item " << id << " does not appear in that position";
10835 err = 0;
10836 break;
10837 }
10838 if (err == 0) {
10839 if (!unlink_only)
10840 pending_inc.new_crush_node_flags[id] = 0;
10841 ss << "removed item id " << id << " name '" << name << "' from crush map";
10842 getline(ss, rs);
10843 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10844 get_last_committed() + 1));
10845 return true;
10846 }
10847 } while (false);
10848
10849 } else if (prefix == "osd crush reweight-all") {
10850 CrushWrapper newcrush = _get_pending_crush();
10851
10852 newcrush.reweight(cct);
10853 pending_inc.crush.clear();
10854 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10855 ss << "reweighted crush hierarchy";
10856 getline(ss, rs);
10857 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10858 get_last_committed() + 1));
10859 return true;
10860 } else if (prefix == "osd crush reweight") {
10861 // osd crush reweight <name> <weight>
10862 CrushWrapper newcrush = _get_pending_crush();
10863
10864 string name;
10865 cmd_getval(cmdmap, "name", name);
10866 if (!newcrush.name_exists(name)) {
10867 err = -ENOENT;
10868 ss << "device '" << name << "' does not appear in the crush map";
10869 goto reply;
10870 }
10871
10872 int id = newcrush.get_item_id(name);
10873 if (id < 0) {
10874 ss << "device '" << name << "' is not a leaf in the crush map";
10875 err = -EINVAL;
10876 goto reply;
10877 }
10878 double w;
10879 if (!cmd_getval(cmdmap, "weight", w)) {
10880 ss << "unable to parse weight value '"
10881 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10882 err = -EINVAL;
10883 goto reply;
10884 }
10885
10886 err = newcrush.adjust_item_weightf(cct, id, w,
10887 g_conf()->osd_crush_update_weight_set);
10888 if (err < 0)
10889 goto reply;
10890 pending_inc.crush.clear();
10891 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10892 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10893 << " in crush map";
10894 getline(ss, rs);
10895 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10896 get_last_committed() + 1));
10897 return true;
10898 } else if (prefix == "osd crush reweight-subtree") {
10899 // osd crush reweight-subtree <name> <weight>
10900 CrushWrapper newcrush = _get_pending_crush();
10901
10902 string name;
10903 cmd_getval(cmdmap, "name", name);
10904 if (!newcrush.name_exists(name)) {
10905 err = -ENOENT;
10906 ss << "device '" << name << "' does not appear in the crush map";
10907 goto reply;
10908 }
10909
10910 int id = newcrush.get_item_id(name);
10911 if (id >= 0) {
10912 ss << "device '" << name << "' is not a subtree in the crush map";
10913 err = -EINVAL;
10914 goto reply;
10915 }
10916 double w;
10917 if (!cmd_getval(cmdmap, "weight", w)) {
10918 ss << "unable to parse weight value '"
10919 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10920 err = -EINVAL;
10921 goto reply;
10922 }
10923
10924 err = newcrush.adjust_subtree_weightf(cct, id, w,
10925 g_conf()->osd_crush_update_weight_set);
10926 if (err < 0)
10927 goto reply;
10928 pending_inc.crush.clear();
10929 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10930 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10931 << " in crush map";
10932 getline(ss, rs);
10933 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10934 get_last_committed() + 1));
10935 return true;
10936 } else if (prefix == "osd crush tunables") {
10937 CrushWrapper newcrush = _get_pending_crush();
10938
10939 err = 0;
10940 string profile;
10941 cmd_getval(cmdmap, "profile", profile);
10942 if (profile == "legacy" || profile == "argonaut") {
10943 newcrush.set_tunables_legacy();
10944 } else if (profile == "bobtail") {
10945 newcrush.set_tunables_bobtail();
10946 } else if (profile == "firefly") {
10947 newcrush.set_tunables_firefly();
10948 } else if (profile == "hammer") {
10949 newcrush.set_tunables_hammer();
10950 } else if (profile == "jewel") {
10951 newcrush.set_tunables_jewel();
10952 } else if (profile == "optimal") {
10953 newcrush.set_tunables_optimal();
10954 } else if (profile == "default") {
10955 newcrush.set_tunables_default();
10956 } else {
10957 ss << "unrecognized profile '" << profile << "'";
10958 err = -EINVAL;
10959 goto reply;
10960 }
10961
10962 if (!validate_crush_against_features(&newcrush, ss)) {
10963 err = -EINVAL;
10964 goto reply;
10965 }
10966
10967 pending_inc.crush.clear();
10968 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10969 ss << "adjusted tunables profile to " << profile;
10970 getline(ss, rs);
10971 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10972 get_last_committed() + 1));
10973 return true;
10974 } else if (prefix == "osd crush set-tunable") {
10975 CrushWrapper newcrush = _get_pending_crush();
10976
10977 err = 0;
10978 string tunable;
10979 cmd_getval(cmdmap, "tunable", tunable);
10980
10981 int64_t value = -1;
10982 if (!cmd_getval(cmdmap, "value", value)) {
10983 err = -EINVAL;
10984 ss << "failed to parse integer value "
10985 << cmd_vartype_stringify(cmdmap.at("value"));
10986 goto reply;
10987 }
10988
10989 if (tunable == "straw_calc_version") {
10990 if (value != 0 && value != 1) {
10991 ss << "value must be 0 or 1; got " << value;
10992 err = -EINVAL;
10993 goto reply;
10994 }
10995 newcrush.set_straw_calc_version(value);
10996 } else {
10997 ss << "unrecognized tunable '" << tunable << "'";
10998 err = -EINVAL;
10999 goto reply;
11000 }
11001
11002 if (!validate_crush_against_features(&newcrush, ss)) {
11003 err = -EINVAL;
11004 goto reply;
11005 }
11006
11007 pending_inc.crush.clear();
11008 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11009 ss << "adjusted tunable " << tunable << " to " << value;
11010 getline(ss, rs);
11011 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11012 get_last_committed() + 1));
11013 return true;
11014
11015 } else if (prefix == "osd crush rule create-simple") {
11016 string name, root, type, mode;
11017 cmd_getval(cmdmap, "name", name);
11018 cmd_getval(cmdmap, "root", root);
11019 cmd_getval(cmdmap, "type", type);
11020 cmd_getval(cmdmap, "mode", mode);
11021 if (mode == "")
11022 mode = "firstn";
11023
11024 if (osdmap.crush->rule_exists(name)) {
11025 // The name is uniquely associated to a ruleid and the rule it contains
11026 // From the user point of view, the rule is more meaningful.
11027 ss << "rule " << name << " already exists";
11028 err = 0;
11029 goto reply;
11030 }
11031
11032 CrushWrapper newcrush = _get_pending_crush();
11033
11034 if (newcrush.rule_exists(name)) {
11035 // The name is uniquely associated to a ruleid and the rule it contains
11036 // From the user point of view, the rule is more meaningful.
11037 ss << "rule " << name << " already exists";
11038 err = 0;
11039 } else {
11040 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11041 pg_pool_t::TYPE_REPLICATED, &ss);
11042 if (ruleno < 0) {
11043 err = ruleno;
11044 goto reply;
11045 }
11046
11047 pending_inc.crush.clear();
11048 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11049 }
11050 getline(ss, rs);
11051 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11052 get_last_committed() + 1));
11053 return true;
11054
11055 } else if (prefix == "osd crush rule create-replicated") {
11056 string name, root, type, device_class;
11057 cmd_getval(cmdmap, "name", name);
11058 cmd_getval(cmdmap, "root", root);
11059 cmd_getval(cmdmap, "type", type);
11060 cmd_getval(cmdmap, "class", device_class);
11061
11062 if (osdmap.crush->rule_exists(name)) {
11063 // The name is uniquely associated to a ruleid and the rule it contains
11064 // From the user point of view, the rule is more meaningful.
11065 ss << "rule " << name << " already exists";
11066 err = 0;
11067 goto reply;
11068 }
11069
11070 CrushWrapper newcrush = _get_pending_crush();
11071
11072 if (newcrush.rule_exists(name)) {
11073 // The name is uniquely associated to a ruleid and the rule it contains
11074 // From the user point of view, the rule is more meaningful.
11075 ss << "rule " << name << " already exists";
11076 err = 0;
11077 } else {
11078 int ruleno = newcrush.add_simple_rule(
11079 name, root, type, device_class,
11080 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11081 if (ruleno < 0) {
11082 err = ruleno;
11083 goto reply;
11084 }
11085
11086 pending_inc.crush.clear();
11087 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11088 }
11089 getline(ss, rs);
11090 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11091 get_last_committed() + 1));
11092 return true;
11093
11094 } else if (prefix == "osd erasure-code-profile rm") {
11095 string name;
11096 cmd_getval(cmdmap, "name", name);
11097
11098 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11099 goto wait;
11100
11101 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11102 err = -EBUSY;
11103 goto reply;
11104 }
11105
11106 if (osdmap.has_erasure_code_profile(name) ||
11107 pending_inc.new_erasure_code_profiles.count(name)) {
11108 if (osdmap.has_erasure_code_profile(name)) {
11109 pending_inc.old_erasure_code_profiles.push_back(name);
11110 } else {
11111 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11112 pending_inc.new_erasure_code_profiles.erase(name);
11113 }
11114
11115 getline(ss, rs);
11116 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11117 get_last_committed() + 1));
11118 return true;
11119 } else {
11120 ss << "erasure-code-profile " << name << " does not exist";
11121 err = 0;
11122 goto reply;
11123 }
11124
11125 } else if (prefix == "osd erasure-code-profile set") {
11126 string name;
11127 cmd_getval(cmdmap, "name", name);
11128 vector<string> profile;
11129 cmd_getval(cmdmap, "profile", profile);
11130
11131 bool force = false;
11132 cmd_getval(cmdmap, "force", force);
11133
11134 map<string,string> profile_map;
11135 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11136 if (err)
11137 goto reply;
11138 if (auto found = profile_map.find("crush-failure-domain");
11139 found != profile_map.end()) {
11140 const auto& failure_domain = found->second;
11141 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11142 if (failure_domain_type < 0) {
11143 ss << "erasure-code-profile " << profile_map
11144 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11145 err = -EINVAL;
11146 goto reply;
11147 }
11148 }
11149
11150 if (profile_map.find("plugin") == profile_map.end()) {
11151 ss << "erasure-code-profile " << profile_map
11152 << " must contain a plugin entry" << std::endl;
11153 err = -EINVAL;
11154 goto reply;
11155 }
11156 string plugin = profile_map["plugin"];
11157
11158 if (pending_inc.has_erasure_code_profile(name)) {
11159 dout(20) << "erasure code profile " << name << " try again" << dendl;
11160 goto wait;
11161 } else {
11162 err = normalize_profile(name, profile_map, force, &ss);
11163 if (err)
11164 goto reply;
11165
11166 if (osdmap.has_erasure_code_profile(name)) {
11167 ErasureCodeProfile existing_profile_map =
11168 osdmap.get_erasure_code_profile(name);
11169 err = normalize_profile(name, existing_profile_map, force, &ss);
11170 if (err)
11171 goto reply;
11172
11173 if (existing_profile_map == profile_map) {
11174 err = 0;
11175 goto reply;
11176 }
11177 if (!force) {
11178 err = -EPERM;
11179 ss << "will not override erasure code profile " << name
11180 << " because the existing profile "
11181 << existing_profile_map
11182 << " is different from the proposed profile "
11183 << profile_map;
11184 goto reply;
11185 }
11186 }
11187
11188 dout(20) << "erasure code profile set " << name << "="
11189 << profile_map << dendl;
11190 pending_inc.set_erasure_code_profile(name, profile_map);
11191 }
11192
11193 getline(ss, rs);
11194 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11195 get_last_committed() + 1));
11196 return true;
11197
11198 } else if (prefix == "osd crush rule create-erasure") {
11199 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11200 if (err == -EAGAIN)
11201 goto wait;
11202 if (err)
11203 goto reply;
11204 string name, poolstr;
11205 cmd_getval(cmdmap, "name", name);
11206 string profile;
11207 cmd_getval(cmdmap, "profile", profile);
11208 if (profile == "")
11209 profile = "default";
11210 if (profile == "default") {
11211 if (!osdmap.has_erasure_code_profile(profile)) {
11212 if (pending_inc.has_erasure_code_profile(profile)) {
11213 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11214 goto wait;
11215 }
11216
11217 map<string,string> profile_map;
11218 err = osdmap.get_erasure_code_profile_default(cct,
11219 profile_map,
11220 &ss);
11221 if (err)
11222 goto reply;
11223 err = normalize_profile(name, profile_map, true, &ss);
11224 if (err)
11225 goto reply;
11226 dout(20) << "erasure code profile set " << profile << "="
11227 << profile_map << dendl;
11228 pending_inc.set_erasure_code_profile(profile, profile_map);
11229 goto wait;
11230 }
11231 }
11232
11233 int rule;
11234 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11235 if (err < 0) {
11236 switch(err) {
11237 case -EEXIST: // return immediately
11238 ss << "rule " << name << " already exists";
11239 err = 0;
11240 goto reply;
11241 break;
11242 case -EALREADY: // wait for pending to be proposed
11243 ss << "rule " << name << " already exists";
11244 err = 0;
11245 break;
11246 default: // non recoverable error
11247 goto reply;
11248 break;
11249 }
11250 } else {
11251 ss << "created rule " << name << " at " << rule;
11252 }
11253
11254 getline(ss, rs);
11255 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11256 get_last_committed() + 1));
11257 return true;
11258
11259 } else if (prefix == "osd crush rule rm") {
11260 string name;
11261 cmd_getval(cmdmap, "name", name);
11262
11263 if (!osdmap.crush->rule_exists(name)) {
11264 ss << "rule " << name << " does not exist";
11265 err = 0;
11266 goto reply;
11267 }
11268
11269 CrushWrapper newcrush = _get_pending_crush();
11270
11271 if (!newcrush.rule_exists(name)) {
11272 ss << "rule " << name << " does not exist";
11273 err = 0;
11274 } else {
11275 int ruleno = newcrush.get_rule_id(name);
11276 ceph_assert(ruleno >= 0);
11277
11278 // make sure it is not in use.
11279 // FIXME: this is ok in some situations, but let's not bother with that
11280 // complexity now.
11281 if (osdmap.crush_rule_in_use(ruleno)) {
11282 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11283 err = -EBUSY;
11284 goto reply;
11285 }
11286
11287 err = newcrush.remove_rule(ruleno);
11288 if (err < 0) {
11289 goto reply;
11290 }
11291
11292 pending_inc.crush.clear();
11293 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11294 }
11295 getline(ss, rs);
11296 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11297 get_last_committed() + 1));
11298 return true;
11299
11300 } else if (prefix == "osd crush rule rename") {
11301 string srcname;
11302 string dstname;
11303 cmd_getval(cmdmap, "srcname", srcname);
11304 cmd_getval(cmdmap, "dstname", dstname);
11305 if (srcname.empty() || dstname.empty()) {
11306 ss << "must specify both source rule name and destination rule name";
11307 err = -EINVAL;
11308 goto reply;
11309 }
11310 if (srcname == dstname) {
11311 ss << "destination rule name is equal to source rule name";
11312 err = 0;
11313 goto reply;
11314 }
11315
11316 CrushWrapper newcrush = _get_pending_crush();
11317 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11318 // srcname does not exist and dstname already exists
11319 // suppose this is a replay and return success
11320 // (so this command is idempotent)
11321 ss << "already renamed to '" << dstname << "'";
11322 err = 0;
11323 goto reply;
11324 }
11325
11326 err = newcrush.rename_rule(srcname, dstname, &ss);
11327 if (err < 0) {
11328 // ss has reason for failure
11329 goto reply;
11330 }
11331 pending_inc.crush.clear();
11332 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11333 getline(ss, rs);
11334 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11335 get_last_committed() + 1));
11336 return true;
11337
11338 } else if (prefix == "osd setmaxosd") {
11339 int64_t newmax;
11340 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11341 ss << "unable to parse 'newmax' value '"
11342 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11343 err = -EINVAL;
11344 goto reply;
11345 }
11346
11347 if (newmax > g_conf()->mon_max_osd) {
11348 err = -ERANGE;
11349 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11350 << g_conf()->mon_max_osd << ")";
11351 goto reply;
11352 }
11353
11354 // Don't allow shrinking OSD number as this will cause data loss
11355 // and may cause kernel crashes.
11356 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11357 if (newmax < osdmap.get_max_osd()) {
11358 // Check if the OSDs exist between current max and new value.
11359 // If any OSDs exist in that range, don't allow shrinking the number
11360 // of OSDs.
11361 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11362 if (osdmap.exists(i)) {
11363 err = -EBUSY;
11364 ss << "cannot shrink max_osd to " << newmax
11365 << " because osd." << i << " (and possibly others) still in use";
11366 goto reply;
11367 }
11368 }
11369 }
11370
11371 pending_inc.new_max_osd = newmax;
11372 ss << "set new max_osd = " << pending_inc.new_max_osd;
11373 getline(ss, rs);
11374 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11375 get_last_committed() + 1));
11376 return true;
11377
11378 } else if (prefix == "osd set-full-ratio" ||
11379 prefix == "osd set-backfillfull-ratio" ||
11380 prefix == "osd set-nearfull-ratio") {
11381 double n;
11382 if (!cmd_getval(cmdmap, "ratio", n)) {
11383 ss << "unable to parse 'ratio' value '"
11384 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11385 err = -EINVAL;
11386 goto reply;
11387 }
11388 if (prefix == "osd set-full-ratio")
11389 pending_inc.new_full_ratio = n;
11390 else if (prefix == "osd set-backfillfull-ratio")
11391 pending_inc.new_backfillfull_ratio = n;
11392 else if (prefix == "osd set-nearfull-ratio")
11393 pending_inc.new_nearfull_ratio = n;
11394 ss << prefix << " " << n;
11395 getline(ss, rs);
11396 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11397 get_last_committed() + 1));
11398 return true;
11399 } else if (prefix == "osd set-require-min-compat-client") {
11400 string v;
11401 cmd_getval(cmdmap, "version", v);
11402 ceph_release_t vno = ceph_release_from_name(v);
11403 if (!vno) {
11404 ss << "version " << v << " is not recognized";
11405 err = -EINVAL;
11406 goto reply;
11407 }
11408 OSDMap newmap;
11409 newmap.deepish_copy_from(osdmap);
11410 newmap.apply_incremental(pending_inc);
11411 newmap.require_min_compat_client = vno;
11412 auto mvno = newmap.get_min_compat_client();
11413 if (vno < mvno) {
11414 ss << "osdmap current utilizes features that require " << mvno
11415 << "; cannot set require_min_compat_client below that to " << vno;
11416 err = -EPERM;
11417 goto reply;
11418 }
11419 bool sure = false;
11420 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11421 if (!sure) {
11422 FeatureMap m;
11423 mon.get_combined_feature_map(&m);
11424 uint64_t features = ceph_release_features(to_integer<int>(vno));
11425 bool first = true;
11426 bool ok = true;
11427 for (int type : {
11428 CEPH_ENTITY_TYPE_CLIENT,
11429 CEPH_ENTITY_TYPE_MDS,
11430 CEPH_ENTITY_TYPE_MGR }) {
11431 auto p = m.m.find(type);
11432 if (p == m.m.end()) {
11433 continue;
11434 }
11435 for (auto& q : p->second) {
11436 uint64_t missing = ~q.first & features;
11437 if (missing) {
11438 if (first) {
11439 ss << "cannot set require_min_compat_client to " << v << ": ";
11440 } else {
11441 ss << "; ";
11442 }
11443 first = false;
11444 ss << q.second << " connected " << ceph_entity_type_name(type)
11445 << "(s) look like " << ceph_release_name(
11446 ceph_release_from_features(q.first))
11447 << " (missing 0x" << std::hex << missing << std::dec << ")";
11448 ok = false;
11449 }
11450 }
11451 }
11452 if (!ok) {
11453 ss << "; add --yes-i-really-mean-it to do it anyway";
11454 err = -EPERM;
11455 goto reply;
11456 }
11457 }
11458 ss << "set require_min_compat_client to " << vno;
11459 pending_inc.new_require_min_compat_client = vno;
11460 getline(ss, rs);
11461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11462 get_last_committed() + 1));
11463 return true;
11464 } else if (prefix == "osd pause") {
11465 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11466
11467 } else if (prefix == "osd unpause") {
11468 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11469
11470 } else if (prefix == "osd set") {
11471 bool sure = false;
11472 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11473
11474 string key;
11475 cmd_getval(cmdmap, "key", key);
11476 if (key == "pause")
11477 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11478 else if (key == "noup")
11479 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11480 else if (key == "nodown")
11481 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11482 else if (key == "noout")
11483 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11484 else if (key == "noin")
11485 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11486 else if (key == "nobackfill")
11487 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11488 else if (key == "norebalance")
11489 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11490 else if (key == "norecover")
11491 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11492 else if (key == "noscrub")
11493 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11494 else if (key == "nodeep-scrub")
11495 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11496 else if (key == "notieragent")
11497 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11498 else if (key == "nosnaptrim")
11499 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11500 else if (key == "pglog_hardlimit") {
11501 if (!osdmap.get_num_up_osds() && !sure) {
11502 ss << "Not advisable to continue since no OSDs are up. Pass "
11503 << "--yes-i-really-mean-it if you really wish to continue.";
11504 err = -EPERM;
11505 goto reply;
11506 }
11507 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11508 // we are reusing a jewel feature bit that was retired in luminous.
11509 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11510 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11511 || sure)) {
11512 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11513 } else {
11514 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11515 err = -EPERM;
11516 goto reply;
11517 }
11518 } else {
11519 ss << "unrecognized flag '" << key << "'";
11520 err = -EINVAL;
11521 }
11522
11523 } else if (prefix == "osd unset") {
11524 string key;
11525 cmd_getval(cmdmap, "key", key);
11526 if (key == "pause")
11527 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11528 else if (key == "noup")
11529 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11530 else if (key == "nodown")
11531 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11532 else if (key == "noout")
11533 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11534 else if (key == "noin")
11535 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11536 else if (key == "nobackfill")
11537 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11538 else if (key == "norebalance")
11539 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11540 else if (key == "norecover")
11541 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11542 else if (key == "noscrub")
11543 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11544 else if (key == "nodeep-scrub")
11545 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11546 else if (key == "notieragent")
11547 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11548 else if (key == "nosnaptrim")
11549 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11550 else {
11551 ss << "unrecognized flag '" << key << "'";
11552 err = -EINVAL;
11553 }
11554
11555 } else if (prefix == "osd require-osd-release") {
11556 string release;
11557 cmd_getval(cmdmap, "release", release);
11558 bool sure = false;
11559 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11560 ceph_release_t rel = ceph_release_from_name(release.c_str());
11561 if (!rel) {
11562 ss << "unrecognized release " << release;
11563 err = -EINVAL;
11564 goto reply;
11565 }
11566 if (rel == osdmap.require_osd_release) {
11567 // idempotent
11568 err = 0;
11569 goto reply;
11570 }
11571 ceph_assert(osdmap.require_osd_release >= ceph_release_t::octopus);
11572 if (!osdmap.get_num_up_osds() && !sure) {
11573 ss << "Not advisable to continue since no OSDs are up. Pass "
11574 << "--yes-i-really-mean-it if you really wish to continue.";
11575 err = -EPERM;
11576 goto reply;
11577 }
11578 if (rel == ceph_release_t::octopus) {
11579 if (!mon.monmap->get_required_features().contains_all(
11580 ceph::features::mon::FEATURE_OCTOPUS)) {
11581 ss << "not all mons are octopus";
11582 err = -EPERM;
11583 goto reply;
11584 }
11585 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11586 && !sure) {
11587 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11588 err = -EPERM;
11589 goto reply;
11590 }
11591 } else if (rel == ceph_release_t::pacific) {
11592 if (!mon.monmap->get_required_features().contains_all(
11593 ceph::features::mon::FEATURE_PACIFIC)) {
11594 ss << "not all mons are pacific";
11595 err = -EPERM;
11596 goto reply;
11597 }
11598 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11599 && !sure) {
11600 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11601 err = -EPERM;
11602 goto reply;
11603 }
11604 } else if (rel == ceph_release_t::quincy) {
11605 if (!mon.monmap->get_required_features().contains_all(
11606 ceph::features::mon::FEATURE_QUINCY)) {
11607 ss << "not all mons are quincy";
11608 err = -EPERM;
11609 goto reply;
11610 }
11611 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
11612 && !sure) {
11613 ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11614 err = -EPERM;
11615 goto reply;
11616 }
11617 } else {
11618 ss << "not supported for this release";
11619 err = -EPERM;
11620 goto reply;
11621 }
11622 if (rel < osdmap.require_osd_release) {
11623 ss << "require_osd_release cannot be lowered once it has been set";
11624 err = -EPERM;
11625 goto reply;
11626 }
11627 pending_inc.new_require_osd_release = rel;
11628 goto update;
11629 } else if (prefix == "osd down" ||
11630 prefix == "osd out" ||
11631 prefix == "osd in" ||
11632 prefix == "osd rm" ||
11633 prefix == "osd stop") {
11634
11635 bool any = false;
11636 bool stop = false;
11637 bool verbose = true;
11638 bool definitely_dead = false;
11639
11640 vector<string> idvec;
11641 cmd_getval(cmdmap, "ids", idvec);
11642 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11643 derr << "definitely_dead " << (int)definitely_dead << dendl;
11644 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11645 set<int> osds;
11646
11647 // wildcard?
11648 if (j == 0 &&
11649 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11650 if (prefix == "osd in") {
11651 // touch out osds only
11652 osdmap.get_out_existing_osds(osds);
11653 } else {
11654 osdmap.get_all_osds(osds);
11655 }
11656 stop = true;
11657 verbose = false; // so the output is less noisy.
11658 } else {
11659 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11660 if (osd < 0) {
11661 ss << "invalid osd id" << osd;
11662 err = -EINVAL;
11663 continue;
11664 } else if (!osdmap.exists(osd)) {
11665 ss << "osd." << osd << " does not exist. ";
11666 continue;
11667 }
11668
11669 osds.insert(osd);
11670 }
11671
11672 for (auto &osd : osds) {
11673 if (prefix == "osd down") {
11674 if (osdmap.is_down(osd)) {
11675 if (verbose)
11676 ss << "osd." << osd << " is already down. ";
11677 } else {
11678 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11679 ss << "marked down osd." << osd << ". ";
11680 any = true;
11681 }
11682 if (definitely_dead) {
11683 if (!pending_inc.new_xinfo.count(osd)) {
11684 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11685 }
11686 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11687 any = true;
11688 }
11689 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11690 }
11691 } else if (prefix == "osd out") {
11692 if (osdmap.is_out(osd)) {
11693 if (verbose)
11694 ss << "osd." << osd << " is already out. ";
11695 } else {
11696 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11697 if (osdmap.osd_weight[osd]) {
11698 if (pending_inc.new_xinfo.count(osd) == 0) {
11699 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11700 }
11701 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11702 }
11703 ss << "marked out osd." << osd << ". ";
11704 std::ostringstream msg;
11705 msg << "Client " << op->get_session()->entity_name
11706 << " marked osd." << osd << " out";
11707 if (osdmap.is_up(osd)) {
11708 msg << ", while it was still marked up";
11709 } else {
11710 auto period = ceph_clock_now() - down_pending_out[osd];
11711 msg << ", after it was down for " << int(period.sec())
11712 << " seconds";
11713 }
11714
11715 mon.clog->info() << msg.str();
11716 any = true;
11717 }
11718 } else if (prefix == "osd in") {
11719 if (osdmap.is_in(osd)) {
11720 if (verbose)
11721 ss << "osd." << osd << " is already in. ";
11722 } else {
11723 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11724 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11725 if (pending_inc.new_xinfo.count(osd) == 0) {
11726 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11727 }
11728 pending_inc.new_xinfo[osd].old_weight = 0;
11729 } else {
11730 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11731 }
11732 ss << "marked in osd." << osd << ". ";
11733 any = true;
11734 }
11735 } else if (prefix == "osd rm") {
11736 err = prepare_command_osd_remove(osd);
11737
11738 if (err == -EBUSY) {
11739 if (any)
11740 ss << ", ";
11741 ss << "osd." << osd << " is still up; must be down before removal. ";
11742 } else {
11743 ceph_assert(err == 0);
11744 if (any) {
11745 ss << ", osd." << osd;
11746 } else {
11747 ss << "removed osd." << osd;
11748 }
11749 any = true;
11750 }
11751 } else if (prefix == "osd stop") {
11752 if (osdmap.is_stop(osd)) {
11753 if (verbose)
11754 ss << "osd." << osd << " is already stopped. ";
11755 } else if (osdmap.is_down(osd)) {
11756 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11757 ss << "stop down osd." << osd << ". ";
11758 any = true;
11759 } else {
11760 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11761 ss << "stop osd." << osd << ". ";
11762 any = true;
11763 }
11764 }
11765 }
11766 }
11767 if (any) {
11768 getline(ss, rs);
11769 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11770 get_last_committed() + 1));
11771 return true;
11772 }
11773 } else if (prefix == "osd set-group" ||
11774 prefix == "osd unset-group" ||
11775 prefix == "osd add-noup" ||
11776 prefix == "osd add-nodown" ||
11777 prefix == "osd add-noin" ||
11778 prefix == "osd add-noout" ||
11779 prefix == "osd rm-noup" ||
11780 prefix == "osd rm-nodown" ||
11781 prefix == "osd rm-noin" ||
11782 prefix == "osd rm-noout") {
11783 bool do_set = prefix == "osd set-group" ||
11784 prefix.find("add") != string::npos;
11785 string flag_str;
11786 unsigned flags = 0;
11787 vector<string> who;
11788 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11789 cmd_getval(cmdmap, "flags", flag_str);
11790 cmd_getval(cmdmap, "who", who);
11791 vector<string> raw_flags;
11792 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11793 for (auto& f : raw_flags) {
11794 if (f == "noup")
11795 flags |= CEPH_OSD_NOUP;
11796 else if (f == "nodown")
11797 flags |= CEPH_OSD_NODOWN;
11798 else if (f == "noin")
11799 flags |= CEPH_OSD_NOIN;
11800 else if (f == "noout")
11801 flags |= CEPH_OSD_NOOUT;
11802 else {
11803 ss << "unrecognized flag '" << f << "', must be one of "
11804 << "{noup,nodown,noin,noout}";
11805 err = -EINVAL;
11806 goto reply;
11807 }
11808 }
11809 } else {
11810 cmd_getval(cmdmap, "ids", who);
11811 if (prefix.find("noup") != string::npos)
11812 flags = CEPH_OSD_NOUP;
11813 else if (prefix.find("nodown") != string::npos)
11814 flags = CEPH_OSD_NODOWN;
11815 else if (prefix.find("noin") != string::npos)
11816 flags = CEPH_OSD_NOIN;
11817 else if (prefix.find("noout") != string::npos)
11818 flags = CEPH_OSD_NOOUT;
11819 else
11820 ceph_assert(0 == "Unreachable!");
11821 }
11822 if (flags == 0) {
11823 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11824 err = -EINVAL;
11825 goto reply;
11826 }
11827 if (who.empty()) {
11828 ss << "must specify at least one or more targets to set/unset";
11829 err = -EINVAL;
11830 goto reply;
11831 }
11832 set<int> osds;
11833 set<int> crush_nodes;
11834 set<int> device_classes;
11835 for (auto& w : who) {
11836 if (w == "any" || w == "all" || w == "*") {
11837 osdmap.get_all_osds(osds);
11838 break;
11839 }
11840 std::stringstream ts;
11841 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11842 osds.insert(osd);
11843 } else if (osdmap.crush->name_exists(w)) {
11844 crush_nodes.insert(osdmap.crush->get_item_id(w));
11845 } else if (osdmap.crush->class_exists(w)) {
11846 device_classes.insert(osdmap.crush->get_class_id(w));
11847 } else {
11848 ss << "unable to parse osd id or crush node or device class: "
11849 << "\"" << w << "\". ";
11850 }
11851 }
11852 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11853 // ss has reason for failure
11854 err = -EINVAL;
11855 goto reply;
11856 }
11857 bool any = false;
11858 for (auto osd : osds) {
11859 if (!osdmap.exists(osd)) {
11860 ss << "osd." << osd << " does not exist. ";
11861 continue;
11862 }
11863 if (do_set) {
11864 if (flags & CEPH_OSD_NOUP) {
11865 any |= osdmap.is_noup_by_osd(osd) ?
11866 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11867 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11868 }
11869 if (flags & CEPH_OSD_NODOWN) {
11870 any |= osdmap.is_nodown_by_osd(osd) ?
11871 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11872 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11873 }
11874 if (flags & CEPH_OSD_NOIN) {
11875 any |= osdmap.is_noin_by_osd(osd) ?
11876 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11877 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11878 }
11879 if (flags & CEPH_OSD_NOOUT) {
11880 any |= osdmap.is_noout_by_osd(osd) ?
11881 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11882 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11883 }
11884 } else {
11885 if (flags & CEPH_OSD_NOUP) {
11886 any |= osdmap.is_noup_by_osd(osd) ?
11887 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11888 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11889 }
11890 if (flags & CEPH_OSD_NODOWN) {
11891 any |= osdmap.is_nodown_by_osd(osd) ?
11892 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11893 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11894 }
11895 if (flags & CEPH_OSD_NOIN) {
11896 any |= osdmap.is_noin_by_osd(osd) ?
11897 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11898 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11899 }
11900 if (flags & CEPH_OSD_NOOUT) {
11901 any |= osdmap.is_noout_by_osd(osd) ?
11902 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11903 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11904 }
11905 }
11906 }
11907 for (auto& id : crush_nodes) {
11908 auto old_flags = osdmap.get_crush_node_flags(id);
11909 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11910 pending_flags |= old_flags; // adopt existing flags first!
11911 if (do_set) {
11912 pending_flags |= flags;
11913 } else {
11914 pending_flags &= ~flags;
11915 }
11916 any = true;
11917 }
11918 for (auto& id : device_classes) {
11919 auto old_flags = osdmap.get_device_class_flags(id);
11920 auto& pending_flags = pending_inc.new_device_class_flags[id];
11921 pending_flags |= old_flags;
11922 if (do_set) {
11923 pending_flags |= flags;
11924 } else {
11925 pending_flags &= ~flags;
11926 }
11927 any = true;
11928 }
11929 if (any) {
11930 getline(ss, rs);
11931 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11932 get_last_committed() + 1));
11933 return true;
11934 }
11935 } else if (prefix == "osd pg-temp") {
11936 string pgidstr;
11937 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11938 ss << "unable to parse 'pgid' value '"
11939 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11940 err = -EINVAL;
11941 goto reply;
11942 }
11943 pg_t pgid;
11944 if (!pgid.parse(pgidstr.c_str())) {
11945 ss << "invalid pgid '" << pgidstr << "'";
11946 err = -EINVAL;
11947 goto reply;
11948 }
11949 if (!osdmap.pg_exists(pgid)) {
11950 ss << "pg " << pgid << " does not exist";
11951 err = -ENOENT;
11952 goto reply;
11953 }
11954 if (pending_inc.new_pg_temp.count(pgid)) {
11955 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11956 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11957 return true;
11958 }
11959
11960 vector<int64_t> id_vec;
11961 vector<int32_t> new_pg_temp;
11962 cmd_getval(cmdmap, "id", id_vec);
11963 if (id_vec.empty()) {
11964 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11965 ss << "done cleaning up pg_temp of " << pgid;
11966 goto update;
11967 }
11968 for (auto osd : id_vec) {
11969 if (!osdmap.exists(osd)) {
11970 ss << "osd." << osd << " does not exist";
11971 err = -ENOENT;
11972 goto reply;
11973 }
11974 new_pg_temp.push_back(osd);
11975 }
11976
11977 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11978 if ((int)new_pg_temp.size() < pool_min_size) {
11979 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11980 << pool_min_size << ")";
11981 err = -EINVAL;
11982 goto reply;
11983 }
11984
11985 int pool_size = osdmap.get_pg_pool_size(pgid);
11986 if ((int)new_pg_temp.size() > pool_size) {
11987 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11988 << pool_size << ")";
11989 err = -EINVAL;
11990 goto reply;
11991 }
11992
11993 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11994 new_pg_temp.begin(), new_pg_temp.end());
11995 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11996 goto update;
11997 } else if (prefix == "osd primary-temp") {
11998 string pgidstr;
11999 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
12000 ss << "unable to parse 'pgid' value '"
12001 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
12002 err = -EINVAL;
12003 goto reply;
12004 }
12005 pg_t pgid;
12006 if (!pgid.parse(pgidstr.c_str())) {
12007 ss << "invalid pgid '" << pgidstr << "'";
12008 err = -EINVAL;
12009 goto reply;
12010 }
12011 if (!osdmap.pg_exists(pgid)) {
12012 ss << "pg " << pgid << " does not exist";
12013 err = -ENOENT;
12014 goto reply;
12015 }
12016
12017 int64_t osd;
12018 if (!cmd_getval(cmdmap, "id", osd)) {
12019 ss << "unable to parse 'id' value '"
12020 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12021 err = -EINVAL;
12022 goto reply;
12023 }
12024 if (osd != -1 && !osdmap.exists(osd)) {
12025 ss << "osd." << osd << " does not exist";
12026 err = -ENOENT;
12027 goto reply;
12028 }
12029
12030 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12031 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12032 ss << "require_min_compat_client "
12033 << osdmap.require_min_compat_client
12034 << " < firefly, which is required for primary-temp";
12035 err = -EPERM;
12036 goto reply;
12037 }
12038
12039 pending_inc.new_primary_temp[pgid] = osd;
12040 ss << "set " << pgid << " primary_temp mapping to " << osd;
12041 goto update;
12042 } else if (prefix == "pg repeer") {
12043 pg_t pgid;
12044 string pgidstr;
12045 cmd_getval(cmdmap, "pgid", pgidstr);
12046 if (!pgid.parse(pgidstr.c_str())) {
12047 ss << "invalid pgid '" << pgidstr << "'";
12048 err = -EINVAL;
12049 goto reply;
12050 }
12051 if (!osdmap.pg_exists(pgid)) {
12052 ss << "pg '" << pgidstr << "' does not exist";
12053 err = -ENOENT;
12054 goto reply;
12055 }
12056 vector<int> acting;
12057 int primary;
12058 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12059 if (primary < 0) {
12060 err = -EAGAIN;
12061 ss << "pg currently has no primary";
12062 goto reply;
12063 }
12064 if (acting.size() > 1) {
12065 // map to just primary; it will map back to what it wants
12066 pending_inc.new_pg_temp[pgid] = { primary };
12067 } else {
12068 // hmm, pick another arbitrary osd to induce a change. Note
12069 // that this won't work if there is only one suitable OSD in the cluster.
12070 int i;
12071 bool done = false;
12072 for (i = 0; i < osdmap.get_max_osd(); ++i) {
12073 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
12074 continue;
12075 }
12076 pending_inc.new_pg_temp[pgid] = { primary, i };
12077 done = true;
12078 break;
12079 }
12080 if (!done) {
12081 err = -EAGAIN;
12082 ss << "not enough up OSDs in the cluster to force repeer";
12083 goto reply;
12084 }
12085 }
12086 goto update;
12087 } else if (prefix == "osd pg-upmap" ||
12088 prefix == "osd rm-pg-upmap" ||
12089 prefix == "osd pg-upmap-items" ||
12090 prefix == "osd rm-pg-upmap-items") {
12091 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
12092 ss << "min_compat_client "
12093 << osdmap.require_min_compat_client
12094 << " < luminous, which is required for pg-upmap. "
12095 << "Try 'ceph osd set-require-min-compat-client luminous' "
12096 << "before using the new interface";
12097 err = -EPERM;
12098 goto reply;
12099 }
12100 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
12101 if (err == -EAGAIN)
12102 goto wait;
12103 if (err < 0)
12104 goto reply;
12105 string pgidstr;
12106 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
12107 ss << "unable to parse 'pgid' value '"
12108 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
12109 err = -EINVAL;
12110 goto reply;
12111 }
12112 pg_t pgid;
12113 if (!pgid.parse(pgidstr.c_str())) {
12114 ss << "invalid pgid '" << pgidstr << "'";
12115 err = -EINVAL;
12116 goto reply;
12117 }
12118 if (!osdmap.pg_exists(pgid)) {
12119 ss << "pg " << pgid << " does not exist";
12120 err = -ENOENT;
12121 goto reply;
12122 }
12123 if (pending_inc.old_pools.count(pgid.pool())) {
12124 ss << "pool of " << pgid << " is pending removal";
12125 err = -ENOENT;
12126 getline(ss, rs);
12127 wait_for_finished_proposal(op,
12128 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
12129 return true;
12130 }
12131
12132 enum {
12133 OP_PG_UPMAP,
12134 OP_RM_PG_UPMAP,
12135 OP_PG_UPMAP_ITEMS,
12136 OP_RM_PG_UPMAP_ITEMS,
12137 } option;
12138
12139 if (prefix == "osd pg-upmap") {
12140 option = OP_PG_UPMAP;
12141 } else if (prefix == "osd rm-pg-upmap") {
12142 option = OP_RM_PG_UPMAP;
12143 } else if (prefix == "osd pg-upmap-items") {
12144 option = OP_PG_UPMAP_ITEMS;
12145 } else {
12146 option = OP_RM_PG_UPMAP_ITEMS;
12147 }
12148
12149 // check pending upmap changes
12150 switch (option) {
12151 case OP_PG_UPMAP: // fall through
12152 case OP_RM_PG_UPMAP:
12153 if (pending_inc.new_pg_upmap.count(pgid) ||
12154 pending_inc.old_pg_upmap.count(pgid)) {
12155 dout(10) << __func__ << " waiting for pending update on "
12156 << pgid << dendl;
12157 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12158 return true;
12159 }
12160 break;
12161
12162 case OP_PG_UPMAP_ITEMS: // fall through
12163 case OP_RM_PG_UPMAP_ITEMS:
12164 if (pending_inc.new_pg_upmap_items.count(pgid) ||
12165 pending_inc.old_pg_upmap_items.count(pgid)) {
12166 dout(10) << __func__ << " waiting for pending update on "
12167 << pgid << dendl;
12168 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12169 return true;
12170 }
12171 break;
12172
12173 default:
12174 ceph_abort_msg("invalid option");
12175 }
12176
12177 switch (option) {
12178 case OP_PG_UPMAP:
12179 {
12180 vector<int64_t> id_vec;
12181 if (!cmd_getval(cmdmap, "id", id_vec)) {
12182 ss << "unable to parse 'id' value(s) '"
12183 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12184 err = -EINVAL;
12185 goto reply;
12186 }
12187
12188 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12189 if ((int)id_vec.size() < pool_min_size) {
12190 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
12191 << pool_min_size << ")";
12192 err = -EINVAL;
12193 goto reply;
12194 }
12195
12196 int pool_size = osdmap.get_pg_pool_size(pgid);
12197 if ((int)id_vec.size() > pool_size) {
12198 ss << "num of osds (" << id_vec.size() <<") > pool size ("
12199 << pool_size << ")";
12200 err = -EINVAL;
12201 goto reply;
12202 }
12203
12204 vector<int32_t> new_pg_upmap;
12205 for (auto osd : id_vec) {
12206 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
12207 ss << "osd." << osd << " does not exist";
12208 err = -ENOENT;
12209 goto reply;
12210 }
12211 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
12212 if (it != new_pg_upmap.end()) {
12213 ss << "osd." << osd << " already exists, ";
12214 continue;
12215 }
12216 new_pg_upmap.push_back(osd);
12217 }
12218
12219 if (new_pg_upmap.empty()) {
12220 ss << "no valid upmap items(pairs) is specified";
12221 err = -EINVAL;
12222 goto reply;
12223 }
12224
12225 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
12226 new_pg_upmap.begin(), new_pg_upmap.end());
12227 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
12228 }
12229 break;
12230
12231 case OP_RM_PG_UPMAP:
12232 {
12233 pending_inc.old_pg_upmap.insert(pgid);
12234 ss << "clear " << pgid << " pg_upmap mapping";
12235 }
12236 break;
12237
12238 case OP_PG_UPMAP_ITEMS:
12239 {
12240 vector<int64_t> id_vec;
12241 if (!cmd_getval(cmdmap, "id", id_vec)) {
12242 ss << "unable to parse 'id' value(s) '"
12243 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12244 err = -EINVAL;
12245 goto reply;
12246 }
12247
12248 if (id_vec.size() % 2) {
12249 ss << "you must specify pairs of osd ids to be remapped";
12250 err = -EINVAL;
12251 goto reply;
12252 }
12253
12254 int pool_size = osdmap.get_pg_pool_size(pgid);
12255 if ((int)(id_vec.size() / 2) > pool_size) {
12256 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
12257 << pool_size << ")";
12258 err = -EINVAL;
12259 goto reply;
12260 }
12261
12262 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
12263 ostringstream items;
12264 items << "[";
12265 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
12266 int from = *p++;
12267 int to = *p;
12268 if (from == to) {
12269 ss << "from osd." << from << " == to osd." << to << ", ";
12270 continue;
12271 }
12272 if (!osdmap.exists(from)) {
12273 ss << "osd." << from << " does not exist";
12274 err = -ENOENT;
12275 goto reply;
12276 }
12277 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
12278 ss << "osd." << to << " does not exist";
12279 err = -ENOENT;
12280 goto reply;
12281 }
12282 pair<int32_t,int32_t> entry = make_pair(from, to);
12283 auto it = std::find(new_pg_upmap_items.begin(),
12284 new_pg_upmap_items.end(), entry);
12285 if (it != new_pg_upmap_items.end()) {
12286 ss << "osd." << from << " -> osd." << to << " already exists, ";
12287 continue;
12288 }
12289 new_pg_upmap_items.push_back(entry);
12290 items << from << "->" << to << ",";
12291 }
12292 string out(items.str());
12293 out.resize(out.size() - 1); // drop last ','
12294 out += "]";
12295
12296 if (new_pg_upmap_items.empty()) {
12297 ss << "no valid upmap items(pairs) is specified";
12298 err = -EINVAL;
12299 goto reply;
12300 }
12301
12302 pending_inc.new_pg_upmap_items[pgid] =
12303 mempool::osdmap::vector<pair<int32_t,int32_t>>(
12304 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
12305 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
12306 }
12307 break;
12308
12309 case OP_RM_PG_UPMAP_ITEMS:
12310 {
12311 pending_inc.old_pg_upmap_items.insert(pgid);
12312 ss << "clear " << pgid << " pg_upmap_items mapping";
12313 }
12314 break;
12315
12316 default:
12317 ceph_abort_msg("invalid option");
12318 }
12319
12320 goto update;
12321 } else if (prefix == "osd primary-affinity") {
12322 int64_t id;
12323 if (!cmd_getval(cmdmap, "id", id)) {
12324 ss << "invalid osd id value '"
12325 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12326 err = -EINVAL;
12327 goto reply;
12328 }
12329 double w;
12330 if (!cmd_getval(cmdmap, "weight", w)) {
12331 ss << "unable to parse 'weight' value '"
12332 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12333 err = -EINVAL;
12334 goto reply;
12335 }
12336 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12337 if (ww < 0L) {
12338 ss << "weight must be >= 0";
12339 err = -EINVAL;
12340 goto reply;
12341 }
12342 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12343 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12344 ss << "require_min_compat_client "
12345 << osdmap.require_min_compat_client
12346 << " < firefly, which is required for primary-affinity";
12347 err = -EPERM;
12348 goto reply;
12349 }
12350 if (osdmap.exists(id)) {
12351 pending_inc.new_primary_affinity[id] = ww;
12352 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12353 getline(ss, rs);
12354 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12355 get_last_committed() + 1));
12356 return true;
12357 } else {
12358 ss << "osd." << id << " does not exist";
12359 err = -ENOENT;
12360 goto reply;
12361 }
12362 } else if (prefix == "osd reweight") {
12363 int64_t id;
12364 if (!cmd_getval(cmdmap, "id", id)) {
12365 ss << "unable to parse osd id value '"
12366 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12367 err = -EINVAL;
12368 goto reply;
12369 }
12370 double w;
12371 if (!cmd_getval(cmdmap, "weight", w)) {
12372 ss << "unable to parse weight value '"
12373 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12374 err = -EINVAL;
12375 goto reply;
12376 }
12377 long ww = (int)((double)CEPH_OSD_IN*w);
12378 if (ww < 0L) {
12379 ss << "weight must be >= 0";
12380 err = -EINVAL;
12381 goto reply;
12382 }
12383 if (osdmap.exists(id)) {
12384 pending_inc.new_weight[id] = ww;
12385 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12386 getline(ss, rs);
12387 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12388 get_last_committed() + 1));
12389 return true;
12390 } else {
12391 ss << "osd." << id << " does not exist";
12392 err = -ENOENT;
12393 goto reply;
12394 }
12395 } else if (prefix == "osd reweightn") {
12396 map<int32_t, uint32_t> weights;
12397 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12398 if (err) {
12399 ss << "unable to parse 'weights' value '"
12400 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12401 goto reply;
12402 }
12403 pending_inc.new_weight.insert(weights.begin(), weights.end());
12404 wait_for_finished_proposal(
12405 op,
12406 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12407 return true;
12408 } else if (prefix == "osd lost") {
12409 int64_t id;
12410 if (!cmd_getval(cmdmap, "id", id)) {
12411 ss << "unable to parse osd id value '"
12412 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12413 err = -EINVAL;
12414 goto reply;
12415 }
12416 bool sure = false;
12417 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12418 if (!sure) {
12419 ss << "are you SURE? this might mean real, permanent data loss. pass "
12420 "--yes-i-really-mean-it if you really do.";
12421 err = -EPERM;
12422 goto reply;
12423 } else if (!osdmap.exists(id)) {
12424 ss << "osd." << id << " does not exist";
12425 err = -ENOENT;
12426 goto reply;
12427 } else if (!osdmap.is_down(id)) {
12428 ss << "osd." << id << " is not down";
12429 err = -EBUSY;
12430 goto reply;
12431 } else {
12432 epoch_t e = osdmap.get_info(id).down_at;
12433 pending_inc.new_lost[id] = e;
12434 ss << "marked osd lost in epoch " << e;
12435 getline(ss, rs);
12436 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12437 get_last_committed() + 1));
12438 return true;
12439 }
12440
12441 } else if (prefix == "osd destroy-actual" ||
12442 prefix == "osd purge-actual" ||
12443 prefix == "osd purge-new") {
12444 /* Destroying an OSD means that we don't expect to further make use of
12445 * the OSDs data (which may even become unreadable after this operation),
12446 * and that we are okay with scrubbing all its cephx keys and config-key
12447 * data (which may include lockbox keys, thus rendering the osd's data
12448 * unreadable).
12449 *
12450 * The OSD will not be removed. Instead, we will mark it as destroyed,
12451 * such that a subsequent call to `create` will not reuse the osd id.
12452 * This will play into being able to recreate the OSD, at the same
12453 * crush location, with minimal data movement.
12454 */
12455
12456 // make sure authmon is writeable.
12457 if (!mon.authmon()->is_writeable()) {
12458 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12459 << "osd destroy" << dendl;
12460 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12461 return false;
12462 }
12463
12464 int64_t id;
12465 if (!cmd_getval(cmdmap, "id", id)) {
12466 auto p = cmdmap.find("id");
12467 if (p == cmdmap.end()) {
12468 ss << "no osd id specified";
12469 } else {
12470 ss << "unable to parse osd id value '"
12471 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12472 }
12473 err = -EINVAL;
12474 goto reply;
12475 }
12476
12477 bool is_destroy = (prefix == "osd destroy-actual");
12478 if (!is_destroy) {
12479 ceph_assert("osd purge-actual" == prefix ||
12480 "osd purge-new" == prefix);
12481 }
12482
12483 bool sure = false;
12484 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12485 if (!sure) {
12486 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12487 << "This will mean real, permanent data loss, as well "
12488 << "as deletion of cephx and lockbox keys. "
12489 << "Pass --yes-i-really-mean-it if you really do.";
12490 err = -EPERM;
12491 goto reply;
12492 } else if (!osdmap.exists(id)) {
12493 ss << "osd." << id << " does not exist";
12494 err = 0; // idempotent
12495 goto reply;
12496 } else if (osdmap.is_up(id)) {
12497 ss << "osd." << id << " is not `down`.";
12498 err = -EBUSY;
12499 goto reply;
12500 } else if (is_destroy && osdmap.is_destroyed(id)) {
12501 ss << "destroyed osd." << id;
12502 err = 0;
12503 goto reply;
12504 }
12505
12506 if (prefix == "osd purge-new" &&
12507 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12508 ss << "osd." << id << " is not new";
12509 err = -EPERM;
12510 goto reply;
12511 }
12512
12513 bool goto_reply = false;
12514
12515 paxos.plug();
12516 if (is_destroy) {
12517 err = prepare_command_osd_destroy(id, ss);
12518 // we checked above that it should exist.
12519 ceph_assert(err != -ENOENT);
12520 } else {
12521 err = prepare_command_osd_purge(id, ss);
12522 if (err == -ENOENT) {
12523 err = 0;
12524 ss << "osd." << id << " does not exist.";
12525 goto_reply = true;
12526 }
12527 }
12528 paxos.unplug();
12529
12530 if (err < 0 || goto_reply) {
12531 goto reply;
12532 }
12533
12534 if (is_destroy) {
12535 ss << "destroyed osd." << id;
12536 } else {
12537 ss << "purged osd." << id;
12538 }
12539
12540 getline(ss, rs);
12541 wait_for_finished_proposal(op,
12542 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12543 force_immediate_propose();
12544 return true;
12545
12546 } else if (prefix == "osd new") {
12547
12548 // make sure authmon is writeable.
12549 if (!mon.authmon()->is_writeable()) {
12550 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12551 << "osd new" << dendl;
12552 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12553 return false;
12554 }
12555
12556 map<string,string> param_map;
12557
12558 bufferlist bl = m->get_data();
12559 string param_json = bl.to_str();
12560 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12561
12562 err = get_json_str_map(param_json, ss, &param_map);
12563 if (err < 0)
12564 goto reply;
12565
12566 dout(20) << __func__ << " osd new params " << param_map << dendl;
12567
12568 paxos.plug();
12569 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12570 paxos.unplug();
12571
12572 if (err < 0) {
12573 goto reply;
12574 }
12575
12576 if (f) {
12577 f->flush(rdata);
12578 } else {
12579 rdata.append(ss);
12580 }
12581
12582 if (err == EEXIST) {
12583 // idempotent operation
12584 err = 0;
12585 goto reply;
12586 }
12587
12588 wait_for_finished_proposal(op,
12589 new Monitor::C_Command(mon, op, 0, rs, rdata,
12590 get_last_committed() + 1));
12591 force_immediate_propose();
12592 return true;
12593
12594 } else if (prefix == "osd create") {
12595
12596 // optional id provided?
12597 int64_t id = -1, cmd_id = -1;
12598 if (cmd_getval(cmdmap, "id", cmd_id)) {
12599 if (cmd_id < 0) {
12600 ss << "invalid osd id value '" << cmd_id << "'";
12601 err = -EINVAL;
12602 goto reply;
12603 }
12604 dout(10) << " osd create got id " << cmd_id << dendl;
12605 }
12606
12607 uuid_d uuid;
12608 string uuidstr;
12609 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12610 if (!uuid.parse(uuidstr.c_str())) {
12611 ss << "invalid uuid value '" << uuidstr << "'";
12612 err = -EINVAL;
12613 goto reply;
12614 }
12615 // we only care about the id if we also have the uuid, to
12616 // ensure the operation's idempotency.
12617 id = cmd_id;
12618 }
12619
12620 int32_t new_id = -1;
12621 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12622 if (err < 0) {
12623 if (err == -EAGAIN) {
12624 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12625 return true;
12626 }
12627 // a check has failed; reply to the user.
12628 goto reply;
12629
12630 } else if (err == EEXIST) {
12631 // this is an idempotent operation; we can go ahead and reply.
12632 if (f) {
12633 f->open_object_section("created_osd");
12634 f->dump_int("osdid", new_id);
12635 f->close_section();
12636 f->flush(rdata);
12637 } else {
12638 ss << new_id;
12639 rdata.append(ss);
12640 }
12641 err = 0;
12642 goto reply;
12643 }
12644
12645 string empty_device_class;
12646 do_osd_create(id, uuid, empty_device_class, &new_id);
12647
12648 if (f) {
12649 f->open_object_section("created_osd");
12650 f->dump_int("osdid", new_id);
12651 f->close_section();
12652 f->flush(rdata);
12653 } else {
12654 ss << new_id;
12655 rdata.append(ss);
12656 }
12657 wait_for_finished_proposal(op,
12658 new Monitor::C_Command(mon, op, 0, rs, rdata,
12659 get_last_committed() + 1));
12660 return true;
12661
12662 } else if (prefix == "osd blocklist clear" ||
12663 prefix == "osd blacklist clear") {
12664 pending_inc.new_blocklist.clear();
12665 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12666 osdmap.get_blocklist(&blocklist);
12667 for (const auto &entry : blocklist) {
12668 pending_inc.old_blocklist.push_back(entry.first);
12669 }
12670 ss << " removed all blocklist entries";
12671 getline(ss, rs);
12672 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12673 get_last_committed() + 1));
12674 return true;
12675 } else if (prefix == "osd blocklist" ||
12676 prefix == "osd blacklist") {
12677 string addrstr;
12678 cmd_getval(cmdmap, "addr", addrstr);
12679 entity_addr_t addr;
12680 if (!addr.parse(addrstr)) {
12681 ss << "unable to parse address " << addrstr;
12682 err = -EINVAL;
12683 goto reply;
12684 }
12685 else {
12686 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12687 // always blocklist type ANY
12688 addr.set_type(entity_addr_t::TYPE_ANY);
12689 } else {
12690 addr.set_type(entity_addr_t::TYPE_LEGACY);
12691 }
12692
12693 string blocklistop;
12694 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12695 cmd_getval(cmdmap, "blacklistop", blocklistop);
12696 }
12697 if (blocklistop == "add") {
12698 utime_t expires = ceph_clock_now();
12699 // default one hour
12700 double d = cmd_getval_or<double>(cmdmap, "expire",
12701 g_conf()->mon_osd_blocklist_default_expire);
12702 expires += d;
12703
12704 pending_inc.new_blocklist[addr] = expires;
12705
12706 {
12707 // cancel any pending un-blocklisting request too
12708 auto it = std::find(pending_inc.old_blocklist.begin(),
12709 pending_inc.old_blocklist.end(), addr);
12710 if (it != pending_inc.old_blocklist.end()) {
12711 pending_inc.old_blocklist.erase(it);
12712 }
12713 }
12714
12715 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12716 getline(ss, rs);
12717 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12718 get_last_committed() + 1));
12719 return true;
12720 } else if (blocklistop == "rm") {
12721 if (osdmap.is_blocklisted(addr) ||
12722 pending_inc.new_blocklist.count(addr)) {
12723 if (osdmap.is_blocklisted(addr))
12724 pending_inc.old_blocklist.push_back(addr);
12725 else
12726 pending_inc.new_blocklist.erase(addr);
12727 ss << "un-blocklisting " << addr;
12728 getline(ss, rs);
12729 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12730 get_last_committed() + 1));
12731 return true;
12732 }
12733 ss << addr << " isn't blocklisted";
12734 err = 0;
12735 goto reply;
12736 }
12737 }
12738 } else if (prefix == "osd pool mksnap") {
12739 string poolstr;
12740 cmd_getval(cmdmap, "pool", poolstr);
12741 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12742 if (pool < 0) {
12743 ss << "unrecognized pool '" << poolstr << "'";
12744 err = -ENOENT;
12745 goto reply;
12746 }
12747 string snapname;
12748 cmd_getval(cmdmap, "snap", snapname);
12749 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12750 if (p->is_unmanaged_snaps_mode()) {
12751 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12752 err = -EINVAL;
12753 goto reply;
12754 } else if (p->snap_exists(snapname.c_str())) {
12755 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12756 err = 0;
12757 goto reply;
12758 } else if (p->is_tier()) {
12759 ss << "pool " << poolstr << " is a cache tier";
12760 err = -EINVAL;
12761 goto reply;
12762 }
12763 pg_pool_t *pp = 0;
12764 if (pending_inc.new_pools.count(pool))
12765 pp = &pending_inc.new_pools[pool];
12766 if (!pp) {
12767 pp = &pending_inc.new_pools[pool];
12768 *pp = *p;
12769 }
12770 if (pp->snap_exists(snapname.c_str())) {
12771 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12772 } else {
12773 pp->add_snap(snapname.c_str(), ceph_clock_now());
12774 pp->set_snap_epoch(pending_inc.epoch);
12775 ss << "created pool " << poolstr << " snap " << snapname;
12776 }
12777 getline(ss, rs);
12778 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12779 get_last_committed() + 1));
12780 return true;
12781 } else if (prefix == "osd pool rmsnap") {
12782 string poolstr;
12783 cmd_getval(cmdmap, "pool", poolstr);
12784 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12785 if (pool < 0) {
12786 ss << "unrecognized pool '" << poolstr << "'";
12787 err = -ENOENT;
12788 goto reply;
12789 }
12790 string snapname;
12791 cmd_getval(cmdmap, "snap", snapname);
12792 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12793 if (p->is_unmanaged_snaps_mode()) {
12794 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12795 err = -EINVAL;
12796 goto reply;
12797 } else if (!p->snap_exists(snapname.c_str())) {
12798 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12799 err = 0;
12800 goto reply;
12801 }
12802 pg_pool_t *pp = 0;
12803 if (pending_inc.new_pools.count(pool))
12804 pp = &pending_inc.new_pools[pool];
12805 if (!pp) {
12806 pp = &pending_inc.new_pools[pool];
12807 *pp = *p;
12808 }
12809 snapid_t sn = pp->snap_exists(snapname.c_str());
12810 if (sn) {
12811 pp->remove_snap(sn);
12812 pp->set_snap_epoch(pending_inc.epoch);
12813 ss << "removed pool " << poolstr << " snap " << snapname;
12814 } else {
12815 ss << "already removed pool " << poolstr << " snap " << snapname;
12816 }
12817 getline(ss, rs);
12818 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12819 get_last_committed() + 1));
12820 return true;
12821 } else if (prefix == "osd pool create") {
12822 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
12823 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
12824 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
12825 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
12826 string pool_type_str;
12827 cmd_getval(cmdmap, "pool_type", pool_type_str);
12828 if (pool_type_str.empty())
12829 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12830
12831 string poolstr;
12832 cmd_getval(cmdmap, "pool", poolstr);
12833 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12834 if (pool_id >= 0) {
12835 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12836 if (pool_type_str != p->get_type_name()) {
12837 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12838 err = -EINVAL;
12839 } else {
12840 ss << "pool '" << poolstr << "' already exists";
12841 err = 0;
12842 }
12843 goto reply;
12844 }
12845
12846 int pool_type;
12847 if (pool_type_str == "replicated") {
12848 pool_type = pg_pool_t::TYPE_REPLICATED;
12849 } else if (pool_type_str == "erasure") {
12850 pool_type = pg_pool_t::TYPE_ERASURE;
12851 } else {
12852 ss << "unknown pool type '" << pool_type_str << "'";
12853 err = -EINVAL;
12854 goto reply;
12855 }
12856
12857 bool implicit_rule_creation = false;
12858 int64_t expected_num_objects = 0;
12859 string rule_name;
12860 cmd_getval(cmdmap, "rule", rule_name);
12861 string erasure_code_profile;
12862 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12863
12864 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12865 if (erasure_code_profile == "")
12866 erasure_code_profile = "default";
12867 //handle the erasure code profile
12868 if (erasure_code_profile == "default") {
12869 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12870 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12871 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12872 goto wait;
12873 }
12874
12875 map<string,string> profile_map;
12876 err = osdmap.get_erasure_code_profile_default(cct,
12877 profile_map,
12878 &ss);
12879 if (err)
12880 goto reply;
12881 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12882 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12883 goto wait;
12884 }
12885 }
12886 if (rule_name == "") {
12887 implicit_rule_creation = true;
12888 if (erasure_code_profile == "default") {
12889 rule_name = "erasure-code";
12890 } else {
12891 dout(1) << "implicitly use rule named after the pool: "
12892 << poolstr << dendl;
12893 rule_name = poolstr;
12894 }
12895 }
12896 expected_num_objects =
12897 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
12898 } else {
12899 // NOTE: for a replicated pool, cmd_map puts rule_name into the erasure_code_profile field
12900 // and expected_num_objects into the rule field
12901 if (erasure_code_profile != "") { // cmd is from CLI
12902 if (rule_name != "") {
12903 string interr;
12904 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12905 if (interr.length()) {
12906 ss << "error parsing integer value '" << rule_name << "': " << interr;
12907 err = -EINVAL;
12908 goto reply;
12909 }
12910 }
12911 rule_name = erasure_code_profile;
12912 } else { // cmd is well-formed
12913 expected_num_objects =
12914 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
12915 }
12916 }
12917
12918 if (!implicit_rule_creation && rule_name != "") {
12919 int rule;
12920 err = get_crush_rule(rule_name, &rule, &ss);
12921 if (err == -EAGAIN) {
12922 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12923 return true;
12924 }
12925 if (err)
12926 goto reply;
12927 }
12928
12929 if (expected_num_objects < 0) {
12930 ss << "'expected_num_objects' must be non-negative";
12931 err = -EINVAL;
12932 goto reply;
12933 }
12934
12935 set<int32_t> osds;
12936 osdmap.get_all_osds(osds);
12937 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12938 string type;
12939 if (!get_osd_objectstore_type(osd, &type)) {
12940 return type == "filestore";
12941 } else {
12942 return false;
12943 }
12944 });
12945
12946 if (has_filestore_osd &&
12947 expected_num_objects > 0 &&
12948 cct->_conf->filestore_merge_threshold > 0) {
12949 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12950 err = -EINVAL;
12951 goto reply;
12952 }
12953
12954 if (has_filestore_osd &&
12955 expected_num_objects == 0 &&
12956 cct->_conf->filestore_merge_threshold < 0) {
12957 int osds = osdmap.get_num_osds();
12958 bool sure = false;
12959 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12960 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12961 ss << "For better initial performance on pools expected to store a "
12962 << "large number of objects, consider supplying the "
12963 << "expected_num_objects parameter when creating the pool."
12964 << " Pass --yes-i-really-mean-it to ignore it";
12965 err = -EPERM;
12966 goto reply;
12967 }
12968 }
12969
12970 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
12971 FastReadType fast_read = FAST_READ_DEFAULT;
12972 if (fast_read_param == 0)
12973 fast_read = FAST_READ_OFF;
12974 else if (fast_read_param > 0)
12975 fast_read = FAST_READ_ON;
12976
12977 int64_t repl_size = 0;
12978 cmd_getval(cmdmap, "size", repl_size);
12979 int64_t target_size_bytes = 0;
12980 double target_size_ratio = 0.0;
12981 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12982 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12983
12984 string pg_autoscale_mode;
12985 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12986
12987 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
12988 err = prepare_new_pool(poolstr,
12989 -1, // default crush rule
12990 rule_name,
12991 pg_num, pgp_num, pg_num_min, pg_num_max,
12992 repl_size, target_size_bytes, target_size_ratio,
12993 erasure_code_profile, pool_type,
12994 (uint64_t)expected_num_objects,
12995 fast_read,
12996 pg_autoscale_mode,
12997 bulk,
12998 &ss);
12999 if (err < 0) {
13000 switch(err) {
13001 case -EEXIST:
13002 ss << "pool '" << poolstr << "' already exists";
13003 break;
13004 case -EAGAIN:
13005 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13006 return true;
13007 case -ERANGE:
13008 goto reply;
13009 default:
13010 goto reply;
13011 break;
13012 }
13013 } else {
13014 ss << "pool '" << poolstr << "' created";
13015 }
13016 getline(ss, rs);
13017 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13018 get_last_committed() + 1));
13019 return true;
13020
13021 } else if (prefix == "osd pool delete" ||
13022 prefix == "osd pool rm") {
13023 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13024 string poolstr, poolstr2, sure;
13025 cmd_getval(cmdmap, "pool", poolstr);
13026 cmd_getval(cmdmap, "pool2", poolstr2);
13027 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13028 if (pool < 0) {
13029 ss << "pool '" << poolstr << "' does not exist";
13030 err = 0;
13031 goto reply;
13032 }
13033
13034 bool force_no_fake = false;
13035 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13036 bool force = false;
13037 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13038 if (poolstr2 != poolstr ||
13039 (!force && !force_no_fake)) {
13040 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13041 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13042 << "followed by --yes-i-really-really-mean-it.";
13043 err = -EPERM;
13044 goto reply;
13045 }
13046 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13047 if (err == -EAGAIN) {
13048 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13049 return true;
13050 }
13051 if (err < 0)
13052 goto reply;
13053 goto update;
13054 } else if (prefix == "osd pool rename") {
13055 string srcpoolstr, destpoolstr;
13056 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13057 cmd_getval(cmdmap, "destpool", destpoolstr);
13058 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13059 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13060
13061 if (pool_src < 0) {
13062 if (pool_dst >= 0) {
13063 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13064 // of operations, assume this rename succeeded, as it is not changing
13065 // the current state. Make sure we output something understandable
13066 // for whoever is issuing the command, if they are paying attention,
13067 // in case it was not intentional; or to avoid a "wtf?" and a bug
13068 // report in case it was intentional, while expecting a failure.
13069 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13070 << destpoolstr << "' does -- assuming successful rename";
13071 err = 0;
13072 } else {
13073 ss << "unrecognized pool '" << srcpoolstr << "'";
13074 err = -ENOENT;
13075 }
13076 goto reply;
13077 } else if (pool_dst >= 0) {
13078 // source pool exists and so does the destination pool
13079 ss << "pool '" << destpoolstr << "' already exists";
13080 err = -EEXIST;
13081 goto reply;
13082 }
13083
13084 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13085 if (ret == 0) {
13086 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13087 } else {
13088 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13089 << cpp_strerror(ret);
13090 }
13091 getline(ss, rs);
13092 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13093 get_last_committed() + 1));
13094 return true;
13095
13096 } else if (prefix == "osd pool set") {
13097 err = prepare_command_pool_set(cmdmap, ss);
13098 if (err == -EAGAIN)
13099 goto wait;
13100 if (err < 0)
13101 goto reply;
13102
13103 getline(ss, rs);
13104 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13105 get_last_committed() + 1));
13106 return true;
13107 } else if (prefix == "osd tier add") {
13108 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13109 if (err == -EAGAIN)
13110 goto wait;
13111 if (err)
13112 goto reply;
13113 string poolstr;
13114 cmd_getval(cmdmap, "pool", poolstr);
13115 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13116 if (pool_id < 0) {
13117 ss << "unrecognized pool '" << poolstr << "'";
13118 err = -ENOENT;
13119 goto reply;
13120 }
13121 string tierpoolstr;
13122 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13123 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13124 if (tierpool_id < 0) {
13125 ss << "unrecognized pool '" << tierpoolstr << "'";
13126 err = -ENOENT;
13127 goto reply;
13128 }
13129 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13130 ceph_assert(p);
13131 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13132 ceph_assert(tp);
13133
13134 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13135 goto reply;
13136 }
13137
13138 // make sure new tier is empty
13139 bool force_nonempty = false;
13140 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13141 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13142 if (pstats && pstats->stats.sum.num_objects != 0 &&
13143 !force_nonempty) {
13144 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13145 err = -ENOTEMPTY;
13146 goto reply;
13147 }
13148 if (tp->is_erasure()) {
13149 ss << "tier pool '" << tierpoolstr
13150 << "' is an ec pool, which cannot be a tier";
13151 err = -ENOTSUP;
13152 goto reply;
13153 }
13154 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13155 (!force_nonempty ||
13156 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13157 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13158 err = -ENOTEMPTY;
13159 goto reply;
13160 }
13161 // go
13162 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13163 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13164 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13165 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13166 return true;
13167 }
13168 np->tiers.insert(tierpool_id);
13169 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13170 ntp->tier_of = pool_id;
13171 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13172 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13173 get_last_committed() + 1));
13174 return true;
13175 } else if (prefix == "osd tier remove" ||
13176 prefix == "osd tier rm") {
13177 string poolstr;
13178 cmd_getval(cmdmap, "pool", poolstr);
13179 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13180 if (pool_id < 0) {
13181 ss << "unrecognized pool '" << poolstr << "'";
13182 err = -ENOENT;
13183 goto reply;
13184 }
13185 string tierpoolstr;
13186 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13187 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13188 if (tierpool_id < 0) {
13189 ss << "unrecognized pool '" << tierpoolstr << "'";
13190 err = -ENOENT;
13191 goto reply;
13192 }
13193 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13194 ceph_assert(p);
13195 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13196 ceph_assert(tp);
13197
13198 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13199 goto reply;
13200 }
13201
13202 if (p->tiers.count(tierpool_id) == 0) {
13203 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13204 err = 0;
13205 goto reply;
13206 }
13207 if (tp->tier_of != pool_id) {
13208 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13209 << osdmap.get_pool_name(tp->tier_of) << "': "
13210 // be scary about it; this is an inconsistency and bells must go off
13211 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13212 err = -EINVAL;
13213 goto reply;
13214 }
13215 if (p->read_tier == tierpool_id) {
13216 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13217 err = -EBUSY;
13218 goto reply;
13219 }
13220 // go
13221 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13222 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13223 if (np->tiers.count(tierpool_id) == 0 ||
13224 ntp->tier_of != pool_id ||
13225 np->read_tier == tierpool_id) {
13226 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13227 return true;
13228 }
13229 np->tiers.erase(tierpool_id);
13230 ntp->clear_tier();
13231 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13232 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13233 get_last_committed() + 1));
13234 return true;
13235 } else if (prefix == "osd tier set-overlay") {
13236 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13237 if (err == -EAGAIN)
13238 goto wait;
13239 if (err)
13240 goto reply;
13241 string poolstr;
13242 cmd_getval(cmdmap, "pool", poolstr);
13243 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13244 if (pool_id < 0) {
13245 ss << "unrecognized pool '" << poolstr << "'";
13246 err = -ENOENT;
13247 goto reply;
13248 }
13249 string overlaypoolstr;
13250 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13251 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13252 if (overlaypool_id < 0) {
13253 ss << "unrecognized pool '" << overlaypoolstr << "'";
13254 err = -ENOENT;
13255 goto reply;
13256 }
13257 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13258 ceph_assert(p);
13259 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13260 ceph_assert(overlay_p);
13261 if (p->tiers.count(overlaypool_id) == 0) {
13262 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13263 err = -EINVAL;
13264 goto reply;
13265 }
13266 if (p->read_tier == overlaypool_id) {
13267 err = 0;
13268 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13269 goto reply;
13270 }
13271 if (p->has_read_tier()) {
13272 ss << "pool '" << poolstr << "' has overlay '"
13273 << osdmap.get_pool_name(p->read_tier)
13274 << "'; please remove-overlay first";
13275 err = -EINVAL;
13276 goto reply;
13277 }
13278
13279 // go
13280 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13281 np->read_tier = overlaypool_id;
13282 np->write_tier = overlaypool_id;
13283 np->set_last_force_op_resend(pending_inc.epoch);
13284 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13285 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13286 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13287 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13288 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13289 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13290 get_last_committed() + 1));
13291 return true;
13292 } else if (prefix == "osd tier remove-overlay" ||
13293 prefix == "osd tier rm-overlay") {
13294 string poolstr;
13295 cmd_getval(cmdmap, "pool", poolstr);
13296 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13297 if (pool_id < 0) {
13298 ss << "unrecognized pool '" << poolstr << "'";
13299 err = -ENOENT;
13300 goto reply;
13301 }
13302 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13303 ceph_assert(p);
13304 if (!p->has_read_tier()) {
13305 err = 0;
13306 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13307 goto reply;
13308 }
13309
13310 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13311 goto reply;
13312 }
13313
13314 // go
13315 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13316 if (np->has_read_tier()) {
13317 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13318 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13319 nop->set_last_force_op_resend(pending_inc.epoch);
13320 }
13321 if (np->has_write_tier()) {
13322 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13323 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13324 nop->set_last_force_op_resend(pending_inc.epoch);
13325 }
13326 np->clear_read_tier();
13327 np->clear_write_tier();
13328 np->set_last_force_op_resend(pending_inc.epoch);
13329 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13331 get_last_committed() + 1));
13332 return true;
13333 } else if (prefix == "osd tier cache-mode") {
13334 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13335 if (err == -EAGAIN)
13336 goto wait;
13337 if (err)
13338 goto reply;
13339 string poolstr;
13340 cmd_getval(cmdmap, "pool", poolstr);
13341 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13342 if (pool_id < 0) {
13343 ss << "unrecognized pool '" << poolstr << "'";
13344 err = -ENOENT;
13345 goto reply;
13346 }
13347 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13348 ceph_assert(p);
13349 if (!p->is_tier()) {
13350 ss << "pool '" << poolstr << "' is not a tier";
13351 err = -EINVAL;
13352 goto reply;
13353 }
13354 string modestr;
13355 cmd_getval(cmdmap, "mode", modestr);
13356 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13357 if (int(mode) < 0) {
13358 ss << "'" << modestr << "' is not a valid cache mode";
13359 err = -EINVAL;
13360 goto reply;
13361 }
13362
13363 bool sure = false;
13364 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13365
13366 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13367 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13368 ss << "'" << modestr << "' is no longer a supported cache mode";
13369 err = -EPERM;
13370 goto reply;
13371 }
13372 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13373 mode != pg_pool_t::CACHEMODE_NONE &&
13374 mode != pg_pool_t::CACHEMODE_PROXY &&
13375 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13376 !sure) {
13377 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13378 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13379 err = -EPERM;
13380 goto reply;
13381 }
13382
13383 // pool already has this cache-mode set and there are no pending changes
13384 if (p->cache_mode == mode &&
13385 (pending_inc.new_pools.count(pool_id) == 0 ||
13386 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13387 ss << "set cache-mode for pool '" << poolstr << "'"
13388 << " to " << pg_pool_t::get_cache_mode_name(mode);
13389 err = 0;
13390 goto reply;
13391 }
13392
13393 /* Mode description:
13394 *
13395 * none: No cache-mode defined
13396 * forward: Forward all reads and writes to base pool [removed]
13397 * writeback: Cache writes, promote reads from base pool
13398 * readonly: Forward writes to base pool
13399 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13400 * proxy: Proxy all reads and writes to base pool
13401 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13402 *
13403 * Hence, these are the allowed transitions:
13404 *
13405 * none -> any
13406 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13407 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13408 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13409 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13410 * writeback -> readproxy || proxy
13411 * readonly -> any
13412 */
13413
13414 // We check if the transition is valid against the current pool mode, as
13415 // it is the only committed state thus far. We will blatantly squash
13416 // whatever mode is on the pending state.
13417
13418 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13419 (mode != pg_pool_t::CACHEMODE_PROXY &&
13420 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13421 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13422 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13423 << "' pool; only '"
13424 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13425 << "' allowed.";
13426 err = -EINVAL;
13427 goto reply;
13428 }
13429 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13430 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13431 mode != pg_pool_t::CACHEMODE_PROXY &&
13432 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13433
13434 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13435 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13436 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13437
13438 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13439 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13440 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13441
13442 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13443 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13444 mode != pg_pool_t::CACHEMODE_PROXY &&
13445 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13446
13447 const pool_stat_t* pstats =
13448 mon.mgrstatmon()->get_pool_stat(pool_id);
13449
13450 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13451 ss << "unable to set cache-mode '"
13452 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13453 << "': dirty objects found";
13454 err = -EBUSY;
13455 goto reply;
13456 }
13457 }
13458 // go
13459 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13460 np->cache_mode = mode;
13461 // set this both when moving to and from cache_mode NONE. this is to
13462 // capture legacy pools that were set up before this flag existed.
13463 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13464 ss << "set cache-mode for pool '" << poolstr
13465 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13466 if (mode == pg_pool_t::CACHEMODE_NONE) {
13467 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13468 ceph_assert(base_pool);
13469 if (base_pool->read_tier == pool_id ||
13470 base_pool->write_tier == pool_id)
13471 ss <<" (WARNING: pool is still configured as read or write tier)";
13472 }
13473 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13474 get_last_committed() + 1));
13475 return true;
13476 } else if (prefix == "osd tier add-cache") {
13477 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13478 if (err == -EAGAIN)
13479 goto wait;
13480 if (err)
13481 goto reply;
13482 string poolstr;
13483 cmd_getval(cmdmap, "pool", poolstr);
13484 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13485 if (pool_id < 0) {
13486 ss << "unrecognized pool '" << poolstr << "'";
13487 err = -ENOENT;
13488 goto reply;
13489 }
13490 string tierpoolstr;
13491 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13492 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13493 if (tierpool_id < 0) {
13494 ss << "unrecognized pool '" << tierpoolstr << "'";
13495 err = -ENOENT;
13496 goto reply;
13497 }
13498 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13499 ceph_assert(p);
13500 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13501 ceph_assert(tp);
13502
13503 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13504 goto reply;
13505 }
13506
13507 int64_t size = 0;
13508 if (!cmd_getval(cmdmap, "size", size)) {
13509 ss << "unable to parse 'size' value '"
13510 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13511 err = -EINVAL;
13512 goto reply;
13513 }
13514 // make sure new tier is empty
13515 const pool_stat_t *pstats =
13516 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13517 if (pstats && pstats->stats.sum.num_objects != 0) {
13518 ss << "tier pool '" << tierpoolstr << "' is not empty";
13519 err = -ENOTEMPTY;
13520 goto reply;
13521 }
13522 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13523 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13524 if (int(mode) < 0) {
13525 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13526 err = -EINVAL;
13527 goto reply;
13528 }
13529 HitSet::Params hsp;
13530 auto& cache_hit_set_type =
13531 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13532 if (cache_hit_set_type == "bloom") {
13533 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13534 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13535 hsp = HitSet::Params(bsp);
13536 } else if (cache_hit_set_type == "explicit_hash") {
13537 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13538 } else if (cache_hit_set_type == "explicit_object") {
13539 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13540 } else {
13541 ss << "osd tier cache default hit set type '"
13542 << cache_hit_set_type << "' is not a known type";
13543 err = -EINVAL;
13544 goto reply;
13545 }
13546 // go
13547 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13548 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13549 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13550 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13551 return true;
13552 }
13553 np->tiers.insert(tierpool_id);
13554 np->read_tier = np->write_tier = tierpool_id;
13555 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13556 np->set_last_force_op_resend(pending_inc.epoch);
13557 ntp->set_last_force_op_resend(pending_inc.epoch);
13558 ntp->tier_of = pool_id;
13559 ntp->cache_mode = mode;
13560 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13561 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13562 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13563 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13564 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13565 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13566 ntp->hit_set_params = hsp;
13567 ntp->target_max_bytes = size;
13568 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13569 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13570 get_last_committed() + 1));
13571 return true;
13572 } else if (prefix == "osd pool set-quota") {
13573 string poolstr;
13574 cmd_getval(cmdmap, "pool", poolstr);
13575 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13576 if (pool_id < 0) {
13577 ss << "unrecognized pool '" << poolstr << "'";
13578 err = -ENOENT;
13579 goto reply;
13580 }
13581
13582 string field;
13583 cmd_getval(cmdmap, "field", field);
13584 if (field != "max_objects" && field != "max_bytes") {
13585 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13586 err = -EINVAL;
13587 goto reply;
13588 }
13589
13590 // val could contain unit designations, so we treat as a string
13591 string val;
13592 cmd_getval(cmdmap, "val", val);
13593 string tss;
13594 int64_t value;
13595 if (field == "max_objects") {
13596 value = strict_si_cast<uint64_t>(val, &tss);
13597 } else if (field == "max_bytes") {
13598 value = strict_iecstrtoll(val, &tss);
13599 } else {
13600 ceph_abort_msg("unrecognized option");
13601 }
13602 if (!tss.empty()) {
13603 ss << "error parsing value '" << val << "': " << tss;
13604 err = -EINVAL;
13605 goto reply;
13606 }
13607
13608 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13609 if (field == "max_objects") {
13610 pi->quota_max_objects = value;
13611 } else if (field == "max_bytes") {
13612 pi->quota_max_bytes = value;
13613 } else {
13614 ceph_abort_msg("unrecognized option");
13615 }
13616 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13617 rs = ss.str();
13618 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13619 get_last_committed() + 1));
13620 return true;
13621 } else if (prefix == "osd pool application enable" ||
13622 prefix == "osd pool application disable" ||
13623 prefix == "osd pool application set" ||
13624 prefix == "osd pool application rm") {
13625 err = prepare_command_pool_application(prefix, cmdmap, ss);
13626 if (err == -EAGAIN) {
13627 goto wait;
13628 } else if (err < 0) {
13629 goto reply;
13630 } else {
13631 goto update;
13632 }
13633 } else if (prefix == "osd force-create-pg") {
13634 pg_t pgid;
13635 string pgidstr;
13636 cmd_getval(cmdmap, "pgid", pgidstr);
13637 if (!pgid.parse(pgidstr.c_str())) {
13638 ss << "invalid pgid '" << pgidstr << "'";
13639 err = -EINVAL;
13640 goto reply;
13641 }
13642 if (!osdmap.pg_exists(pgid)) {
13643 ss << "pg " << pgid << " should not exist";
13644 err = -ENOENT;
13645 goto reply;
13646 }
13647 bool sure = false;
13648 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13649 if (!sure) {
13650 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13651 << "that the cluster will give up ever trying to recover the lost data. Do this "
13652 << "only if you are certain that all copies of the PG are in fact lost and you are "
13653 << "willing to accept that the data is permanently destroyed. Pass "
13654 << "--yes-i-really-mean-it to proceed.";
13655 err = -EPERM;
13656 goto reply;
13657 }
13658 bool creating_now;
13659 {
13660 std::lock_guard<std::mutex> l(creating_pgs_lock);
13661 auto emplaced = creating_pgs.pgs.emplace(
13662 pgid,
13663 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13664 ceph_clock_now()));
13665 creating_now = emplaced.second;
13666 }
13667 if (creating_now) {
13668 ss << "pg " << pgidstr << " now creating, ok";
13669 // set the pool's CREATING flag so that (1) the osd won't ignore our
13670 // create message and (2) we won't propose any future pg_num changes
13671 // until after the PG has been instantiated.
13672 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13673 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13674 }
13675 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13676 err = 0;
13677 goto update;
13678 } else {
13679 ss << "pg " << pgid << " already creating";
13680 err = 0;
13681 goto reply;
13682 }
13683 } else if (prefix == "osd force_healthy_stretch_mode") {
13684 bool sure = false;
13685 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13686 if (!sure) {
13687 ss << "This command will require peering across multiple CRUSH buckets "
13688 "(probably two data centers or availability zones?) and may result in PGs "
13689 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13690 err = -EPERM;
13691 goto reply;
13692 }
13693 try_end_recovery_stretch_mode(true);
13694 ss << "Triggering healthy stretch mode";
13695 err = 0;
13696 goto reply;
13697 } else if (prefix == "osd force_recovery_stretch_mode") {
13698 bool sure = false;
13699 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13700 if (!sure) {
13701 ss << "This command will increase pool sizes to try and spread them "
13702 "across multiple CRUSH buckets (probably two data centers or "
13703 "availability zones?) and should have happened automatically"
13704 "Pass --yes-i-really-mean-it to proceed.";
13705 err = -EPERM;
13706 goto reply;
13707 }
13708 mon.go_recovery_stretch_mode();
13709 ss << "Triggering recovery stretch mode";
13710 err = 0;
13711 goto reply;
13712 } else {
13713 err = -EINVAL;
13714 }
13715
13716 reply:
13717 getline(ss, rs);
13718 if (err < 0 && rs.length() == 0)
13719 rs = cpp_strerror(err);
13720 mon.reply_command(op, err, rs, rdata, get_last_committed());
13721 return ret;
13722
13723 update:
13724 getline(ss, rs);
13725 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13726 get_last_committed() + 1));
13727 return true;
13728
13729 wait:
13730 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13731 return true;
13732 }
13733
13734 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13735 {
13736 op->mark_osdmon_event(__func__);
13737
13738 auto m = op->get_req<MPoolOp>();
13739 MonSession *session = op->get_session();
13740 if (!session) {
13741 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13742 return true;
13743 }
13744
13745 switch (m->op) {
13746 case POOL_OP_CREATE_UNMANAGED_SNAP:
13747 case POOL_OP_DELETE_UNMANAGED_SNAP:
13748 {
13749 const std::string* pool_name = nullptr;
13750 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13751 if (pg_pool != nullptr) {
13752 pool_name = &osdmap.get_pool_name(m->pool);
13753 }
13754
13755 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
13756 session->entity_name, session->caps,
13757 session->get_peer_socket_addr(),
13758 pool_name)) {
13759 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13760 << "privileges. message: " << *m << std::endl
13761 << "caps: " << session->caps << dendl;
13762 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13763 return true;
13764 }
13765 }
13766 break;
13767 default:
13768 if (!session->is_capable("osd", MON_CAP_W)) {
13769 dout(0) << "got pool op from entity with insufficient privileges. "
13770 << "message: " << *m << std::endl
13771 << "caps: " << session->caps << dendl;
13772 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13773 return true;
13774 }
13775 break;
13776 }
13777
13778 return false;
13779 }
13780
/**
 * Read-only fast path for MPoolOp messages.
 *
 * Answers ops that need no map change (idempotent repeats, invalid
 * combinations) directly from the committed osdmap.  Returns true when a
 * reply was sent here and the op is finished; false to hand the op on to
 * prepare_pool_op() for a map update.
 */
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // a true return means the caps check already sent a -EPERM reply
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop cross-cluster traffic
  if (m->fsid != mon.monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  // pool creation has its own idempotency check
  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting a pool that is already gone succeeds (idempotent)
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps and unmanaged (self-managed) snaps are mutually
    // exclusive; cache tiers cannot take pool snaps at all
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already there -- idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone -- idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // already removed/purged -- idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with this *name*
    // still exists, which reads inverted relative to the other
    // idempotency checks above -- confirm against MPoolOp delete
    // semantics before relying on it.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // always forwarded to prepare, which rejects it with -EOPNOTSUPP
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13868
13869 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13870 {
13871 if (!osdmap.have_pg_pool(pool)) {
13872 dout(10) << __func__ << " pool " << pool << " snap " << snap
13873 << " - pool dne" << dendl;
13874 return true;
13875 }
13876 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13877 dout(10) << __func__ << " pool " << pool << " snap " << snap
13878 << " - in osdmap removed_snaps_queue" << dendl;
13879 return true;
13880 }
13881 snapid_t begin, end;
13882 int r = lookup_purged_snap(pool, snap, &begin, &end);
13883 if (r == 0) {
13884 dout(10) << __func__ << " pool " << pool << " snap " << snap
13885 << " - purged, [" << begin << "," << end << ")" << dendl;
13886 return true;
13887 }
13888 return false;
13889 }
13890
13891 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13892 {
13893 if (pending_inc.old_pools.count(pool)) {
13894 dout(10) << __func__ << " pool " << pool << " snap " << snap
13895 << " - pool pending deletion" << dendl;
13896 return true;
13897 }
13898 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13899 dout(10) << __func__ << " pool " << pool << " snap " << snap
13900 << " - in pending new_removed_snaps" << dendl;
13901 return true;
13902 }
13903 return false;
13904 }
13905
13906 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13907 {
13908 op->mark_osdmon_event(__func__);
13909 auto m = op->get_req<MPoolOp>();
13910 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13911 if (pool >= 0) {
13912 _pool_op_reply(op, 0, osdmap.get_epoch());
13913 return true;
13914 }
13915
13916 return false;
13917 }
13918
/**
 * Apply a pool op to the pending map.
 *
 * Returns true when a map change was (or may be) proposed and the reply is
 * deferred until the proposal commits; false when the op was rejected (or
 * trivially satisfied) and a reply has already been sent.
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  // create and delete have their own dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  // pool may have vanished between preprocess and prepare (map churn)
  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // first, re-run the idempotency/validity checks against the *committed*
  // pool state (preprocess may have run on an older map)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create of an existing snap / delete of a missing snap is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool-snap ops are invalid on a pool in unmanaged snaps mode
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending copy if one exists, so we
  // layer this change on top of any not-yet-committed edits
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check against
  // the *projected* state, which may differ from the committed one
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // now apply the actual change to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      // snap_exists() returns the snapid (0 if absent)
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus maps track removed snaps differently; tell the pool
      // which encoding regime it is under
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is returned to the client in the reply payload
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's current seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; reject outright
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool into the pending map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with any payload, e.g. a new snapid) once the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14073
14074 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14075 {
14076 op->mark_osdmon_event(__func__);
14077 int err = prepare_new_pool(op);
14078 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14079 return true;
14080 }
14081
14082 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14083 ostream *ss)
14084 {
14085 const string& poolstr = osdmap.get_pool_name(pool_id);
14086
14087 // If the Pool is in use by CephFS, refuse to delete it
14088 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14089 if (pending_fsmap.pool_in_use(pool_id)) {
14090 *ss << "pool '" << poolstr << "' is in use by CephFS";
14091 return -EBUSY;
14092 }
14093
14094 if (pool.tier_of >= 0) {
14095 *ss << "pool '" << poolstr << "' is a tier of '"
14096 << osdmap.get_pool_name(pool.tier_of) << "'";
14097 return -EBUSY;
14098 }
14099 if (!pool.tiers.empty()) {
14100 *ss << "pool '" << poolstr << "' has tiers";
14101 for(auto tier : pool.tiers) {
14102 *ss << " " << osdmap.get_pool_name(tier);
14103 }
14104 return -EBUSY;
14105 }
14106
14107 if (!g_conf()->mon_allow_pool_delete) {
14108 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14109 return -EPERM;
14110 }
14111
14112 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14113 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14114 return -EPERM;
14115 }
14116
14117 *ss << "pool '" << poolstr << "' removed";
14118 return 0;
14119 }
14120
14121 /**
14122 * Check if it is safe to add a tier to a base pool
14123 *
14124 * @return
14125 * True if the operation should proceed, false if we should abort here
14126 * (abort doesn't necessarily mean error, could be idempotency)
14127 */
14128 bool OSDMonitor::_check_become_tier(
14129 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14130 const int64_t base_pool_id, const pg_pool_t *base_pool,
14131 int *err,
14132 ostream *ss) const
14133 {
14134 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14135 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14136
14137 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14138 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14139 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14140 *err = -EBUSY;
14141 return false;
14142 }
14143
14144 if (base_pool->tiers.count(tier_pool_id)) {
14145 ceph_assert(tier_pool->tier_of == base_pool_id);
14146 *err = 0;
14147 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14148 << base_pool_name << "'";
14149 return false;
14150 }
14151
14152 if (base_pool->is_tier()) {
14153 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14154 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14155 << "multiple tiers are not yet supported.";
14156 *err = -EINVAL;
14157 return false;
14158 }
14159
14160 if (tier_pool->has_tiers()) {
14161 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14162 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14163 it != tier_pool->tiers.end(); ++it)
14164 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14165 *ss << " multiple tiers are not yet supported.";
14166 *err = -EINVAL;
14167 return false;
14168 }
14169
14170 if (tier_pool->is_tier()) {
14171 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14172 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14173 *err = -EINVAL;
14174 return false;
14175 }
14176
14177 *err = 0;
14178 return true;
14179 }
14180
14181
14182 /**
14183 * Check if it is safe to remove a tier from this base pool
14184 *
14185 * @return
14186 * True if the operation should proceed, false if we should abort here
14187 * (abort doesn't necessarily mean error, could be idempotency)
14188 */
14189 bool OSDMonitor::_check_remove_tier(
14190 const int64_t base_pool_id, const pg_pool_t *base_pool,
14191 const pg_pool_t *tier_pool,
14192 int *err, ostream *ss) const
14193 {
14194 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14195
14196 // Apply CephFS-specific checks
14197 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14198 if (pending_fsmap.pool_in_use(base_pool_id)) {
14199 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14200 // If the underlying pool is erasure coded and does not allow EC
14201 // overwrites, we can't permit the removal of the replicated tier that
14202 // CephFS relies on to access it
14203 *ss << "pool '" << base_pool_name <<
14204 "' does not allow EC overwrites and is in use by CephFS"
14205 " via its tier";
14206 *err = -EBUSY;
14207 return false;
14208 }
14209
14210 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14211 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14212 "tier is still in use as a writeback cache. Change the cache "
14213 "mode and flush the cache before removing it";
14214 *err = -EBUSY;
14215 return false;
14216 }
14217 }
14218
14219 *err = 0;
14220 return true;
14221 }
14222
/**
 * Stage removal of a pool into the pending incremental map.
 *
 * Also scrubs every derived per-PG mapping (pg_temp, primary_temp,
 * pg_upmap, pg_upmap_items and their pending counterparts) and any CRUSH
 * choose_args that reference the pool.  With mon_fake_pool_delete set (and
 * no_fake false) the pool is merely renamed to *.DELETED instead.
 *
 * Returns 0 on success, -EAGAIN if the pending state must settle first,
 * or a negative errno from _check_remove_pool().
 */
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // validate against the committed pool state first
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // idempotent: removal already staged in this proposal
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // fake deletion: rename instead of removing, to allow recovery
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      // an empty vector in new_pg_temp clears the mapping
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      // -1 clears the primary_temp mapping
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush = _get_pending_crush();
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    // re-encode the whole crush map into the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14337
14338 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14339 {
14340 dout(10) << "_prepare_rename_pool " << pool << dendl;
14341 if (pending_inc.old_pools.count(pool)) {
14342 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14343 return -ENOENT;
14344 }
14345 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14346 p != pending_inc.new_pool_names.end();
14347 ++p) {
14348 if (p->second == newname && p->first != pool) {
14349 return -EEXIST;
14350 }
14351 }
14352
14353 pending_inc.new_pool_names[pool] = newname;
14354 return 0;
14355 }
14356
14357 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14358 {
14359 op->mark_osdmon_event(__func__);
14360 auto m = op->get_req<MPoolOp>();
14361 ostringstream ss;
14362 int ret = _prepare_remove_pool(m->pool, &ss, false);
14363 if (ret == -EAGAIN) {
14364 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14365 return true;
14366 }
14367 if (ret < 0)
14368 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14369 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14370 pending_inc.epoch));
14371 return true;
14372 }
14373
14374 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14375 int ret, epoch_t epoch, bufferlist *blp)
14376 {
14377 op->mark_osdmon_event(__func__);
14378 auto m = op->get_req<MPoolOp>();
14379 dout(20) << "_pool_op_reply " << ret << dendl;
14380 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14381 ret, epoch, get_last_committed(), blp);
14382 mon.send_reply(op, reply);
14383 }
14384
14385 void OSDMonitor::convert_pool_priorities(void)
14386 {
14387 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14388 int64_t max_prio = 0;
14389 int64_t min_prio = 0;
14390 for (const auto &i : osdmap.get_pools()) {
14391 const auto &pool = i.second;
14392
14393 if (pool.opts.is_set(key)) {
14394 int64_t prio = 0;
14395 pool.opts.get(key, &prio);
14396 if (prio > max_prio)
14397 max_prio = prio;
14398 if (prio < min_prio)
14399 min_prio = prio;
14400 }
14401 }
14402 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14403 dout(20) << __func__ << " nothing to fix" << dendl;
14404 return;
14405 }
14406 // Current pool priorities exceeds new maximum
14407 for (const auto &i : osdmap.get_pools()) {
14408 const auto pool_id = i.first;
14409 pg_pool_t pool = i.second;
14410
14411 int64_t prio = 0;
14412 pool.opts.get(key, &prio);
14413 int64_t n;
14414
14415 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14416 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14417 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14418 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14419 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14420 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14421 } else {
14422 continue;
14423 }
14424 if (n == 0) {
14425 pool.opts.unset(key);
14426 } else {
14427 pool.opts.set(key, static_cast<int64_t>(n));
14428 }
14429 dout(10) << __func__ << " pool " << pool_id
14430 << " recovery_priority adjusted "
14431 << prio << " to " << n << dendl;
14432 pool.last_change = pending_inc.epoch;
14433 pending_inc.new_pools[pool_id] = pool;
14434 }
14435 }
14436
14437 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14438 int *errcode,
14439 set<pg_pool_t*>* pools,
14440 const string& new_crush_rule)
14441 {
14442 dout(20) << __func__ << dendl;
14443 *okay = false;
14444 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14445 if (new_crush_rule_result < 0) {
14446 ss << "unrecognized crush rule " << new_crush_rule_result;
14447 *errcode = new_crush_rule_result;
14448 return;
14449 }
14450 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14451 for (const auto& pooli : osdmap.pools) {
14452 int64_t poolid = pooli.first;
14453 const pg_pool_t *p = &pooli.second;
14454 if (!p->is_replicated()) {
14455 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14456 *errcode = -EINVAL;
14457 return;
14458 }
14459 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14460 if ((p->get_size() != default_size ||
14461 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14462 (p->get_crush_rule() != new_rule)) {
14463 ss << "we currently require stretch mode pools start out with the"
14464 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14465 *errcode = -EINVAL;
14466 return;
14467 }
14468 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14469 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14470 // the attempt may fail and then we have these pool updates...but they won't do anything
14471 // if there is a failure, so if it's hard to change the interface, no need to bother
14472 pools->insert(pp);
14473 }
14474 *okay = true;
14475 return;
14476 }
14477
14478 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14479 int *errcode, bool commit,
14480 const string& dividing_bucket,
14481 uint32_t bucket_count,
14482 const set<pg_pool_t*>& pools,
14483 const string& new_crush_rule)
14484 {
14485 dout(20) << __func__ << dendl;
14486 *okay = false;
14487 CrushWrapper crush = _get_pending_crush();
14488 int dividing_id = -1;
14489 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14490 !type_id.has_value()) {
14491 ss << dividing_bucket << " is not a valid crush bucket type";
14492 *errcode = -ENOENT;
14493 ceph_assert(!commit);
14494 return;
14495 } else {
14496 dividing_id = *type_id;
14497 }
14498 vector<int> subtrees;
14499 crush.get_subtree_of_type(dividing_id, &subtrees);
14500 if (subtrees.size() != 2) {
14501 ss << "there are " << subtrees.size() << dividing_bucket
14502 << "'s in the cluster but stretch mode currently only works with 2!";
14503 *errcode = -EINVAL;
14504 ceph_assert(!commit || subtrees.size() == 2);
14505 return;
14506 }
14507
14508 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14509 if (new_crush_rule_result < 0) {
14510 ss << "unrecognized crush rule " << new_crush_rule;
14511 *errcode = new_crush_rule_result;
14512 ceph_assert(!commit || (new_crush_rule_result > 0));
14513 return;
14514 }
14515 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14516
14517 int weight1 = crush.get_item_weight(subtrees[0]);
14518 int weight2 = crush.get_item_weight(subtrees[1]);
14519 if (weight1 != weight2) {
14520 // TODO: I'm really not sure this is a good idea?
14521 ss << "the 2 " << dividing_bucket
14522 << "instances in the cluster have differing weights "
14523 << weight1 << " and " << weight2
14524 <<" but stretch mode currently requires they be the same!";
14525 *errcode = -EINVAL;
14526 ceph_assert(!commit || (weight1 == weight2));
14527 return;
14528 }
14529 if (bucket_count != 2) {
14530 ss << "currently we only support 2-site stretch clusters!";
14531 *errcode = -EINVAL;
14532 ceph_assert(!commit || bucket_count == 2);
14533 return;
14534 }
14535 // TODO: check CRUSH rules for pools so that we are appropriately divided
14536 if (commit) {
14537 for (auto pool : pools) {
14538 pool->crush_rule = new_rule;
14539 pool->peering_crush_bucket_count = bucket_count;
14540 pool->peering_crush_bucket_target = bucket_count;
14541 pool->peering_crush_bucket_barrier = dividing_id;
14542 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14543 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14544 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14545 }
14546 pending_inc.change_stretch_mode = true;
14547 pending_inc.stretch_mode_enabled = true;
14548 pending_inc.new_stretch_bucket_count = bucket_count;
14549 pending_inc.new_degraded_stretch_mode = 0;
14550 pending_inc.new_stretch_mode_bucket = dividing_id;
14551 }
14552 *okay = true;
14553 return;
14554 }
14555
14556 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14557 set<int> *really_down_buckets,
14558 set<string> *really_down_mons)
14559 {
14560 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14561 ceph_assert(is_readable());
14562 if (dead_buckets.empty()) return false;
14563 set<int> down_cache;
14564 bool really_down = false;
14565 for (auto dbi : dead_buckets) {
14566 const string& bucket_name = dbi.first;
14567 ceph_assert(osdmap.crush->name_exists(bucket_name));
14568 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14569 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14570 << " to see if OSDs are also down" << dendl;
14571 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14572 if (subtree_down) {
14573 dout(20) << "subtree is down!" << dendl;
14574 really_down = true;
14575 really_down_buckets->insert(bucket_id);
14576 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14577 }
14578 }
14579 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14580 << " and mons " << *really_down_mons << " are really down" << dendl;
14581 return really_down;
14582 }
14583
14584 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14585 const set<string>& live_zones)
14586 {
14587 dout(20) << __func__ << dendl;
14588 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14589 // update the general OSDMap changes
14590 pending_inc.change_stretch_mode = true;
14591 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14592 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14593 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14594 ceph_assert(new_site_count == 1); // stretch count 2!
14595 pending_inc.new_degraded_stretch_mode = new_site_count;
14596 pending_inc.new_recovering_stretch_mode = 0;
14597 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14598
14599 // and then apply them to all the pg_pool_ts
14600 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14601 const string& remaining_site_name = *(live_zones.begin());
14602 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14603 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14604 for (auto pgi : osdmap.pools) {
14605 if (pgi.second.peering_crush_bucket_count) {
14606 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14607 newp.peering_crush_bucket_count = new_site_count;
14608 newp.peering_crush_mandatory_member = remaining_site;
14609 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14610 newp.last_force_op_resend = pending_inc.epoch;
14611 }
14612 }
14613 propose_pending();
14614 }
14615
14616 void OSDMonitor::trigger_recovery_stretch_mode()
14617 {
14618 dout(20) << __func__ << dendl;
14619 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14620 pending_inc.change_stretch_mode = true;
14621 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14622 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14623 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14624 pending_inc.new_recovering_stretch_mode = 1;
14625 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14626
14627 for (auto pgi : osdmap.pools) {
14628 if (pgi.second.peering_crush_bucket_count) {
14629 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14630 newp.last_force_op_resend = pending_inc.epoch;
14631 }
14632 }
14633 propose_pending();
14634 }
14635
// Record that we are now in degraded stretch mode by clearing the recovery
// timestamp -- time spent degraded must not count toward exiting recovery.
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14640
14641 void OSDMonitor::set_recovery_stretch_mode()
14642 {
14643 if (stretch_recovery_triggered.is_zero()) {
14644 stretch_recovery_triggered = ceph_clock_now();
14645 }
14646 }
14647
// Record that we are fully healthy again: clear the recovery timestamp so
// notify_new_pg_digest() stops trying to end recovery stretch mode.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14652
14653 void OSDMonitor::notify_new_pg_digest()
14654 {
14655 dout(20) << __func__ << dendl;
14656 if (!stretch_recovery_triggered.is_zero()) {
14657 try_end_recovery_stretch_mode(false);
14658 }
14659 }
14660
14661 struct CMonExitRecovery : public Context {
14662 OSDMonitor *m;
14663 bool force;
14664 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14665 void finish(int r) {
14666 m->try_end_recovery_stretch_mode(force);
14667 }
14668 };
14669
// Attempt to leave recovering stretch mode and return to healthy stretch
// mode.  Invoked from notify_new_pg_digest() on new PG stats and re-queued
// via CMonExitRecovery whenever required state is not yet readable.
//
// force: skip both the minimum-wait and the PG-health checks and
// transition immediately.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  // only the leader proposes map changes, and only the
  // degraded+recovering combination can transition to healthy
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // our own state isn't readable; retry (with the same force flag)
    // once it is
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // proceed only if the OSDMap agrees we are recovering AND either the
  // minimum wait (mon_stretch_recovery_min_wait) has elapsed since the
  // recovery timestamp was stamped, or the caller is forcing it
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // PG digest not readable yet; retry when it becomes so
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced data is tolerated; degraded/inactive/unknown PGs block us
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
14699
14700 void OSDMonitor::trigger_healthy_stretch_mode()
14701 {
14702 ceph_assert(is_writeable());
14703 stretch_recovery_triggered.set_from_double(0);
14704 pending_inc.change_stretch_mode = true;
14705 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14706 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14707 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
14708 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
14709 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14710 for (auto pgi : osdmap.pools) {
14711 if (pgi.second.peering_crush_bucket_count) {
14712 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14713 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
14714 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14715 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14716 newp.last_force_op_resend = pending_inc.epoch;
14717 }
14718 }
14719 propose_pending();
14720 }