]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
c4707101b69fa3a4ead156a8cff9d18da82f0875
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 using std::dec;
95 using std::hex;
96 using std::list;
97 using std::map;
98 using std::make_pair;
99 using std::ostringstream;
100 using std::pair;
101 using std::set;
102 using std::string;
103 using std::stringstream;
104 using std::to_string;
105 using std::vector;
106
107 using ceph::bufferlist;
108 using ceph::decode;
109 using ceph::encode;
110 using ceph::ErasureCodeInterfaceRef;
111 using ceph::ErasureCodePluginRegistry;
112 using ceph::ErasureCodeProfile;
113 using ceph::Formatter;
114 using ceph::JSONFormatter;
115 using ceph::make_message;
116
117 #define dout_subsys ceph_subsys_mon
118 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
119 static const string OSD_METADATA_PREFIX("osd_metadata");
120 static const string OSD_SNAP_PREFIX("osd_snap");
121
122 /*
123
124 OSD snapshot metadata
125 ---------------------
126
127 -- starting with mimic, removed in octopus --
128
129 "removed_epoch_%llu_%08lx" % (pool, epoch)
130 -> interval_set<snapid_t>
131
132 "removed_snap_%llu_%016llx" % (pool, last_snap)
133 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
134
135
136 -- starting with mimic --
137
138 "purged_snap_%llu_%016llx" % (pool, last_snap)
139 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
140
141 - note that the {removed,purged}_snap put the last snap in they key so
142 that we can use forward iteration only to search for an epoch in an
143 interval. e.g., to test if epoch N is removed/purged, we'll find a key
144 >= N that either does or doesn't contain the given snap.
145
146
147 -- starting with octopus --
148
149 "purged_epoch_%08lx" % epoch
150 -> map<int64_t,interval_set<snapid_t>>
151
152 */
153 using namespace TOPNSPC::common;
154 namespace {
155
// Adapter exposing an OSDMonitor-owned osdmap LRU cache through the
// PriorityCache::PriCache interface, so the priority cache manager (pcm)
// can size it alongside the rocksdb block cache.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;   // back-pointer to the owning monitor (not owned)
  // bytes assigned to this cache, per priority level; only PRI1 is used today
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;   // size last committed via commit_cache_size()
  double cache_ratio = 0;        // fraction of the total cache given to us

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes actually held by the underlying LRU cache (defined by subclasses).
  virtual uint64_t _get_used_bytes() const = 0;

  // Request enough additional bytes at priority `pri` to cover current usage.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // only ask for the shortfall; never request below what's assigned
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    // other priority levels are not supported
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assignments across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the current total assignment to a pcm chunk and remember it.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Human-readable name used in pcm logging/reporting.
  virtual string get_cache_name() const = 0;
};
219
220 struct IncCache : public OSDMemCache {
221 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
222
223 virtual uint64_t _get_used_bytes() const {
224 return osdmon->inc_osd_cache.get_bytes();
225 }
226
227 virtual string get_cache_name() const {
228 return "OSDMap Inc Cache";
229 }
230
231 uint64_t _get_num_osdmaps() const {
232 return osdmon->inc_osd_cache.get_size();
233 }
234 };
235
236 struct FullCache : public OSDMemCache {
237 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
238
239 virtual uint64_t _get_used_bytes() const {
240 return osdmon->full_osd_cache.get_bytes();
241 }
242
243 virtual string get_cache_name() const {
244 return "OSDMap Full Cache";
245 }
246
247 uint64_t _get_num_osdmaps() const {
248 return osdmon->full_osd_cache.get_size();
249 }
250 };
251
// File-scope cache shims registered with the priority cache manager
// by OSDMonitor::register_cache_with_pcm().
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Upper bounds on per-pool application metadata, enforced when handling
// "osd pool application" commands, to keep the osdmap bounded in size.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
258
259 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
260 // Note: this doesn't include support for the application tag match
261 if ((grant.spec.allow & OSD_CAP_W) != 0) {
262 auto& match = grant.match;
263 if (match.is_match_all()) {
264 return true;
265 } else if (pool_name != nullptr &&
266 !match.pool_namespace.pool_name.empty() &&
267 match.pool_namespace.pool_name == *pool_name) {
268 return true;
269 }
270 }
271 return false;
272 }
273
// Decide whether `entity_name` may perform unmanaged-snapshot pool ops.
// Permission is granted if either (a) its mon caps allow the
// "osd pool op unmanaged-snap" command (restricted to the pool when one
// is given), or (b) its OSD caps give it write access to the pool (or to
// all pools). Returns false on any failure to locate/decode/parse caps.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // First check the mon capability; with no pool name we demand an
  // unrestricted cap since we cannot scope the check.
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
           CommandArgs{} /* pool DNE, require unrestricted cap */ :
           CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // Fall back to inspecting the entity's OSD caps from the auth database.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile grants expand to a list of concrete grants; check each
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
341
342 } // anonymous namespace
343
// Record that PG `ps` of this pool was clean as of `last_epoch_clean`,
// and maintain two derived values: `floor` (the minimum lec across all
// reported PGs) and `next_missing` (the first PG index that has never
// reported; 0 in epoch_by_pg means "never reported").
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  // grow the per-PG vector on demand, new slots default to 0 (= missing)
  if (epoch_by_pg.size() <= ps) {
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down; rescan
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this was the first unreported PG: advance next_missing past every
  // PG that has now reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
374
// Drop all last-epoch-clean tracking state for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
379
380 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
381 {
382 auto& lec = report_by_pool[pg.pool()];
383 return lec.report(pg.ps(), last_epoch_clean);
384 }
385
386 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
387 {
388 auto floor = latest.get_epoch();
389 for (auto& pool : latest.get_pools()) {
390 auto reported = report_by_pool.find(pool.first);
391 if (reported == report_by_pool.end()) {
392 return 0;
393 }
394 if (reported->second.next_missing < pool.second.get_pg_num()) {
395 return 0;
396 }
397 if (reported->second.floor < floor) {
398 floor = reported->second.floor;
399 }
400 }
401 return floor;
402 }
403
404 void LastEpochClean::dump(Formatter *f) const
405 {
406 f->open_array_section("per_pool");
407
408 for (auto& [pool, lec] : report_by_pool) {
409 f->open_object_section("pool");
410 f->dump_unsigned("poolid", pool);
411 f->dump_unsigned("floor", lec.floor);
412 f->close_section();
413 }
414
415 f->close_section();
416 }
417
// Completion callback fired when an osdmap mapping job finishes; on
// success it refreshes the creating-PGs state and pokes subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;   // owning monitor (not owned by this context)
  utime_t start;        // when the mapping job was kicked off, for timing
  epoch_t epoch;        // the osdmap epoch the job was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted/canceled; do nothing in that case
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
435
// Debug-log prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch>".
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon.name << "@" << mon.rank
                << "(" << mon.get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
443
// Construct the OSD monitor service: size the inc/full osdmap LRU caches,
// register the priority-cache shims, and subscribe to config changes.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  // file-scope shims used later by register_cache_with_pcm()
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  // watch for runtime changes to the keys in get_tracked_conf_keys()
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // non-fatal: fall back to fixed-size caches without pcm autotuning
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
467
468 const char **OSDMonitor::get_tracked_conf_keys() const
469 {
470 static const char* KEYS[] = {
471 "mon_memory_target",
472 "mon_memory_autotune",
473 "rocksdb_cache_size",
474 NULL
475 };
476 return KEYS;
477 }
478
// React to runtime changes of the tracked config keys: toggle cache
// autotuning and/or resize the mon/rocksdb cache budgets.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      // keep running with the previous sizes; just log the failure
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
500
// Enable or disable priority-cache autotuning to match the current value
// of mon_memory_autotune.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    // NOTE(review): this drops the manager but leaves mon_memory_autotune
    // (the member flag) as-is — presumably the periodic tuner checks pcm
    // directly; confirm before relying on the flag here.
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      // registration failed; continue without autotuning
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
522
// Apply new mon_memory_target / rocksdb_cache_size values: recompute the
// kv/inc/full cache ratios and, when autotuning, push the new min/max/
// target budgets into the priority cache manager and rebalance.
// Returns 0 on success, -EINVAL on invalid sizes or ratio failure
// (in which case the previous sizes are restored).
int OSDMonitor::_update_mon_cache_settings()
{
  // reject nonsensical sizes up front
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  // nothing to update against if neither pcm nor the kv cache exists
  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // discount expected fragmentation, then reserve the base footprint;
  // whatever remains becomes the cache maximum
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    // roll back to the previous, known-good sizes
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
            << " target: " << target
            << " min: " << min
            << " max: " << max
            << dendl;
  }
  return 0;
}
582
// Initialize cache-size members from config. Only does work when
// mon_memory_autotune is enabled; returns -EINVAL if the configured
// target/min sizes are invalid, 0 otherwise.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the OSD's base/fragmentation tunables for the mon's budget
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
607
// True if a new crush map has been staged in the pending incremental.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
612
// The crush map of the current committed osdmap (ignores pending changes).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
617
618 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
619 {
620 bufferlist bl;
621 if (pending_inc.crush.length())
622 bl = pending_inc.crush;
623 else
624 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
625
626 auto p = bl.cbegin();
627 newcrush.decode(p);
628 }
629
// Build the very first osdmap (epoch 1) for a brand-new cluster and stage
// it as pending_inc.fullmap so it gets committed by the normal paxos path.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  // an osdmap may have been pre-seeded at mkfs time; use it if present
  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  // feature flags that are mandatory on fresh clusters
  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  // (the mon_debug_no_require_* options exist for upgrade/interop testing)
  if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
      derr << __func__ << " mon_debug_no_require_pacific and octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    } else {
      derr << __func__ << " mon_debug_no_require_pacific=true" << dendl;
      newmap.require_osd_release = ceph_release_t::octopus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::pacific;
  }

  if (newmap.require_osd_release >= ceph_release_t::octopus) {
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
691
692 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
693 {
694 s.insert(service_name);
695 s.insert(OSD_PG_CREATING_PREFIX);
696 s.insert(OSD_METADATA_PREFIX);
697 s.insert(OSD_SNAP_PREFIX);
698 }
699
// Bring the in-memory osdmap up to the latest paxos-committed epoch:
// load the newest full map on disk, then apply each committed incremental
// in order, writing back full maps and maintaining derived state
// (creating_pgs, down_pending_out, stretch mode, subscriptions, ...).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // remembered so the stretch-mode logic below can detect OSDs coming up
  int prev_num_up_osd = osdmap.num_up_osd;

  // any in-flight mapping job was computed against the old epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first full map actually on disk
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // jump straight to the newest full map if it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // load the persistent creating-PGs state (absent on very old clusters)
  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;   // bytes accumulated in t; flushed when too large
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // NOTE: the dout(20)/dendl pairs each open and close a macro
        // scope, so the two JSONFormatter `jf` locals do not collide.
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is no longer needed once epoch 1 is committed
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long catch-up doesn't build a huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd);
      }
      if (state & CEPH_OSD_OUT) {
        // could be marked in *or* out, but we can safely drop it
        osd_epochs.erase(osd);
      }
    }
    for (const auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
        // manually marked out, so drop it
        osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // reconcile down_pending_out (the down->auto-out countdown) with the map
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
        marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.maybe_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
        mon.set_degraded_stretch_mode();
        // enough OSDs back up? then start stretch-mode recovery
        if (prev_num_up_osd < osdmap.num_up_osd &&
            (osdmap.num_up_osd / (double)osdmap.num_osd) >
            cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
          // TODO: This works for 2-site clusters when the OSD maps are appropriately
          // trimmed and everything is "normal" but not if you have a lot of out OSDs
          // you're ignoring or in some really degenerate failure cases
          dout(10) << "Enabling recovery stretch mode in this map" << dendl;
          mon.go_recovery_stretch_mode();
        }
      }
    }
    if (marked_osd_down &&
        (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
    if (osdmap.recovering_stretch_mode && stretch_recovery_triggered.is_zero()) {
      stretch_recovery_triggered = ceph_clock_now();
    }
  }
}
974
// Create the priority cache manager and register the kv (rocksdb), inc
// and full osdmap caches with it. Returns 0 on success, -EINVAL when
// sizes are invalid, rocksdb has no priority cache, or ratios can't be set.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  // register all three caches; pcm will balance memory between them
  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
1024
// Split the total cache budget between rocksdb and the two osdmap caches:
// kv gets rocksdb_cache_size/mon_memory_target, the remainder is divided
// equally between the inc and full caches. Returns -EINVAL (and restores
// the previous kv ratio) if the kv share would be >= 100%.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  // whatever the kv cache doesn't take is split evenly between inc/full
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
          << " inc ratio " << cache_inc_ratio
          << " full ratio " << cache_full_ratio
          << dendl;
  return 0;
}
1049
1050 void OSDMonitor::start_mapping()
1051 {
1052 // initiate mapping job
1053 if (mapping_job) {
1054 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1055 << dendl;
1056 mapping_job->abort();
1057 }
1058 if (!osdmap.get_pools().empty()) {
1059 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1060 mapping_job = mapping.start_update(osdmap, mapper,
1061 g_conf()->mon_osd_mapping_pgs_per_chunk);
1062 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1063 << " at " << fin->start << dendl;
1064 mapping_job->set_finish_event(fin);
1065 } else {
1066 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1067 mapping_job = nullptr;
1068 }
1069 }
1070
1071 void OSDMonitor::update_msgr_features()
1072 {
1073 const int types[] = {
1074 entity_name_t::TYPE_OSD,
1075 entity_name_t::TYPE_CLIENT,
1076 entity_name_t::TYPE_MDS,
1077 entity_name_t::TYPE_MON
1078 };
1079 for (int type : types) {
1080 uint64_t mask;
1081 uint64_t features = osdmap.get_features(type, &mask);
1082 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1083 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1084 ceph::net::Policy p = mon.messenger->get_policy(type);
1085 p.features_required = (p.features_required & ~mask) | features;
1086 mon.messenger->set_policy(type, p);
1087 }
1088 }
1089 }
1090
1091 void OSDMonitor::on_active()
1092 {
1093 update_logger();
1094
1095 if (mon.is_leader()) {
1096 mon.clog->debug() << "osdmap " << osdmap;
1097 if (!priority_convert) {
1098 // Only do this once at start-up
1099 convert_pool_priorities();
1100 priority_convert = true;
1101 }
1102 } else {
1103 list<MonOpRequestRef> ls;
1104 take_all_failures(ls);
1105 while (!ls.empty()) {
1106 MonOpRequestRef op = ls.front();
1107 op->mark_osdmon_event(__func__);
1108 dispatch(op);
1109 ls.pop_front();
1110 }
1111 }
1112 start_mapping();
1113 }
1114
// Called when the monitor restarts (e.g. a new election): forget when
// each OSD last reported, so report timeouts start fresh.
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
1119
1120 void OSDMonitor::on_shutdown()
1121 {
1122 dout(10) << __func__ << dendl;
1123 if (mapping_job) {
1124 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1125 << dendl;
1126 mapping_job->abort();
1127 }
1128
1129 // discard failure info, waiters
1130 list<MonOpRequestRef> ls;
1131 take_all_failures(ls);
1132 ls.clear();
1133 }
1134
1135 void OSDMonitor::update_logger()
1136 {
1137 dout(10) << "update_logger" << dendl;
1138
1139 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1140 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1141 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1142 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1143 }
1144
// Initialize pending_inc (the incremental for the next epoch) and the
// associated pending state, seed sane full-ratio defaults if the
// current map somehow lacks them, and rewrite legacy CRUSH "ruleset"
// IDs into positional rule IDs.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon.monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      // values > 1.0 are assumed to have been given as a percentage
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      // percentage -> fraction, as above
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      // percentage -> fraction, as above
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      // load the original pool info first so we don't clobber other fields
      if (pending_inc.new_pools.count(pool_id) == 0) {
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // re-encode the adjusted crush map into the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
}
1214
// Produce an updated creating_pgs_t snapshot for the pending
// incremental 'inc' applied on top of the current osdmap (yielding
// 'nextmap'): scan pools for new/deleted pgs, trim pgs already
// created, drop pgs that no longer exist, admit queued pgs up to
// mon_osd_max_creating_pgs, and (octopus+) advance each creating pg's
// history/past_intervals. Returns the snapshot; the caller persists it.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // copy under lock; creating_pgs is shared state
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools newly added by 'inc'
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue: admit queued pg ranges until we hit the cap on
  // concurrently-creating pgs.
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // take as many pg ids from this pool's range as the cap allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        // existing entry: detect an interval change and, if one
        // occurred, update the history stamps and remember the new
        // up/acting sets.
        std::stringstream debug;
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          // a pg_num change between osdmap and nextmap means this pg split
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1401
// Pre-populate pg_temp mappings in pending_inc so clients see usable
// acting sets immediately after the pending map commits. Depending on
// how widespread the change is, either remap everything ("all", via a
// parallel PrimeTempJob) or just the pgs touching the affected OSDs,
// in both cases bounded by mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    // a new crush map can move anything; consider every pg
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds that are currently up but marked down in the pending map
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs toward this osd from anywhere
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs on one affected osd times the number of
    // affected osds; if that approaches a configured fraction of all
    // pgs, a full pass is cheaper than per-osd work
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // materialize the post-commit map so we can compute future mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // parallel full pass with a hard time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // inline pass over just the pgs that touch the interesting osds
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          // already primed via another osd
          continue;
        }
        prime_pg_temp(next, pgid);
        // only check the clock every 'chunk' pgs to keep overhead low
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1504
// For one pg, compare its current acting set against what it will be
// in the post-commit map 'next'; if the mapping changes (and priming
// can't make things worse), record a pg_temp entry in pending_inc that
// keeps the current acting set serving I/O until recovery catches up.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    // still being created; nothing to preserve
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping, from the precomputed mapping cache
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending (post-commit) map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // empty pg_temp vector means "remove the pg_temp mapping"
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace is a no-op when pgid already has a pending pg_temp entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1552
1553 /**
1554 * @note receiving a transaction in this function gives a fair amount of
1555 * freedom to the service implementation if it does need it. It shouldn't.
1556 */
1557 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1558 {
1559 dout(10) << "encode_pending e " << pending_inc.epoch
1560 << dendl;
1561
1562 if (do_prune(t)) {
1563 dout(1) << __func__ << " osdmap full prune encoded e"
1564 << pending_inc.epoch << dendl;
1565 }
1566
1567 // finalize up pending_inc
1568 pending_inc.modified = ceph_clock_now();
1569
1570 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1571 ceph_assert(r == 0);
1572
1573 if (mapping_job) {
1574 if (!mapping_job->is_done()) {
1575 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1576 << mapping_job.get() << " did not complete, "
1577 << mapping_job->shards << " left" << dendl;
1578 mapping_job->abort();
1579 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1580 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1581 << mapping_job.get() << " is prior epoch "
1582 << mapping.get_epoch() << dendl;
1583 } else {
1584 if (g_conf()->mon_osd_prime_pg_temp) {
1585 maybe_prime_pg_temp();
1586 }
1587 }
1588 } else if (g_conf()->mon_osd_prime_pg_temp) {
1589 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1590 << dendl;
1591 }
1592 mapping_job.reset();
1593
1594 // ensure we don't have blank new_state updates. these are interrpeted as
1595 // CEPH_OSD_UP (and almost certainly not what we want!).
1596 auto p = pending_inc.new_state.begin();
1597 while (p != pending_inc.new_state.end()) {
1598 if (p->second == 0) {
1599 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1600 p = pending_inc.new_state.erase(p);
1601 } else {
1602 if (p->second & CEPH_OSD_UP) {
1603 pending_inc.new_last_up_change = pending_inc.modified;
1604 }
1605 ++p;
1606 }
1607 }
1608 if (!pending_inc.new_up_client.empty()) {
1609 pending_inc.new_last_up_change = pending_inc.modified;
1610 }
1611 for (auto& i : pending_inc.new_weight) {
1612 if (i.first >= osdmap.max_osd) {
1613 if (i.second) {
1614 // new osd is already marked in
1615 pending_inc.new_last_in_change = pending_inc.modified;
1616 break;
1617 }
1618 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1619 // existing osd marked in or out
1620 pending_inc.new_last_in_change = pending_inc.modified;
1621 break;
1622 }
1623 }
1624
1625 {
1626 OSDMap tmp;
1627 tmp.deepish_copy_from(osdmap);
1628 tmp.apply_incremental(pending_inc);
1629
1630 // clean pg_temp mappings
1631 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1632
1633 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1634 {
1635 // check every upmapped pg for now
1636 // until we could reliably identify certain cases to ignore,
1637 // which is obviously the hard part TBD..
1638 vector<pg_t> pgs_to_check;
1639 tmp.get_upmap_pgs(&pgs_to_check);
1640 if (pgs_to_check.size() <
1641 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1642 // not enough pgs, do it inline
1643 tmp.clean_pg_upmaps(cct, &pending_inc);
1644 } else {
1645 CleanUpmapJob job(cct, tmp, pending_inc);
1646 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1647 job.wait();
1648 }
1649 }
1650
1651 // update creating pgs first so that we can remove the created pgid and
1652 // process the pool flag removal below in the same osdmap epoch.
1653 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1654 bufferlist creatings_bl;
1655 uint64_t features = CEPH_FEATURES_ALL;
1656 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1657 dout(20) << __func__ << " encoding pending pgs without octopus features"
1658 << dendl;
1659 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1660 }
1661 encode(pending_creatings, creatings_bl, features);
1662 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1663
1664 // remove any old (or incompat) POOL_CREATING flags
1665 for (auto& i : tmp.get_pools()) {
1666 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1667 // pre-nautilus OSDMaps shouldn't get this flag.
1668 if (pending_inc.new_pools.count(i.first)) {
1669 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1670 }
1671 }
1672 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1673 !pending_creatings.still_creating_pool(i.first)) {
1674 dout(10) << __func__ << " done creating pool " << i.first
1675 << ", clearing CREATING flag" << dendl;
1676 if (pending_inc.new_pools.count(i.first) == 0) {
1677 pending_inc.new_pools[i.first] = i.second;
1678 }
1679 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1680 }
1681 }
1682
1683 // collect which pools are currently affected by
1684 // the near/backfill/full osd(s),
1685 // and set per-pool near/backfill/full flag instead
1686 set<int64_t> full_pool_ids;
1687 set<int64_t> backfillfull_pool_ids;
1688 set<int64_t> nearfull_pool_ids;
1689 tmp.get_full_pools(cct,
1690 &full_pool_ids,
1691 &backfillfull_pool_ids,
1692 &nearfull_pool_ids);
1693 if (full_pool_ids.empty() ||
1694 backfillfull_pool_ids.empty() ||
1695 nearfull_pool_ids.empty()) {
1696 // normal case - no nearfull, backfillfull or full osds
1697 // try cancel any improper nearfull/backfillfull/full pool
1698 // flags first
1699 for (auto &pool: tmp.get_pools()) {
1700 auto p = pool.first;
1701 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1702 nearfull_pool_ids.empty()) {
1703 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1704 << "'s nearfull flag" << dendl;
1705 if (pending_inc.new_pools.count(p) == 0) {
1706 // load original pool info first!
1707 pending_inc.new_pools[p] = pool.second;
1708 }
1709 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1710 }
1711 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1712 backfillfull_pool_ids.empty()) {
1713 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1714 << "'s backfillfull flag" << dendl;
1715 if (pending_inc.new_pools.count(p) == 0) {
1716 pending_inc.new_pools[p] = pool.second;
1717 }
1718 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1719 }
1720 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1721 full_pool_ids.empty()) {
1722 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1723 // set by EQUOTA, skipping
1724 continue;
1725 }
1726 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1727 << "'s full flag" << dendl;
1728 if (pending_inc.new_pools.count(p) == 0) {
1729 pending_inc.new_pools[p] = pool.second;
1730 }
1731 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1732 }
1733 }
1734 }
1735 if (!full_pool_ids.empty()) {
1736 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1737 << " as full" << dendl;
1738 for (auto &p: full_pool_ids) {
1739 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1740 continue;
1741 }
1742 if (pending_inc.new_pools.count(p) == 0) {
1743 pending_inc.new_pools[p] = tmp.pools[p];
1744 }
1745 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1746 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1748 }
1749 // cancel FLAG_FULL for pools which are no longer full too
1750 for (auto &pool: tmp.get_pools()) {
1751 auto p = pool.first;
1752 if (full_pool_ids.count(p)) {
1753 // skip pools we have just marked as full above
1754 continue;
1755 }
1756 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1757 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1758 // don't touch if currently is not full
1759 // or is running out of quota (and hence considered as full)
1760 continue;
1761 }
1762 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1763 << "'s full flag" << dendl;
1764 if (pending_inc.new_pools.count(p) == 0) {
1765 pending_inc.new_pools[p] = pool.second;
1766 }
1767 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1768 }
1769 }
1770 if (!backfillfull_pool_ids.empty()) {
1771 for (auto &p: backfillfull_pool_ids) {
1772 if (full_pool_ids.count(p)) {
1773 // skip pools we have already considered as full above
1774 continue;
1775 }
1776 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1777 // make sure FLAG_FULL is truly set, so we are safe not
1778 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1779 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1780 continue;
1781 }
1782 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1783 // don't bother if pool is already marked as backfillfull
1784 continue;
1785 }
1786 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1787 << "'s as backfillfull" << dendl;
1788 if (pending_inc.new_pools.count(p) == 0) {
1789 pending_inc.new_pools[p] = tmp.pools[p];
1790 }
1791 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1793 }
1794 // cancel FLAG_BACKFILLFULL for pools
1795 // which are no longer backfillfull too
1796 for (auto &pool: tmp.get_pools()) {
1797 auto p = pool.first;
1798 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1799 // skip pools we have just marked as backfillfull/full above
1800 continue;
1801 }
1802 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1803 // and don't touch if currently is not backfillfull
1804 continue;
1805 }
1806 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1807 << "'s backfillfull flag" << dendl;
1808 if (pending_inc.new_pools.count(p) == 0) {
1809 pending_inc.new_pools[p] = pool.second;
1810 }
1811 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1812 }
1813 }
1814 if (!nearfull_pool_ids.empty()) {
1815 for (auto &p: nearfull_pool_ids) {
1816 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1817 continue;
1818 }
1819 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1820 // make sure FLAG_FULL is truly set, so we are safe not
1821 // to set a extra (redundant) FLAG_NEARFULL flag
1822 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1823 continue;
1824 }
1825 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1826 // don't bother if pool is already marked as nearfull
1827 continue;
1828 }
1829 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1830 << "'s as nearfull" << dendl;
1831 if (pending_inc.new_pools.count(p) == 0) {
1832 pending_inc.new_pools[p] = tmp.pools[p];
1833 }
1834 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1835 }
1836 // cancel FLAG_NEARFULL for pools
1837 // which are no longer nearfull too
1838 for (auto &pool: tmp.get_pools()) {
1839 auto p = pool.first;
1840 if (full_pool_ids.count(p) ||
1841 backfillfull_pool_ids.count(p) ||
1842 nearfull_pool_ids.count(p)) {
1843 // skip pools we have just marked as
1844 // nearfull/backfillfull/full above
1845 continue;
1846 }
1847 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1848 // and don't touch if currently is not nearfull
1849 continue;
1850 }
1851 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1852 << "'s nearfull flag" << dendl;
1853 if (pending_inc.new_pools.count(p) == 0) {
1854 pending_inc.new_pools[p] = pool.second;
1855 }
1856 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1857 }
1858 }
1859
1860 // min_compat_client?
1861 if (!tmp.require_min_compat_client) {
1862 auto mv = tmp.get_min_compat_client();
1863 dout(1) << __func__ << " setting require_min_compat_client to currently "
1864 << "required " << mv << dendl;
1865 mon.clog->info() << "setting require_min_compat_client to currently "
1866 << "required " << mv;
1867 pending_inc.new_require_min_compat_client = mv;
1868 }
1869
1870 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1871 tmp.require_osd_release >= ceph_release_t::nautilus) {
1872 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1873 // add creating flags?
1874 for (auto& i : tmp.get_pools()) {
1875 if (pending_creatings.still_creating_pool(i.first)) {
1876 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1877 << dendl;
1878 if (pending_inc.new_pools.count(i.first) == 0) {
1879 pending_inc.new_pools[i.first] = i.second;
1880 }
1881 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1882 }
1883 }
1884 // adjust blocklist items to all be TYPE_ANY
1885 for (auto& i : tmp.blocklist) {
1886 auto a = i.first;
1887 a.set_type(entity_addr_t::TYPE_ANY);
1888 pending_inc.new_blocklist[a] = i.second;
1889 pending_inc.old_blocklist.push_back(i.first);
1890 }
1891 }
1892
1893 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1894 tmp.require_osd_release >= ceph_release_t::octopus) {
1895 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1896
1897 // adjust obsoleted cache modes
1898 for (auto& [poolid, pi] : tmp.pools) {
1899 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1900 if (pending_inc.new_pools.count(poolid) == 0) {
1901 pending_inc.new_pools[poolid] = pi;
1902 }
1903 dout(10) << __func__ << " switching pool " << poolid
1904 << " cachemode from forward -> proxy" << dendl;
1905 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1906 }
1907 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1908 if (pending_inc.new_pools.count(poolid) == 0) {
1909 pending_inc.new_pools[poolid] = pi;
1910 }
1911 dout(10) << __func__ << " switching pool " << poolid
1912 << " cachemode from readforward -> readproxy" << dendl;
1913 pending_inc.new_pools[poolid].cache_mode =
1914 pg_pool_t::CACHEMODE_READPROXY;
1915 }
1916 }
1917
1918 // clear removed_snaps for every pool
1919 for (auto& [poolid, pi] : tmp.pools) {
1920 if (pi.removed_snaps.empty()) {
1921 continue;
1922 }
1923 if (pending_inc.new_pools.count(poolid) == 0) {
1924 pending_inc.new_pools[poolid] = pi;
1925 }
1926 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1927 << dendl;
1928 pending_inc.new_pools[poolid].removed_snaps.clear();
1929 }
1930
1931 // create a combined purged snap epoch key for all purged snaps
1932 // prior to this epoch, and store it in the current epoch (i.e.,
1933 // the last pre-octopus epoch, just prior to the one we're
1934 // encoding now).
1935 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1936 it->lower_bound("purged_snap_");
1937 map<int64_t,snap_interval_set_t> combined;
1938 while (it->valid()) {
1939 if (it->key().find("purged_snap_") != 0) {
1940 break;
1941 }
1942 string k = it->key();
1943 long long unsigned pool;
1944 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1945 if (n != 1) {
1946 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1947 } else {
1948 bufferlist v = it->value();
1949 auto p = v.cbegin();
1950 snapid_t begin, end;
1951 ceph::decode(begin, p);
1952 ceph::decode(end, p);
1953 combined[pool].insert(begin, end - begin);
1954 }
1955 it->next();
1956 }
1957 if (!combined.empty()) {
1958 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1959 bufferlist v;
1960 ceph::encode(combined, v);
1961 t->put(OSD_SNAP_PREFIX, k, v);
1962 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1963 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1964 << dendl;
1965 } else {
1966 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1967 << dendl;
1968 }
1969
1970 // clean out the old removed_snap_ and removed_epoch keys
1971 // ('`' is ASCII '_' + 1)
1972 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1973 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1974 }
1975 }
1976
1977 // tell me about it
1978 for (auto i = pending_inc.new_state.begin();
1979 i != pending_inc.new_state.end();
1980 ++i) {
1981 int s = i->second ? i->second : CEPH_OSD_UP;
1982 if (s & CEPH_OSD_UP) {
1983 dout(2) << " osd." << i->first << " DOWN" << dendl;
1984 // Reset laggy parameters if failure interval exceeds a threshold.
1985 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1986 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1987 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1988 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1989 set_default_laggy_params(i->first);
1990 }
1991 }
1992 }
1993 if (s & CEPH_OSD_EXISTS)
1994 dout(2) << " osd." << i->first << " DNE" << dendl;
1995 }
1996 for (auto i = pending_inc.new_up_client.begin();
1997 i != pending_inc.new_up_client.end();
1998 ++i) {
1999 //FIXME: insert cluster addresses too
2000 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
2001 }
2002 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
2003 i != pending_inc.new_weight.end();
2004 ++i) {
2005 if (i->second == CEPH_OSD_OUT) {
2006 dout(2) << " osd." << i->first << " OUT" << dendl;
2007 } else if (i->second == CEPH_OSD_IN) {
2008 dout(2) << " osd." << i->first << " IN" << dendl;
2009 } else {
2010 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
2011 }
2012 }
2013
2014 // features for osdmap and its incremental
2015 uint64_t features;
2016
2017 // encode full map and determine its crc
2018 OSDMap tmp;
2019 {
2020 tmp.deepish_copy_from(osdmap);
2021 tmp.apply_incremental(pending_inc);
2022
2023 // determine appropriate features
2024 features = tmp.get_encoding_features();
2025 dout(10) << __func__ << " encoding full map with "
2026 << tmp.require_osd_release
2027 << " features " << features << dendl;
2028
2029 // the features should be a subset of the mon quorum's features!
2030 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2031
2032 bufferlist fullbl;
2033 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2034 pending_inc.full_crc = tmp.get_crc();
2035
2036 // include full map in the txn. note that old monitors will
2037 // overwrite this. new ones will now skip the local full map
2038 // encode and reload from this.
2039 put_version_full(t, pending_inc.epoch, fullbl);
2040 }
2041
2042 // encode
2043 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2044 bufferlist bl;
2045 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2046
2047 dout(20) << " full_crc " << tmp.get_crc()
2048 << " inc_crc " << pending_inc.inc_crc << dendl;
2049
2050 /* put everything in the transaction */
2051 put_version(t, pending_inc.epoch, bl);
2052 put_last_committed(t, pending_inc.epoch);
2053
2054 // metadata, too!
2055 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2056 p != pending_metadata.end();
2057 ++p)
2058 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2059 for (set<int>::iterator p = pending_metadata_rm.begin();
2060 p != pending_metadata_rm.end();
2061 ++p)
2062 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2063 pending_metadata.clear();
2064 pending_metadata_rm.clear();
2065
2066 // purged_snaps
2067 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2068 !pending_inc.new_purged_snaps.empty()) {
2069 // all snaps purged this epoch (across all pools)
2070 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2071 bufferlist v;
2072 encode(pending_inc.new_purged_snaps, v);
2073 t->put(OSD_SNAP_PREFIX, k, v);
2074 }
2075 for (auto& i : pending_inc.new_purged_snaps) {
2076 for (auto q = i.second.begin();
2077 q != i.second.end();
2078 ++q) {
2079 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2080 pending_inc.epoch,
2081 t);
2082 }
2083 }
2084 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2085 for (auto snap : snaps) {
2086 insert_purged_snap_update(pool, snap, snap + 1,
2087 pending_inc.epoch,
2088 t);
2089 }
2090 }
2091
2092 // health
2093 health_check_map_t next;
2094 tmp.check_health(cct, &next);
2095 encode_health(next, t);
2096 }
2097
2098 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2099 {
2100 bufferlist bl;
2101 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2102 if (r < 0)
2103 return r;
2104 try {
2105 auto p = bl.cbegin();
2106 decode(m, p);
2107 }
2108 catch (ceph::buffer::error& e) {
2109 if (err)
2110 *err << "osd." << osd << " metadata is corrupt";
2111 return -EIO;
2112 }
2113 return 0;
2114 }
2115
2116 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2117 {
2118 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2119 if (osdmap.is_up(osd)) {
2120 map<string,string> meta;
2121 load_metadata(osd, meta, nullptr);
2122 auto p = meta.find(field);
2123 if (p == meta.end()) {
2124 (*out)["unknown"]++;
2125 } else {
2126 (*out)[p->second]++;
2127 }
2128 }
2129 }
2130 }
2131
2132 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2133 {
2134 map<string,int> by_val;
2135 count_metadata(field, &by_val);
2136 f->open_object_section(field.c_str());
2137 for (auto& p : by_val) {
2138 f->dump_int(p.first.c_str(), p.second);
2139 }
2140 f->close_section();
2141 }
2142
2143 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2144 {
2145 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2146 if (osdmap.is_up(osd)) {
2147 map<string,string> meta;
2148 load_metadata(osd, meta, nullptr);
2149 auto p = meta.find("ceph_version_short");
2150 if (p == meta.end()) continue;
2151 versions[p->second].push_back(string("osd.") + stringify(osd));
2152 }
2153 }
2154 }
2155
2156 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2157 {
2158 map<string, string> metadata;
2159 int r = load_metadata(osd, metadata, nullptr);
2160 if (r < 0)
2161 return r;
2162
2163 auto it = metadata.find("osd_objectstore");
2164 if (it == metadata.end())
2165 return -ENOENT;
2166 *type = it->second;
2167 return 0;
2168 }
2169
2170 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2171 const pg_pool_t &pool,
2172 ostream *err)
2173 {
2174 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2175 // since filestore osds could always join the pool later
2176 set<int> checked_osds;
2177 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2178 vector<int> up, acting;
2179 pg_t pgid(ps, pool_id);
2180 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2181 for (int osd : up) {
2182 if (checked_osds.find(osd) != checked_osds.end())
2183 continue;
2184 string objectstore_type;
2185 int r = get_osd_objectstore_type(osd, &objectstore_type);
2186 // allow with missing metadata, e.g. due to an osd never booting yet
2187 if (r < 0 || objectstore_type == "bluestore") {
2188 checked_osds.insert(osd);
2189 continue;
2190 }
2191 *err << "osd." << osd << " uses " << objectstore_type;
2192 return false;
2193 }
2194 }
2195 return true;
2196 }
2197
2198 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2199 {
2200 map<string,string> m;
2201 if (int r = load_metadata(osd, m, err))
2202 return r;
2203 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2204 f->dump_string(p->first.c_str(), p->second);
2205 return 0;
2206 }
2207
2208 void OSDMonitor::print_nodes(Formatter *f)
2209 {
2210 // group OSDs by their hosts
2211 map<string, list<int> > osds; // hostname => osd
2212 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2213 map<string, string> m;
2214 if (load_metadata(osd, m, NULL)) {
2215 continue;
2216 }
2217 map<string, string>::iterator hostname = m.find("hostname");
2218 if (hostname == m.end()) {
2219 // not likely though
2220 continue;
2221 }
2222 osds[hostname->second].push_back(osd);
2223 }
2224
2225 dump_services(f, osds, "osd");
2226 }
2227
2228 void OSDMonitor::share_map_with_random_osd()
2229 {
2230 if (osdmap.get_num_up_osds() == 0) {
2231 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2232 return;
2233 }
2234
2235 MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
2236 if (!s) {
2237 dout(10) << __func__ << " no up osd on our session map" << dendl;
2238 return;
2239 }
2240
2241 dout(10) << "committed, telling random " << s->name
2242 << " all about it" << dendl;
2243
2244 // get feature of the peer
2245 // use quorum_con_features, if it's an anonymous connection.
2246 uint64_t features = s->con_features ? s->con_features :
2247 mon.get_quorum_con_features();
2248 // whatev, they'll request more if they need it
2249 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2250 s->con->send_message(m);
2251 // NOTE: do *not* record osd has up to this epoch (as we do
2252 // elsewhere) as they may still need to request older values.
2253 }
2254
version_t OSDMonitor::get_trim_to() const
{
  // Return the highest osdmap version we may safely trim to, or 0 if
  // trimming is not currently allowed.
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    // Never trim while pgs are still being created; osds may still need
    // the creating epochs.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  // Debug/testing knob to block trimming entirely.
  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
            << " ('mon_debug_block_osdmap_trim' set to 'true')"
            << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // Do not trim past the oldest epoch some pg may still need to go
    // clean from.
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // Operator override: force the trim point (debug/repair use).
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
               << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // Always retain at least mon_min_osdmap_epochs committed maps.
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // Only worth trimming if the floor is above what we already trimmed.
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2302
2303 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2304 {
2305 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2306 // also scan osd epochs
2307 // don't trim past the oldest reported osd epoch
2308 for (auto [osd, epoch] : osd_epochs) {
2309 if (epoch < floor) {
2310 floor = epoch;
2311 }
2312 }
2313 return floor;
2314 }
2315
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
                                   version_t first)
{
  // When trimming up to `first`, also stash a full map for that epoch so
  // the new oldest committed version remains self-contained.
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // If pruning had pinned maps below the new first epoch, update the
  // on-disk manifest to account for the trim.
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2329
2330
2331 /* full osdmap prune
2332 *
2333 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2334 */
2335
2336 void OSDMonitor::load_osdmap_manifest()
2337 {
2338 bool store_has_manifest =
2339 mon.store->exists(get_service_name(), "osdmap_manifest");
2340
2341 if (!store_has_manifest) {
2342 if (!has_osdmap_manifest) {
2343 return;
2344 }
2345
2346 dout(20) << __func__
2347 << " dropping osdmap manifest from memory." << dendl;
2348 osdmap_manifest = osdmap_manifest_t();
2349 has_osdmap_manifest = false;
2350 return;
2351 }
2352
2353 dout(20) << __func__
2354 << " osdmap manifest detected in store; reload." << dendl;
2355
2356 bufferlist manifest_bl;
2357 int r = get_value("osdmap_manifest", manifest_bl);
2358 if (r < 0) {
2359 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2360 ceph_abort_msg("error reading manifest");
2361 }
2362 osdmap_manifest.decode(manifest_bl);
2363 has_osdmap_manifest = true;
2364
2365 dout(10) << __func__ << " store osdmap manifest pinned ("
2366 << osdmap_manifest.get_first_pinned()
2367 << " .. "
2368 << osdmap_manifest.get_last_pinned()
2369 << ")"
2370 << dendl;
2371 }
2372
bool OSDMonitor::should_prune() const
{
  // Decide whether a prune pass over the full osdmaps is warranted,
  // given how many epochs we hold versus the configured thresholds.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // never prune into the most recent min_osdmap_epochs maps
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // an earlier prune already advanced the pins to the ceiling.
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits between the last pin and
    // the ceiling.
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2432
void OSDMonitor::_prune_update_trimmed(
  MonitorDBStore::TransactionRef tx,
  version_t first)
{
  // A trim removed all maps below `first`; drop the now-stale pins from
  // the manifest (pinning `first` itself so it remains the floor).
  dout(10) << __func__
           << " first " << first
           << " last_pinned " << osdmap_manifest.get_last_pinned()
           << dendl;

  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase every pin strictly below `first`
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2466
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed `manifest` with the first pin for this prune pass: either the
  // first committed epoch (fresh prune) or the last pin of an on-going
  // prune we are resuming.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
             << " first_pinned " << osdmap_manifest.get_first_pinned()
             << " last_pinned " << osdmap_manifest.get_last_pinned()
             << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2501
2502 bool OSDMonitor::_prune_sanitize_options() const
2503 {
2504 uint64_t prune_interval =
2505 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2506 uint64_t prune_min =
2507 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2508 uint64_t txsize =
2509 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2510
2511 bool r = true;
2512
2513 if (prune_interval == 0) {
2514 derr << __func__
2515 << " prune is enabled BUT prune interval is zero; abort."
2516 << dendl;
2517 r = false;
2518 } else if (prune_interval == 1) {
2519 derr << __func__
2520 << " prune interval is equal to one, which essentially means"
2521 " no pruning; abort."
2522 << dendl;
2523 r = false;
2524 }
2525 if (prune_min == 0) {
2526 derr << __func__
2527 << " prune is enabled BUT prune min is zero; abort."
2528 << dendl;
2529 r = false;
2530 }
2531 if (prune_interval > prune_min) {
2532 derr << __func__
2533 << " impossible to ascertain proper prune interval because"
2534 << " it is greater than the minimum prune epochs"
2535 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2536 << dendl;
2537 r = false;
2538 }
2539
2540 if (txsize < prune_interval - 1) {
2541 derr << __func__
2542 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2543 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2544 << "); abort." << dendl;
2545 r = false;
2546 }
2547 return r;
2548 }
2549
// Operator switch: osdmap full pruning can be toggled via config.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2553
// Pruning also requires the monitor cluster to have the OSDMAP_PRUNE
// feature (i.e., all mons understand the osdmap manifest).
bool OSDMonitor::is_prune_supported() const {
  return mon.get_required_mon_features().contains_any(
      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2558
/** do_prune
 *
 * Remove full osdmap versions between pinned epochs, recording the
 * surviving pins in the on-disk osdmap manifest.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
          << ( enabled ? "enabled" : "disabled")
          << dendl;

  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  // never prune into the newest mon_min_osdmap_epochs maps
  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
          << " lc (" << first << " .. " << last << ")"
          << " last_pinned " << last_pinned
          << " interval " << prune_interval
          << " last_to_pin " << last_to_pin
          << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
            << " setting txsize to removal interval size ("
            << removal_interval << " versions"
            << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  // keep pinning+erasing whole intervals while the budget allows
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      // the next interval would overshoot the ceiling; stop here.
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
             << " last_pinned " << last_pinned
             << " next_pinned " << next_pinned
             << " num_pruned " << num_pruned
             << " removal interval (" << (last_pinned+1)
             << ".." << (next_pinned-1) << ")"
             << " txsize " << txsize << dendl;

    // both interval endpoints must still exist in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval fits
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2687
2688
2689 // -------------
2690
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // Handle reads, and filter updates that need no map change. Returns
  // true when the op was fully consumed here; false forwards it to
  // prepare_update().
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // dispatch() should never hand us any other message type
    ceph_abort();
    return true;
  }
}
2746
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Apply an update message to the pending incremental map. Returns
  // true if pending_inc was (possibly) modified and should be proposed.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // preprocess_query() should have filtered everything else
    ceph_abort();
  }

  return false;
}
2798
bool OSDMonitor::should_propose(double& delay)
{
  // Decide whether the pending changes warrant an immediate proposal
  // (delay set to 0) or can follow the generic PaxosService policy.
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?
  // Once a weight has been gathered for every osd, fold them into the
  // pending incremental and propose right away.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  return PaxosService::should_propose(delay);
}
2819
2820
2821
2822 // ---------------------------
2823 // READs
2824
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an explicit osdmap request with a batch of full and/or
  // incremental maps, bounded by both a map-count and a byte budget.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // Encode for the peer's feature set when known; otherwise fall back
  // to the quorum's features.
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first, clamped to the range we still have committed
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // then incrementals, drawing on the same count/byte budget
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the peer what we hold so it can re-request anything missing
  reply->oldest_map = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2861
2862
2863 // ---------------------------
2864 // UPDATEs
2865
2866 // failure --
2867
// Returns true if the request should be *rejected*: missing session,
// insufficient caps (osd cap with MON_CAP_X), or a mismatched cluster
// fsid. Returns false when the source checks out.
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
            << session->caps << dendl;
    return true;
  }
  if (fsid != mon.monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
            << " != " << mon.monmap->fsid << dendl;
    return true;
  }
  return false;
}
2885
2886
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Filter an incoming MOSDFailure report. Returns true if the message
  // was consumed here (dropped as invalid, stale, or duplicate); false
  // lets it proceed to prepare_failure() to effect a map change.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter unknown, re-addressed, or itself down: bring it up to
      // date and drop the report.
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    // target is already down (or never up): nothing to do; share newer
    // maps if the reporter is behind.
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // the reported address doesn't match the map: stale report about a
    // previous instance of this osd id.
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    // the target came back up after the report's epoch: report is old.
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // cluster policy (e.g. nodown flag) prevents marking this osd down
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // a genuinely new failure report: pass it on to prepare_failure()
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2958
// Completion context for an MOSDMarkMeDown request.  On success (r == 0)
// it acknowledges the requesting OSD by echoing back an MOSDMarkMeDown
// message (with request-ack disabled so the exchange terminates); on
// -EAGAIN it re-dispatches the op so it is re-evaluated against the new
// map; any other result aborts, since no other outcome is expected here.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon.send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2987
// Pre-screen an OSD's request to mark itself down.  Returns true when the
// request is rejected or already moot (bad caps/fsid, non-OSD source,
// stale instance, or nodown in force); returns false to let
// prepare_mark_me_down() record the state change.  Rejected requests that
// asked for an ack still get one (via C_AckMarkedDown with r == 0) so the
// OSD does not block waiting.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the request must come from the instance the map currently knows;
  // a nonexistent, already-down, or re-addressed osd gets a map update
  // instead of a state change.
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
3026
// Record the self-requested down-mark in the pending incremental.
// preprocess_mark_me_down() has already validated the request, hence the
// asserts.  Note that pending_inc.new_state entries are applied as an
// XOR against the current state (see prepare_boot's use of
// `oldstate ^= pending_inc.new_state[from]`), so storing CEPH_OSD_UP
// here flips the up bit *off*.
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon.clog->info() << "osd." << target_osd << " marked itself down";
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    // ack only after the proposal commits, so the OSD sees the new map
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
3042
// Pre-screen an OSD's declaration that it is dead (not merely down).
// Returns true when handled here (bad caps/fsid, non-OSD source, or the
// osd is unknown/still up); returns false to let prepare_mark_me_dead()
// record the dead_epoch.  All rejection paths suppress a reply.
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon.no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon.no_reply(op);
    return true;
  }

  // only an osd that is already marked down can be declared dead
  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    dout(5) << __func__ << " from nonexistent or up osd." << from
            << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    mon.no_reply(op);
    return true;
  }

  return false;
}
3072
// Record an OSD's self-declared death: stamp dead_epoch in the pending
// xinfo for that osd.  preprocess_mark_me_dead() guarantees the osd is
// already down.  No reply is sent on success; on proposal failure the
// op is left unanswered so the OSD retries.
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_down(target_osd));

  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
                   << m->get_epoch();
  // copy-on-write into the pending incremental before mutating xinfo
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
        if (r >= 0) {
          mon.no_reply(op);    // ignore on success
        }
      }
      ));
  return true;
}
3098
3099 bool OSDMonitor::can_mark_down(int i)
3100 {
3101 if (osdmap.is_nodown(i)) {
3102 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3103 << "will not mark it down" << dendl;
3104 return false;
3105 }
3106
3107 int num_osds = osdmap.get_num_osds();
3108 if (num_osds == 0) {
3109 dout(5) << __func__ << " no osds" << dendl;
3110 return false;
3111 }
3112 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3113 float up_ratio = (float)up / (float)num_osds;
3114 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3115 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3116 << g_conf()->mon_osd_min_up_ratio
3117 << ", will not mark osd." << i << " down" << dendl;
3118 return false;
3119 }
3120 return true;
3121 }
3122
3123 bool OSDMonitor::can_mark_up(int i)
3124 {
3125 if (osdmap.is_noup(i)) {
3126 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3127 << "will not mark it up" << dendl;
3128 return false;
3129 }
3130
3131 return true;
3132 }
3133
3134 /**
3135 * @note the parameter @p i apparently only exists here so we can output the
3136 * osd's id on messages.
3137 */
3138 bool OSDMonitor::can_mark_out(int i)
3139 {
3140 if (osdmap.is_noout(i)) {
3141 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3142 << "will not mark it out" << dendl;
3143 return false;
3144 }
3145
3146 int num_osds = osdmap.get_num_osds();
3147 if (num_osds == 0) {
3148 dout(5) << __func__ << " no osds" << dendl;
3149 return false;
3150 }
3151 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3152 float in_ratio = (float)in / (float)num_osds;
3153 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3154 if (i >= 0)
3155 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3156 << g_conf()->mon_osd_min_in_ratio
3157 << ", will not mark osd." << i << " out" << dendl;
3158 else
3159 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3160 << g_conf()->mon_osd_min_in_ratio
3161 << ", will not mark osds out" << dendl;
3162 return false;
3163 }
3164
3165 return true;
3166 }
3167
3168 bool OSDMonitor::can_mark_in(int i)
3169 {
3170 if (osdmap.is_noin(i)) {
3171 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3172 << "will not mark it in" << dendl;
3173 return false;
3174 }
3175
3176 return true;
3177 }
3178
3179 bool OSDMonitor::check_failures(utime_t now)
3180 {
3181 bool found_failure = false;
3182 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3183 p != failure_info.end();
3184 ++p) {
3185 if (can_mark_down(p->first)) {
3186 found_failure |= check_failure(now, p->first, p->second);
3187 }
3188 }
3189 return found_failure;
3190 }
3191
3192 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3193 {
3194 // already pending failure?
3195 if (pending_inc.new_state.count(target_osd) &&
3196 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3197 dout(10) << " already pending failure" << dendl;
3198 return true;
3199 }
3200
3201 set<string> reporters_by_subtree;
3202 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3203 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3204 utime_t max_failed_since = fi.get_failed_since();
3205 utime_t failed_for = now - max_failed_since;
3206
3207 utime_t grace = orig_grace;
3208 double my_grace = 0, peer_grace = 0;
3209 double decay_k = 0;
3210 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3211 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3212 decay_k = ::log(.5) / halflife;
3213
3214 // scale grace period based on historical probability of 'lagginess'
3215 // (false positive failures due to slowness).
3216 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3217 double decay = exp((double)failed_for * decay_k);
3218 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3219 << " failed_for " << failed_for << " decay " << decay << dendl;
3220 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3221 grace += my_grace;
3222 }
3223
3224 // consider the peers reporting a failure a proxy for a potential
3225 // 'subcluster' over the overall cluster that is similarly
3226 // laggy. this is clearly not true in all cases, but will sometimes
3227 // help us localize the grace correction to a subset of the system
3228 // (say, a rack with a bad switch) that is unhappy.
3229 ceph_assert(fi.reporters.size());
3230 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3231 // get the parent bucket whose type matches with "reporter_subtree_level".
3232 // fall back to OSD if the level doesn't exist.
3233 if (osdmap.exists(p->first)) {
3234 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3235 if (auto iter = reporter_loc.find(reporter_subtree_level);
3236 iter == reporter_loc.end()) {
3237 reporters_by_subtree.insert("osd." + to_string(p->first));
3238 } else {
3239 reporters_by_subtree.insert(iter->second);
3240 }
3241 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3242 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3243 utime_t elapsed = now - xi.down_stamp;
3244 double decay = exp((double)elapsed * decay_k);
3245 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3246 }
3247 ++p;
3248 } else {
3249 fi.cancel_report(p->first);;
3250 p = fi.reporters.erase(p);
3251 }
3252 }
3253
3254 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3255 peer_grace /= (double)fi.reporters.size();
3256 grace += peer_grace;
3257 }
3258
3259 dout(10) << " osd." << target_osd << " has "
3260 << fi.reporters.size() << " reporters, "
3261 << grace << " grace (" << orig_grace << " + " << my_grace
3262 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3263 << dendl;
3264
3265 if (failed_for >= grace &&
3266 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3267 dout(1) << " we have enough reporters to mark osd." << target_osd
3268 << " down" << dendl;
3269 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3270
3271 mon.clog->info() << "osd." << target_osd << " failed ("
3272 << osdmap.crush->get_full_location_ordered_string(
3273 target_osd)
3274 << ") ("
3275 << (int)reporters_by_subtree.size()
3276 << " reporters from different "
3277 << reporter_subtree_level << " after "
3278 << failed_for << " >= grace " << grace << ")";
3279 return true;
3280 }
3281 return false;
3282 }
3283
// Immediately queue a down-mark for `target_osd` (bypassing the grace /
// reporter-count machinery), used for "immediate" failure reports such
// as connection-refused from osd `by`.  Also stamps dead_epoch in the
// pending xinfo.  No-op if a down-mark is already pending.
void OSDMonitor::force_failure(int target_osd, int by)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // new_state is XOR-applied; CEPH_OSD_UP here clears the up bit
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // copy-on-write into the pending incremental before mutating xinfo
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon.clog->info() << "osd." << target_osd << " failed ("
                   << osdmap.crush->get_full_location_ordered_string(target_osd)
                   << ") (connection refused reported by osd." << by << ")";
  return;
}
3305
// Apply an OSD failure report to the monitor's failure bookkeeping.
// A positive report (if_osd_failed) either forces an immediate
// down-mark (is_immediate) or adds the reporter to failure_info and
// re-evaluates via check_failure(); a negative report cancels the
// reporter's earlier claim.  Returns true iff a map change was queued
// and a proposal is needed.  preprocess_failure() has already validated
// the target, hence the asserts.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  // failure reports are not individually answered; reporters learn the
  // outcome from subsequent map updates
  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon.clog->debug() << "osd." << m->get_target_osd()
                        << " reported immediately failed by "
                        << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a repeat report from the same reporter displaces the old op;
    // make sure the displaced op is not left expecting a reply
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon.no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
                      << " failure report canceled by "
                      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon.no_reply(report_op);
      }
      // drop the entry entirely once the last reporter withdraws
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3372
3373 void OSDMonitor::process_failures()
3374 {
3375 map<int,failure_info_t>::iterator p = failure_info.begin();
3376 while (p != failure_info.end()) {
3377 if (osdmap.is_up(p->first)) {
3378 ++p;
3379 } else {
3380 dout(10) << "process_failures osd." << p->first << dendl;
3381 list<MonOpRequestRef> ls;
3382 p->second.take_report_messages(ls);
3383 failure_info.erase(p++);
3384
3385 while (!ls.empty()) {
3386 MonOpRequestRef o = ls.front();
3387 if (o) {
3388 o->mark_event(__func__);
3389 MOSDFailure *m = o->get_req<MOSDFailure>();
3390 send_latest(o, m->get_epoch());
3391 mon.no_reply(o);
3392 }
3393 ls.pop_front();
3394 }
3395 }
3396 }
3397 }
3398
3399 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3400 {
3401 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3402
3403 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3404 p != failure_info.end();
3405 ++p) {
3406 p->second.take_report_messages(ls);
3407 }
3408 failure_info.clear();
3409 }
3410
3411 int OSDMonitor::get_grace_interval_threshold()
3412 {
3413 int halflife = g_conf()->mon_osd_laggy_halflife;
3414 // Scale the halflife period (default: 1_hr) by
3415 // a factor (48) to calculate the threshold.
3416 int grace_threshold_factor = 48;
3417 return halflife * grace_threshold_factor;
3418 }
3419
3420 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3421 {
3422 int grace_interval_threshold_secs = get_grace_interval_threshold();
3423 if (last_failed_interval > grace_interval_threshold_secs) {
3424 dout(1) << " last_failed_interval " << last_failed_interval
3425 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3426 << dendl;
3427 return true;
3428 }
3429 return false;
3430 }
3431
3432 void OSDMonitor::set_default_laggy_params(int target_osd)
3433 {
3434 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3435 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3436 }
3437 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3438 xi.down_stamp = pending_inc.modified;
3439 xi.laggy_probability = 0.0;
3440 xi.laggy_interval = 0;
3441 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3442 }
3443
3444
3445 // boot --
3446
// Pre-screen an OSD boot request (MOSDBoot).  Returns true when the
// boot is rejected or already satisfied (bad caps/fsid/address, missing
// feature bits, release-span violations, duplicate boot, uuid clash,
// stale message, or noup); returns false to pass it on to
// prepare_boot() for the actual map update.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon.clog->info() << "disallowing boot of OSD "
                       << m->get_orig_source_inst()
                       << " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon.clog->info() << "disallowing boot of octopus+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < mimic";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
      osdmap.require_osd_release < ceph_release_t::nautilus) {
    mon.clog->info() << "disallowing boot of pacific+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?  (same osd id, same client and cluster addrs)
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // the id is registered to a different physical osd (uuid mismatch)
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message from before this instance's last up_from: stale,
  // just feed it the current maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3584
// Apply an OSD boot to the pending map.  Three cases:
//  1. the osd is still up under an old instance: queue a down-mark for
//     the old instance and retry the boot after the proposal;
//  2. an up-mark for this osd is already pending: just wait and retry;
//  3. otherwise record the new instance's addresses, uuid, metadata,
//     clean-interval, laggy statistics, features, and (possibly) weight,
//     and reply once the proposal commits (C_Booted).
// Always returns true (a proposal is needed), except when the id is
// beyond max_osd.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
          << " sb " << m->sb
          << " client_addrs" << m->get_connection()->get_peer_addrs()
          << " cluster_addrs " << m->cluster_addrs
          << " hb_back_addrs " << m->hb_back_addrs
          << " hb_front_addrs " << m->hb_front_addrs
          << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
            << osdmap.get_max_osd() << dendl;
    return false;
  }

  // fold in any state change already queued for this osd
  // (new_state entries are XOR-applied against the current state)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
            << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
                  m->get_orig_source_addrs()) ||
                !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
            << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
             << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?  (has never seen a map, so any prior interval is lost)
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end
               << ") -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] =
        pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: a boot_epoch of 0 means the osd was
    // never laggy (decay the stats); otherwise blend the observed
    // down interval into the running laggy estimate.
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
          xi.down_stamp.sec();
        // clamp the contribution of a single outage
        if (g_conf()->mon_osd_laggy_max_interval &&
            (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval = g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
          interval * g_conf()->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf()->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
         (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        // restore the weight saved when the osd was auto-marked out,
        // if any; otherwise fully in
        if (xi.old_weight > 0) {
          pending_inc.new_weight[from] = xi.old_weight;
          xi.old_weight = 0;
        } else {
          pending_inc.new_weight[from] = CEPH_OSD_IN;
        }
      } else {
        dout(7) << __func__ << " NOIN set, will not mark in "
                << m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3739
// Acknowledge a (possibly duplicate) boot: optionally log it to the
// cluster log, then send the OSD all maps newer than its superblock's
// current_epoch.
void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << "_booted " << m->get_orig_source_inst()
          << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit) {
    mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
                     << " boot";
  }

  send_latest(op, m->sb.current_epoch+1);
}
3754
3755
3756 // -------------
3757 // full
3758
// Pre-screen an OSD fullness-state report (MOSDFull).  Returns true
// when handled here (bad caps, stale/unknown instance, or the map
// already reflects the requested nearfull/backfillfull/full bits);
// returns false to pass it on to prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these fullness bits may be changed through this message
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // accept only if the sender's addrs match the instance the map knows
  // for its current up/down state
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
         m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change: reply with the map at the osd's version
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
            << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
           << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3809
// Queue the osd's requested fullness bits in the pending incremental.
// Because new_state entries are XOR-applied, the value stored is the
// XOR of the committed bits and the wanted bits, restricted to the
// fullness mask; any previously pending fullness bits for this osd are
// cleared first so pending state cannot double-toggle.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state with pending XOR applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending fullness toggles before recomputing
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3847
3848 // -------------
3849 // alive
3850
// Pre-screen an MOSDAlive (up_thru advance request).  Returns true when
// handled here (bad caps, down/stale sender, or up_thru already high
// enough — in which case the osd just gets the latest map); returns
// false to pass it on to prepare_alive().
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
           << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3889
// Advance the osd's up_thru in the pending incremental and reply with a
// map once the proposal commits.
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // deliberately disabled: alive events are too chatty for the cluster log
  if (0) {  // we probably don't care much about these
    mon.clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
          << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version);  // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3907
// Reply to a request by sending the requester every map starting at
// epoch e (send_latest sends the full map when e == 0).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3916
3917 // pg_created
// Preprocess MOSDPGCreated: only session/cap validation happens here.
// Valid messages always fall through (return false) so the leader can
// record the created pg in prepare_pg_created(). No reply is ever sent.
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  mon.no_reply(op);  // this message type never gets a reply
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3937
3938 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3939 {
3940 op->mark_osdmon_event(__func__);
3941 auto m = op->get_req<MOSDPGCreated>();
3942 dout(10) << __func__ << " " << *m << dendl;
3943 auto src = m->get_orig_source();
3944 auto from = src.num();
3945 if (!src.is_osd() ||
3946 !mon.osdmon()->osdmap.is_up(from) ||
3947 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3948 m->get_orig_source_addrs())) {
3949 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3950 return false;
3951 }
3952 pending_created_pgs.push_back(m->pgid);
3953 return true;
3954 }
3955
// Preprocess MOSDPGReadyToMerge: validate session, caps, and that the
// named pg is still the merge source implied by the pool's current
// pg_num/pg_num_pending. Returns true when the message is dropped;
// false to let prepare_pg_ready_to_merge() stage the merge.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge already happened (pg_num has shrunk past this pg)
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the last pg (ps == pg_num - 1) can be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // a merge for this pg must actually be pending
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon.no_reply(op);
  return true;
}
3995
// Stage the pg merge (or back it off) in the pending incremental map.
// Re-validates against the pending pool state since preprocess ran
// against the committed map and a concurrent pg_num change may have
// raced in; in that case the op is retried after the next proposal.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // commit the merge: drop pg_num by one, recording the source/target
    // versions and clean epochs for the new pg_merge_meta
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes from a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // test hook: occasionally bounce pg_num back up to exercise the
  // merge-cancel path, by injecting a synthetic "osd pool set" command
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon.monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
4053
4054
4055 // -------------
4056 // pg_temp changes
4057
// Preprocess MOSDPGTemp: validate the sender and decide whether any of
// the proposed pg_temp mappings actually change the committed map.
// Returns true when the message is handled here (dropped, or replied
// because nothing would change); false to defer to prepare_pgtemp().
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // only accept from an osd that is up at the same addresses
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced update skips all the no-op filtering below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every entry is a no-op against the committed map: just send maps back
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon.no_reply(op);
  return true;
}
4151
4152 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4153 {
4154 epoch_t old_up_thru = osdmap.get_up_thru(from);
4155 auto ut = pending_inc.new_up_thru.find(from);
4156 if (ut != pending_inc.new_up_thru.end()) {
4157 old_up_thru = ut->second;
4158 }
4159 if (up_thru > old_up_thru) {
4160 // set up_thru too, so the osd doesn't have to ask again
4161 pending_inc.new_up_thru[from] = up_thru;
4162 }
4163 }
4164
// Stage the pg_temp mappings from an MOSDPGTemp in the pending map,
// skipping pools that are gone or pending removal. Also bumps the
// sender's up_thru and defers the reply until the proposal commits.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGTemp>();
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
4200
4201
4202 // ---
4203
// Preprocess MRemoveSnaps: if every listed snap is already removed (or
// its pool is gone), acknowledge without a proposal. Any snap that
// still needs removal makes us fall through to prepare_remove_snaps().
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon.no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
	       << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // any snap newer than snap_seq or not yet removed needs a proposal
      if (*p > pi->get_snap_seq() ||
	  !_is_removed_snap(q->first, *p)) {
	return false;
      }
    }
  }

  // octopus+ senders expect an explicit ack echoing the snap list
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon.send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4253
// Stage snapshot removals in the pending map: queue each snap that is
// not already removed (committed or pending), bump the pool's snap_seq
// and snap_epoch as needed, and ack octopus+ senders after the commit.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
	       << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map, in a pending
      // pool update, or already queued in new_removed_snaps
      if (!_is_removed_snap(pool, s) &&
	  (!pending_inc.new_pools.count(pool) ||
	   !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
	  (!pending_inc.new_removed_snaps.count(pool) ||
	   !pending_inc.new_removed_snaps[pool].contains(s))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
	// pre-octopus clusters still track removed snaps in the pool
	if (osdmap.require_osd_release < ceph_release_t::octopus) {
	  newpi->removed_snaps.insert(s);
	  dout(10) << " pool " << pool << " removed_snaps added " << s
		   << " (now " << newpi->removed_snaps << ")" << dendl;
	}
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	if (s > newpi->get_snap_seq()) {
	  dout(10) << " pool " << pool << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << s << dendl;
	  newpi->set_snap_seq(s);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	dout(10) << " added pool " << pool << " snap " << s
		 << " to removed_snaps queue" << dendl;
	pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ senders expect an ack once the removal is committed
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4302
4303 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4304 {
4305 op->mark_osdmon_event(__func__);
4306 auto m = op->get_req<MMonGetPurgedSnaps>();
4307 dout(7) << __func__ << " " << *m << dendl;
4308
4309 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4310
4311 string k = make_purged_snap_epoch_key(m->start);
4312 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
4313 it->upper_bound(k);
4314 unsigned long epoch = m->last;
4315 while (it->valid()) {
4316 if (it->key().find("purged_epoch_") != 0) {
4317 break;
4318 }
4319 string k = it->key();
4320 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4321 if (n != 1) {
4322 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4323 } else if (epoch > m->last) {
4324 break;
4325 } else {
4326 bufferlist bl = it->value();
4327 auto p = bl.cbegin();
4328 auto &v = r[epoch];
4329 try {
4330 ceph::decode(v, p);
4331 } catch (ceph::buffer::error& e) {
4332 derr << __func__ << " unable to parse value for key '" << it->key()
4333 << "': \n";
4334 bl.hexdump(*_dout);
4335 *_dout << dendl;
4336 }
4337 n += 4 + v.size() * 16;
4338 }
4339 if (n > 1048576) {
4340 // impose a semi-arbitrary limit to message size
4341 break;
4342 }
4343 it->next();
4344 }
4345
4346 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4347 reply->purged_snaps.swap(r);
4348 mon.send_reply(op, reply.detach());
4349
4350 return true;
4351 }
4352
4353 // osd beacon
// Preprocess MOSDBeacon: only session/cap validation; valid beacons
// always go to the leader (return false) so it can refresh the sender's
// liveness bookkeeping. No reply is ever sent.
bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  // check caps
  auto session = op->get_session();
  mon.no_reply(op);
  if (!session) {
    dout(10) << __func__ << " no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // Always forward the beacon to the leader, even if they are the same as
  // the old one. The leader will mark as down osds that haven't sent
  // beacon for a few minutes.
  return false;
}
4374
// Leader-side beacon handling: refresh the sender's last-report time,
// known epoch, and per-pg last_epoch_clean. Returns true only when the
// beacon also updates xinfo (last_purged_snaps_scrub) and therefore
// needs a proposal; otherwise false (no map change).
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // in-memory liveness bookkeeping; consumed by the laggy/down checks
  last_osd_report[from].first = ceph_clock_now();
  last_osd_report[from].second = beacon->osd_beacon_report_interval;
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer purged-snaps scrub stamp via the pending map
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4416
4417 // ---------------
4418 // map helpers
4419
4420 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4421 {
4422 op->mark_osdmon_event(__func__);
4423 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4424 << " start " << start << dendl;
4425 if (start == 0)
4426 send_full(op);
4427 else
4428 send_incremental(op, start);
4429 }
4430
4431
4432 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4433 {
4434 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4435 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4436 r->oldest_map = get_first_committed();
4437 r->newest_map = osdmap.get_epoch();
4438 return r;
4439 }
4440
// Build an MOSDMap carrying the incremental maps for epochs [from, to]
// (falling back to a full map for any epoch whose incremental is gone).
// Caller owns the returned message.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; e is unsigned, so also guard e > 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental for this epoch: send the full map instead
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4474
// Reply to the request with the latest full osdmap, encoded for the
// requester's connection features.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon.send_reply(op, build_latest_full(op->get_session()->con_features));
}
4481
// Reply to a request with incremental maps starting at 'first'. If the
// request was proxied through another monitor, route the work back to
// that monitor instead of answering directly.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  ceph_assert(s);

  if (s->proxy_con) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
4502
// Send incremental maps [first .. current] to a session, tracking the
// session's osd_epoch so later calls skip already-sent epochs. If
// 'first' predates our oldest committed map, start with a base full
// map. When 'req' is set, exactly one batch is sent as a routed reply;
// when 'onetime' is set, only one batch is sent on the session's con.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested epoch is already trimmed: seed with our oldest full map
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // routed reply: one message only, the requester will ask for more
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch at most osd_map_message_max epochs per message
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
4565
// Convenience overload: fetch the incremental map for 'ver' encoded
// with the current quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon.get_quorum_con_features(), bl);
}
4570
// Re-encode an incremental map blob, in place, using the intersection
// of the requested features and the incremental's canonical encoding
// features. Embedded full map / crush blobs are re-encoded too.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4598
4599 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4600 {
4601 OSDMap m;
4602 auto q = bl.cbegin();
4603 m.decode(q);
4604 // always encode with subset of osdmap's canonical features
4605 uint64_t f = features & m.get_encoding_features();
4606 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4607 << dendl;
4608 bl.clear();
4609 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4610 }
4611
// Fetch the incremental map for 'ver', re-encoded for 'features' if
// they differ significantly from the quorum's, with a per-feature-mask
// cache in front of the store. Returns 0 or a negative errno.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4633
// Load and decode the incremental map for 'ver' into 'inc'. The epoch
// is expected to exist; missing/empty versions abort via assert.
int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
{
  bufferlist inc_bl;
  int err = get_version(ver, inc_bl);
  ceph_assert(err == 0);
  ceph_assert(inc_bl.length());

  auto p = inc_bl.cbegin();
  inc.decode(p);
  dout(10) << __func__ << "     "
	   << " epoch " << inc.epoch
	   << " inc_crc " << inc.inc_crc
	   << " full_crc " << inc.full_crc
	   << " encode_features " << inc.encode_features << dendl;
  return 0;
}
4650
// Reconstruct the full map for 'ver' when its stored full version has
// been trimmed: start from the closest pinned (or cached) full map at
// or below 'ver' and replay incrementals on top of it. Returns 0 on
// success, -ENOENT if no pinned base exists.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // prefer a cached full map newer than the pinned base to shorten the
  // replay (note: closest_pinned >= 1, so v never underflows)
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
			      &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  // replay every incremental from the base up to the requested epoch,
  // remembering the last incremental's encode features for the final
  // re-encode below
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
	inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
	f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
	derr << __func__
	     << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
	     << ", expected " << inc.full_crc << ")" << dendl;
	ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4750
// Convenience overload: fetch the full map for 'ver' encoded with the
// current quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon.get_quorum_con_features(), bl);
}
4755
// Fetch the full map for 'ver', rebuilding it from a pinned map plus
// incrementals if the stored full version was trimmed, re-encoding for
// 'features' when needed, with a per-feature-mask cache in front.
// Returns 0 or a negative errno.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4782
4783 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4784 {
4785 dout(10) << "blocklist " << av << " until " << until << dendl;
4786 for (auto a : av.v) {
4787 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4788 a.set_type(entity_addr_t::TYPE_ANY);
4789 } else {
4790 a.set_type(entity_addr_t::TYPE_LEGACY);
4791 }
4792 pending_inc.new_blocklist[a] = until;
4793 }
4794 return pending_inc.epoch;
4795 }
4796
4797 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4798 {
4799 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4800 a.set_type(entity_addr_t::TYPE_ANY);
4801 } else {
4802 a.set_type(entity_addr_t::TYPE_LEGACY);
4803 }
4804 dout(10) << "blocklist " << a << " until " << until << dendl;
4805 pending_inc.new_blocklist[a] = until;
4806 return pending_inc.epoch;
4807 }
4808
4809
// Walk all "osdmap" subscriptions and service each one.
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    return;
  }
  auto osdmap_subs = mon.session_map.subs.find("osdmap");
  if (osdmap_subs == mon.session_map.subs.end()) {
    return;
  }
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    // advance before servicing: check_osdmap_sub() may remove a onetime
    // sub from the list, which would invalidate the current position
    ++p;
    check_osdmap_sub(sub);
  }
}
4827
// Service one "osdmap" subscription: send maps from sub->next onward
// (full map if next == 0), then either drop a onetime sub or advance
// its next epoch past the current map.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon.session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4843
// Walk all "osd_pg_creates" subscriptions and service each one, under
// the session map lock.
void OSDMonitor::check_pg_creates_subs()
{
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon.with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
	return;
      }
      for (auto sub : *pg_creates_subs->second) {
	check_pg_creates_sub(sub);
      }
    });
}
4860
// Service one "osd_pg_creates" subscription: push pending pg-create
// messages to the subscribing OSD, but only while it is up.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->name << dendl;
  ceph_assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up.  we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->name.is_osd() &&
      mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
    sub->next = send_pg_creates(sub->session->name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
4874
// Stage an application tag (and optional key/value) on a pool in the
// pending map. Must be called while a proposal is plugged and the map
// is writeable. Without 'force', existing entries are preserved: the
// map inserts below deliberately do NOT overwrite.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value,
				       bool force)
{
  ceph_assert(paxos.is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the pending copy of the pool if one is already staged
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // tag the application with no metadata (no-op if already present)
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // overwrite any existing value for this key
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() does not overwrite an existing app entry
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4908
4909 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4910 pool_opts_t::key_t opt,
4911 pool_opts_t::value_t val)
4912 {
4913 auto p = pending_inc.new_pools.try_emplace(
4914 pool_id, *osdmap.get_pg_pool(pool_id));
4915 p.first->second.opts.set(opt, val);
4916 }
4917
4918 unsigned OSDMonitor::scan_for_creating_pgs(
4919 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4920 const mempool::osdmap::set<int64_t>& removed_pools,
4921 utime_t modified,
4922 creating_pgs_t* creating_pgs) const
4923 {
4924 unsigned queued = 0;
4925 for (auto& p : pools) {
4926 int64_t poolid = p.first;
4927 if (creating_pgs->created_pools.count(poolid)) {
4928 dout(10) << __func__ << " already created " << poolid << dendl;
4929 continue;
4930 }
4931 const pg_pool_t& pool = p.second;
4932 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
4933 pool.get_type(), pool.get_size());
4934 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4935 continue;
4936
4937 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4938 const auto created = pool.get_last_change();
4939 if (last_scan_epoch && created <= last_scan_epoch) {
4940 dout(10) << __func__ << " no change in pool " << poolid
4941 << " " << pool << dendl;
4942 continue;
4943 }
4944 if (removed_pools.count(poolid)) {
4945 dout(10) << __func__ << " pool is being removed: " << poolid
4946 << " " << pool << dendl;
4947 continue;
4948 }
4949 dout(10) << __func__ << " queueing pool create for " << poolid
4950 << " " << pool << dendl;
4951 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4952 created, modified);
4953 queued++;
4954 }
4955 return queued;
4956 }
4957
// Rebuild creating_pgs_by_osd_epoch from creating_pgs: for each pg still
// being created, determine the OSD (current acting primary) that should
// receive the create message and the epoch to tag it with, then swap the
// rebuilt index in under creating_pgs_lock.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      // pool deleted or pg merged away since the create was queued
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default: tag the create with the epoch the pg creation was requested at
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the previously-recorded epoch
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  // NOTE(review): this branch runs for every non-matching epoch
	  // bucket, repeatedly resetting 'mapped' until (unless) a match is
	  // found above — appears intentional but worth confirming upstream.
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  // publish the rebuilt index and remember which mapping epoch it reflects
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5005
5006 epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
5007 {
5008 dout(30) << __func__ << " osd." << osd << " next=" << next
5009 << " " << creating_pgs_by_osd_epoch << dendl;
5010 std::lock_guard<std::mutex> l(creating_pgs_lock);
5011 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
5012 dout(20) << __func__
5013 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
5014 // the subscribers will be updated when the mapping is completed anyway
5015 return next;
5016 }
5017 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
5018 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
5019 return next;
5020 ceph_assert(!creating_pgs_by_epoch->second.empty());
5021
5022 MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
5023 MOSDPGCreate2 *m = nullptr;
5024
5025 bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
5026
5027 epoch_t last = 0;
5028 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
5029 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
5030 auto epoch = epoch_pgs->first;
5031 auto& pgs = epoch_pgs->second;
5032 dout(20) << __func__ << " osd." << osd << " from " << next
5033 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
5034 last = epoch;
5035 for (auto& pg : pgs) {
5036 // Need the create time from the monitor using its clock to set
5037 // last_scrub_stamp upon pg creation.
5038 auto create = creating_pgs.pgs.find(pg.pgid);
5039 ceph_assert(create != creating_pgs.pgs.end());
5040 if (old) {
5041 if (!oldm) {
5042 oldm = new MOSDPGCreate(creating_pgs_epoch);
5043 }
5044 oldm->mkpg.emplace(pg.pgid,
5045 pg_create_t{create->second.create_epoch, pg.pgid, 0});
5046 oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
5047 } else {
5048 if (!m) {
5049 m = new MOSDPGCreate2(creating_pgs_epoch);
5050 }
5051 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
5052 create->second.create_stamp));
5053 if (create->second.history.epoch_created) {
5054 dout(20) << __func__ << " " << pg << " " << create->second.history
5055 << " " << create->second.past_intervals << dendl;
5056 m->pg_extra.emplace(pg, make_pair(create->second.history,
5057 create->second.past_intervals));
5058 }
5059 }
5060 dout(20) << __func__ << " will create " << pg
5061 << " at " << create->second.create_epoch << dendl;
5062 }
5063 }
5064 if (m) {
5065 con->send_message(m);
5066 } else if (oldm) {
5067 con->send_message(oldm);
5068 } else {
5069 dout(20) << __func__ << " osd." << osd << " from " << next
5070 << " has nothing to send" << dendl;
5071 return next;
5072 }
5073
5074 // sub is current through last + 1
5075 return last + 1;
5076 }
5077
5078 // TICK
5079
5080
// Periodic maintenance.  Runs on every monitor (manifest reload, cache
// tuning), then leader-only work: marking unresponsive OSDs down, auto-out
// of long-down OSDs, blocklist expiry, purged-snap pruning, and pool status
// updates — proposing a new map epoch if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which only the leader may do
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark OSDs with no recent beacon down
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;                // advance before possible erase(o) below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      // osd no longer pending auto-out (marked out above, or back up/out)
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5245
5246 void OSDMonitor::_set_new_cache_sizes()
5247 {
5248 uint64_t cache_size = 0;
5249 int64_t inc_alloc = 0;
5250 int64_t full_alloc = 0;
5251 int64_t kv_alloc = 0;
5252
5253 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5254 cache_size = pcm->get_tuned_mem();
5255 inc_alloc = inc_cache->get_committed_size();
5256 full_alloc = full_cache->get_committed_size();
5257 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5258 }
5259
5260 inc_osd_cache.set_bytes(inc_alloc);
5261 full_osd_cache.set_bytes(full_alloc);
5262
5263 dout(1) << __func__ << " cache_size:" << cache_size
5264 << " inc_alloc: " << inc_alloc
5265 << " full_alloc: " << full_alloc
5266 << " kv_alloc: " << kv_alloc
5267 << dendl;
5268 }
5269
// Scan all OSDs for missing beacons.  'last_osd_report' maps osd id ->
// (time of last beacon, reported beacon interval); entries are started here
// on first sight and erased when the osd ceases to exist.  Returns true if
// any osd was queued to be marked down (caller proposes the map change).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;  // already down; nothing to time out
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
        mon.clog->info() << "osd." << i << " marked down after no beacon for "
                         << diff << " seconds";
        derr << "no beacon from osd." << i << " since " << t->second.first
             << ", " << diff << " seconds ago. marking down" << dendl;
        // NOTE(review): Incremental::new_state appears to be an XOR mask of
        // state bits, so setting CEPH_OSD_UP here toggles the 'up' bit off,
        // i.e. marks the osd down (consistent with the log messages above) —
        // confirm against OSDMap::Incremental semantics.
        pending_inc.new_state[i] = CEPH_OSD_UP;
        new_down = true;
      }
    }
  }
  return new_down;
}
5313
5314 static void dump_cpu_list(Formatter *f, const char *name,
5315 const string& strlist)
5316 {
5317 cpu_set_t cpu_set;
5318 size_t cpu_set_size;
5319 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5320 return;
5321 }
5322 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5323 f->open_array_section(name);
5324 for (auto cpu : cpus) {
5325 f->dump_int("cpu", cpu);
5326 }
5327 f->close_section();
5328 }
5329
// Dump the OSDMonitor's state for debugging/inspection: the full osdmap,
// per-osd metadata, clean-epoch bookkeeping, committed version range, the
// crush map, and (when present) the osdmap manifest.  Section order is part
// of the output format; do not reorder.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // one entry per existing osd id
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);  // errors ignored here (NULL err stream)
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // only present when an osdmap manifest has been loaded
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5378
5379 namespace {
5380 enum osd_pool_get_choices {
5381 SIZE, MIN_SIZE,
5382 PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
5383 NODELETE, NOPGCHANGE, NOSIZECHANGE,
5384 WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
5385 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
5386 USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
5387 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
5388 CACHE_TARGET_FULL_RATIO,
5389 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
5390 ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
5391 MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
5392 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
5393 SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
5394 RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
5395 COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
5396 COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
5397 CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
5398 PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
5399 PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
5400 DEDUP_CDC_CHUNK_SIZE };
5401
5402 std::set<osd_pool_get_choices>
5403 subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
5404 const std::set<osd_pool_get_choices>& second)
5405 {
5406 std::set<osd_pool_get_choices> result;
5407 std::set_difference(first.begin(), first.end(),
5408 second.begin(), second.end(),
5409 std::inserter(result, result.end()));
5410 return result;
5411 }
5412 }
5413
5414
5415 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5416 {
5417 op->mark_osdmon_event(__func__);
5418 auto m = op->get_req<MMonCommand>();
5419 int r = 0;
5420 bufferlist rdata;
5421 stringstream ss, ds;
5422
5423 cmdmap_t cmdmap;
5424 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5425 string rs = ss.str();
5426 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5427 return true;
5428 }
5429
5430 MonSession *session = op->get_session();
5431 if (!session) {
5432 derr << __func__ << " no session" << dendl;
5433 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5434 return true;
5435 }
5436
5437 string prefix;
5438 cmd_getval(cmdmap, "prefix", prefix);
5439
5440 string format;
5441 cmd_getval(cmdmap, "format", format, string("plain"));
5442 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5443
5444 if (prefix == "osd stat") {
5445 if (f) {
5446 f->open_object_section("osdmap");
5447 osdmap.print_summary(f.get(), ds, "", true);
5448 f->close_section();
5449 f->flush(rdata);
5450 } else {
5451 osdmap.print_summary(nullptr, ds, "", true);
5452 rdata.append(ds);
5453 }
5454 }
5455 else if (prefix == "osd dump" ||
5456 prefix == "osd tree" ||
5457 prefix == "osd tree-from" ||
5458 prefix == "osd ls" ||
5459 prefix == "osd getmap" ||
5460 prefix == "osd getcrushmap" ||
5461 prefix == "osd ls-tree" ||
5462 prefix == "osd info") {
5463
5464 epoch_t epoch = 0;
5465 int64_t epochnum;
5466 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5467 epoch = epochnum;
5468
5469 bufferlist osdmap_bl;
5470 int err = get_version_full(epoch, osdmap_bl);
5471 if (err == -ENOENT) {
5472 r = -ENOENT;
5473 ss << "there is no map for epoch " << epoch;
5474 goto reply;
5475 }
5476 ceph_assert(err == 0);
5477 ceph_assert(osdmap_bl.length());
5478
5479 OSDMap *p;
5480 if (epoch == osdmap.get_epoch()) {
5481 p = &osdmap;
5482 } else {
5483 p = new OSDMap;
5484 p->decode(osdmap_bl);
5485 }
5486
5487 auto sg = make_scope_guard([&] {
5488 if (p != &osdmap) {
5489 delete p;
5490 }
5491 });
5492
5493 if (prefix == "osd dump") {
5494 stringstream ds;
5495 if (f) {
5496 f->open_object_section("osdmap");
5497 p->dump(f.get());
5498 f->close_section();
5499 f->flush(ds);
5500 } else {
5501 p->print(ds);
5502 }
5503 rdata.append(ds);
5504 if (!f)
5505 ds << " ";
5506 } else if (prefix == "osd ls") {
5507 if (f) {
5508 f->open_array_section("osds");
5509 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5510 if (osdmap.exists(i)) {
5511 f->dump_int("osd", i);
5512 }
5513 }
5514 f->close_section();
5515 f->flush(ds);
5516 } else {
5517 bool first = true;
5518 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5519 if (osdmap.exists(i)) {
5520 if (!first)
5521 ds << "\n";
5522 first = false;
5523 ds << i;
5524 }
5525 }
5526 }
5527 rdata.append(ds);
5528 } else if (prefix == "osd info") {
5529 int64_t osd_id;
5530 bool do_single_osd = true;
5531 if (!cmd_getval(cmdmap, "id", osd_id)) {
5532 do_single_osd = false;
5533 }
5534
5535 if (do_single_osd && !osdmap.exists(osd_id)) {
5536 ss << "osd." << osd_id << " does not exist";
5537 r = -EINVAL;
5538 goto reply;
5539 }
5540
5541 if (f) {
5542 if (do_single_osd) {
5543 osdmap.dump_osd(osd_id, f.get());
5544 } else {
5545 osdmap.dump_osds(f.get());
5546 }
5547 f->flush(ds);
5548 } else {
5549 if (do_single_osd) {
5550 osdmap.print_osd(osd_id, ds);
5551 } else {
5552 osdmap.print_osds(ds);
5553 }
5554 }
5555 rdata.append(ds);
5556 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5557 string bucket;
5558 if (prefix == "osd tree-from") {
5559 cmd_getval(cmdmap, "bucket", bucket);
5560 if (!osdmap.crush->name_exists(bucket)) {
5561 ss << "bucket '" << bucket << "' does not exist";
5562 r = -ENOENT;
5563 goto reply;
5564 }
5565 int id = osdmap.crush->get_item_id(bucket);
5566 if (id >= 0) {
5567 ss << "\"" << bucket << "\" is not a bucket";
5568 r = -EINVAL;
5569 goto reply;
5570 }
5571 }
5572
5573 vector<string> states;
5574 cmd_getval(cmdmap, "states", states);
5575 unsigned filter = 0;
5576 for (auto& s : states) {
5577 if (s == "up") {
5578 filter |= OSDMap::DUMP_UP;
5579 } else if (s == "down") {
5580 filter |= OSDMap::DUMP_DOWN;
5581 } else if (s == "in") {
5582 filter |= OSDMap::DUMP_IN;
5583 } else if (s == "out") {
5584 filter |= OSDMap::DUMP_OUT;
5585 } else if (s == "destroyed") {
5586 filter |= OSDMap::DUMP_DESTROYED;
5587 } else {
5588 ss << "unrecognized state '" << s << "'";
5589 r = -EINVAL;
5590 goto reply;
5591 }
5592 }
5593 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5594 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5595 ss << "cannot specify both 'in' and 'out'";
5596 r = -EINVAL;
5597 goto reply;
5598 }
5599 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5600 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5601 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5602 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5603 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5604 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5605 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5606 r = -EINVAL;
5607 goto reply;
5608 }
5609 if (f) {
5610 f->open_object_section("tree");
5611 p->print_tree(f.get(), NULL, filter, bucket);
5612 f->close_section();
5613 f->flush(ds);
5614 } else {
5615 p->print_tree(NULL, &ds, filter, bucket);
5616 }
5617 rdata.append(ds);
5618 } else if (prefix == "osd getmap") {
5619 rdata.append(osdmap_bl);
5620 ss << "got osdmap epoch " << p->get_epoch();
5621 } else if (prefix == "osd getcrushmap") {
5622 p->crush->encode(rdata, mon.get_quorum_con_features());
5623 ss << p->get_crush_version();
5624 } else if (prefix == "osd ls-tree") {
5625 string bucket_name;
5626 cmd_getval(cmdmap, "name", bucket_name);
5627 set<int> osds;
5628 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5629 if (r == -ENOENT) {
5630 ss << "\"" << bucket_name << "\" does not exist";
5631 goto reply;
5632 } else if (r < 0) {
5633 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5634 goto reply;
5635 }
5636
5637 if (f) {
5638 f->open_array_section("osds");
5639 for (auto &i : osds) {
5640 if (osdmap.exists(i)) {
5641 f->dump_int("osd", i);
5642 }
5643 }
5644 f->close_section();
5645 f->flush(ds);
5646 } else {
5647 bool first = true;
5648 for (auto &i : osds) {
5649 if (osdmap.exists(i)) {
5650 if (!first)
5651 ds << "\n";
5652 first = false;
5653 ds << i;
5654 }
5655 }
5656 }
5657
5658 rdata.append(ds);
5659 }
5660 } else if (prefix == "osd getmaxosd") {
5661 if (f) {
5662 f->open_object_section("getmaxosd");
5663 f->dump_unsigned("epoch", osdmap.get_epoch());
5664 f->dump_int("max_osd", osdmap.get_max_osd());
5665 f->close_section();
5666 f->flush(rdata);
5667 } else {
5668 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5669 rdata.append(ds);
5670 }
5671 } else if (prefix == "osd utilization") {
5672 string out;
5673 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5674 if (f)
5675 f->flush(rdata);
5676 else
5677 rdata.append(out);
5678 r = 0;
5679 goto reply;
5680 } else if (prefix == "osd find") {
5681 int64_t osd;
5682 if (!cmd_getval(cmdmap, "id", osd)) {
5683 ss << "unable to parse osd id value '"
5684 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5685 r = -EINVAL;
5686 goto reply;
5687 }
5688 if (!osdmap.exists(osd)) {
5689 ss << "osd." << osd << " does not exist";
5690 r = -ENOENT;
5691 goto reply;
5692 }
5693 string format;
5694 cmd_getval(cmdmap, "format", format);
5695 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5696 f->open_object_section("osd_location");
5697 f->dump_int("osd", osd);
5698 f->dump_object("addrs", osdmap.get_addrs(osd));
5699 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5700
5701 // try to identify host, pod/container name, etc.
5702 map<string,string> m;
5703 load_metadata(osd, m, nullptr);
5704 if (auto p = m.find("hostname"); p != m.end()) {
5705 f->dump_string("host", p->second);
5706 }
5707 for (auto& k : {
5708 "pod_name", "pod_namespace", // set by rook
5709 "container_name" // set by cephadm, ceph-ansible
5710 }) {
5711 if (auto p = m.find(k); p != m.end()) {
5712 f->dump_string(k, p->second);
5713 }
5714 }
5715
5716 // crush is helpful too
5717 f->open_object_section("crush_location");
5718 map<string,string> loc = osdmap.crush->get_full_location(osd);
5719 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5720 f->dump_string(p->first.c_str(), p->second);
5721 f->close_section();
5722 f->close_section();
5723 f->flush(rdata);
5724 } else if (prefix == "osd metadata") {
5725 int64_t osd = -1;
5726 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5727 !cmd_getval(cmdmap, "id", osd)) {
5728 ss << "unable to parse osd id value '"
5729 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5730 r = -EINVAL;
5731 goto reply;
5732 }
5733 if (osd >= 0 && !osdmap.exists(osd)) {
5734 ss << "osd." << osd << " does not exist";
5735 r = -ENOENT;
5736 goto reply;
5737 }
5738 string format;
5739 cmd_getval(cmdmap, "format", format);
5740 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5741 if (osd >= 0) {
5742 f->open_object_section("osd_metadata");
5743 f->dump_unsigned("id", osd);
5744 r = dump_osd_metadata(osd, f.get(), &ss);
5745 if (r < 0)
5746 goto reply;
5747 f->close_section();
5748 } else {
5749 r = 0;
5750 f->open_array_section("osd_metadata");
5751 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5752 if (osdmap.exists(i)) {
5753 f->open_object_section("osd");
5754 f->dump_unsigned("id", i);
5755 r = dump_osd_metadata(i, f.get(), NULL);
5756 if (r == -EINVAL || r == -ENOENT) {
5757 // Drop error, continue to get other daemons' metadata
5758 dout(4) << "No metadata for osd." << i << dendl;
5759 r = 0;
5760 } else if (r < 0) {
5761 // Unexpected error
5762 goto reply;
5763 }
5764 f->close_section();
5765 }
5766 }
5767 f->close_section();
5768 }
5769 f->flush(rdata);
5770 } else if (prefix == "osd versions") {
5771 if (!f)
5772 f.reset(Formatter::create("json-pretty"));
5773 count_metadata("ceph_version", f.get());
5774 f->flush(rdata);
5775 r = 0;
5776 } else if (prefix == "osd count-metadata") {
5777 if (!f)
5778 f.reset(Formatter::create("json-pretty"));
5779 string field;
5780 cmd_getval(cmdmap, "property", field);
5781 count_metadata(field, f.get());
5782 f->flush(rdata);
5783 r = 0;
5784 } else if (prefix == "osd numa-status") {
5785 TextTable tbl;
5786 if (f) {
5787 f->open_array_section("osds");
5788 } else {
5789 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5790 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5791 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5792 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5793 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5794 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5795 }
5796 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5797 if (osdmap.exists(i)) {
5798 map<string,string> m;
5799 ostringstream err;
5800 if (load_metadata(i, m, &err) < 0) {
5801 continue;
5802 }
5803 string host;
5804 auto p = m.find("hostname");
5805 if (p != m.end()) {
5806 host = p->second;
5807 }
5808 if (f) {
5809 f->open_object_section("osd");
5810 f->dump_int("osd", i);
5811 f->dump_string("host", host);
5812 for (auto n : { "network_numa_node", "objectstore_numa_node",
5813 "numa_node" }) {
5814 p = m.find(n);
5815 if (p != m.end()) {
5816 f->dump_int(n, atoi(p->second.c_str()));
5817 }
5818 }
5819 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5820 p = m.find(n);
5821 if (p != m.end()) {
5822 list<string> ls = get_str_list(p->second, ",");
5823 f->open_array_section(n);
5824 for (auto node : ls) {
5825 f->dump_int("node", atoi(node.c_str()));
5826 }
5827 f->close_section();
5828 }
5829 }
5830 for (auto n : { "numa_node_cpus" }) {
5831 p = m.find(n);
5832 if (p != m.end()) {
5833 dump_cpu_list(f.get(), n, p->second);
5834 }
5835 }
5836 f->close_section();
5837 } else {
5838 tbl << i;
5839 tbl << host;
5840 p = m.find("network_numa_nodes");
5841 if (p != m.end()) {
5842 tbl << p->second;
5843 } else {
5844 tbl << "-";
5845 }
5846 p = m.find("objectstore_numa_nodes");
5847 if (p != m.end()) {
5848 tbl << p->second;
5849 } else {
5850 tbl << "-";
5851 }
5852 p = m.find("numa_node");
5853 auto q = m.find("numa_node_cpus");
5854 if (p != m.end() && q != m.end()) {
5855 tbl << p->second;
5856 tbl << q->second;
5857 } else {
5858 tbl << "-";
5859 tbl << "-";
5860 }
5861 tbl << TextTable::endrow;
5862 }
5863 }
5864 }
5865 if (f) {
5866 f->close_section();
5867 f->flush(rdata);
5868 } else {
5869 rdata.append(stringify(tbl));
5870 }
5871 } else if (prefix == "osd map") {
5872 string poolstr, objstr, namespacestr;
5873 cmd_getval(cmdmap, "pool", poolstr);
5874 cmd_getval(cmdmap, "object", objstr);
5875 cmd_getval(cmdmap, "nspace", namespacestr);
5876
5877 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5878 if (pool < 0) {
5879 ss << "pool " << poolstr << " does not exist";
5880 r = -ENOENT;
5881 goto reply;
5882 }
5883 object_locator_t oloc(pool, namespacestr);
5884 object_t oid(objstr);
5885 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5886 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5887 vector<int> up, acting;
5888 int up_p, acting_p;
5889 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5890
5891 string fullobjname;
5892 if (!namespacestr.empty())
5893 fullobjname = namespacestr + string("/") + oid.name;
5894 else
5895 fullobjname = oid.name;
5896 if (f) {
5897 f->open_object_section("osd_map");
5898 f->dump_unsigned("epoch", osdmap.get_epoch());
5899 f->dump_string("pool", poolstr);
5900 f->dump_int("pool_id", pool);
5901 f->dump_stream("objname") << fullobjname;
5902 f->dump_stream("raw_pgid") << pgid;
5903 f->dump_stream("pgid") << mpgid;
5904 f->open_array_section("up");
5905 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5906 f->dump_int("osd", *p);
5907 f->close_section();
5908 f->dump_int("up_primary", up_p);
5909 f->open_array_section("acting");
5910 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5911 f->dump_int("osd", *p);
5912 f->close_section();
5913 f->dump_int("acting_primary", acting_p);
5914 f->close_section(); // osd_map
5915 f->flush(rdata);
5916 } else {
5917 ds << "osdmap e" << osdmap.get_epoch()
5918 << " pool '" << poolstr << "' (" << pool << ")"
5919 << " object '" << fullobjname << "' ->"
5920 << " pg " << pgid << " (" << mpgid << ")"
5921 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5922 << pg_vector_string(acting) << ", p" << acting_p << ")";
5923 rdata.append(ds);
5924 }
5925
5926 } else if (prefix == "pg map") {
5927 pg_t pgid;
5928 string pgidstr;
5929 cmd_getval(cmdmap, "pgid", pgidstr);
5930 if (!pgid.parse(pgidstr.c_str())) {
5931 ss << "invalid pgid '" << pgidstr << "'";
5932 r = -EINVAL;
5933 goto reply;
5934 }
5935 vector<int> up, acting;
5936 if (!osdmap.have_pg_pool(pgid.pool())) {
5937 ss << "pg '" << pgidstr << "' does not exist";
5938 r = -ENOENT;
5939 goto reply;
5940 }
5941 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5942 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5943 if (f) {
5944 f->open_object_section("pg_map");
5945 f->dump_unsigned("epoch", osdmap.get_epoch());
5946 f->dump_stream("raw_pgid") << pgid;
5947 f->dump_stream("pgid") << mpgid;
5948 f->open_array_section("up");
5949 for (auto osd : up) {
5950 f->dump_int("up_osd", osd);
5951 }
5952 f->close_section();
5953 f->open_array_section("acting");
5954 for (auto osd : acting) {
5955 f->dump_int("acting_osd", osd);
5956 }
5957 f->close_section();
5958 f->close_section();
5959 f->flush(rdata);
5960 } else {
5961 ds << "osdmap e" << osdmap.get_epoch()
5962 << " pg " << pgid << " (" << mpgid << ")"
5963 << " -> up " << up << " acting " << acting;
5964 rdata.append(ds);
5965 }
5966 goto reply;
5967
5968 } else if (prefix == "osd lspools") {
5969 if (f)
5970 f->open_array_section("pools");
5971 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5972 p != osdmap.pools.end();
5973 ++p) {
5974 if (f) {
5975 f->open_object_section("pool");
5976 f->dump_int("poolnum", p->first);
5977 f->dump_string("poolname", osdmap.pool_name[p->first]);
5978 f->close_section();
5979 } else {
5980 ds << p->first << ' ' << osdmap.pool_name[p->first];
5981 if (next(p) != osdmap.pools.end()) {
5982 ds << '\n';
5983 }
5984 }
5985 }
5986 if (f) {
5987 f->close_section();
5988 f->flush(ds);
5989 }
5990 rdata.append(ds);
5991 } else if (prefix == "osd blocklist ls" ||
5992 prefix == "osd blacklist ls") {
5993 if (f)
5994 f->open_array_section("blocklist");
5995
5996 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
5997 p != osdmap.blocklist.end();
5998 ++p) {
5999 if (f) {
6000 f->open_object_section("entry");
6001 f->dump_string("addr", p->first.get_legacy_str());
6002 f->dump_stream("until") << p->second;
6003 f->close_section();
6004 } else {
6005 stringstream ss;
6006 string s;
6007 ss << p->first << " " << p->second;
6008 getline(ss, s);
6009 s += "\n";
6010 rdata.append(s);
6011 }
6012 }
6013 if (f) {
6014 f->close_section();
6015 f->flush(rdata);
6016 }
6017 ss << "listed " << osdmap.blocklist.size() << " entries";
6018
6019 } else if (prefix == "osd pool ls") {
6020 string detail;
6021 cmd_getval(cmdmap, "detail", detail);
6022 if (!f && detail == "detail") {
6023 ostringstream ss;
6024 osdmap.print_pools(ss);
6025 rdata.append(ss.str());
6026 } else {
6027 if (f)
6028 f->open_array_section("pools");
6029 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
6030 it != osdmap.get_pools().end();
6031 ++it) {
6032 if (f) {
6033 if (detail == "detail") {
6034 f->open_object_section("pool");
6035 f->dump_int("pool_id", it->first);
6036 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6037 it->second.dump(f.get());
6038 f->close_section();
6039 } else {
6040 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6041 }
6042 } else {
6043 rdata.append(osdmap.get_pool_name(it->first) + "\n");
6044 }
6045 }
6046 if (f) {
6047 f->close_section();
6048 f->flush(rdata);
6049 }
6050 }
6051
6052 } else if (prefix == "osd crush get-tunable") {
6053 string tunable;
6054 cmd_getval(cmdmap, "tunable", tunable);
6055 ostringstream rss;
6056 if (f)
6057 f->open_object_section("tunable");
6058 if (tunable == "straw_calc_version") {
6059 if (f)
6060 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6061 else
6062 rss << osdmap.crush->get_straw_calc_version() << "\n";
6063 } else {
6064 r = -EINVAL;
6065 goto reply;
6066 }
6067 if (f) {
6068 f->close_section();
6069 f->flush(rdata);
6070 } else {
6071 rdata.append(rss.str());
6072 }
6073 r = 0;
6074
6075 } else if (prefix == "osd pool get") {
6076 string poolstr;
6077 cmd_getval(cmdmap, "pool", poolstr);
6078 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6079 if (pool < 0) {
6080 ss << "unrecognized pool '" << poolstr << "'";
6081 r = -ENOENT;
6082 goto reply;
6083 }
6084
6085 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6086 string var;
6087 cmd_getval(cmdmap, "var", var);
6088
6089 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6090 const choices_map_t ALL_CHOICES = {
6091 {"size", SIZE},
6092 {"min_size", MIN_SIZE},
6093 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6094 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
6095 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6096 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6097 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6098 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6099 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6100 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6101 {"use_gmt_hitset", USE_GMT_HITSET},
6102 {"target_max_objects", TARGET_MAX_OBJECTS},
6103 {"target_max_bytes", TARGET_MAX_BYTES},
6104 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6105 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6106 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6107 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6108 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6109 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6110 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6111 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6112 {"fast_read", FAST_READ},
6113 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6114 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6115 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6116 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6117 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6118 {"recovery_priority", RECOVERY_PRIORITY},
6119 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6120 {"scrub_priority", SCRUB_PRIORITY},
6121 {"compression_mode", COMPRESSION_MODE},
6122 {"compression_algorithm", COMPRESSION_ALGORITHM},
6123 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6124 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6125 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6126 {"csum_type", CSUM_TYPE},
6127 {"csum_max_block", CSUM_MAX_BLOCK},
6128 {"csum_min_block", CSUM_MIN_BLOCK},
6129 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6130 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6131 {"pg_num_min", PG_NUM_MIN},
6132 {"target_size_bytes", TARGET_SIZE_BYTES},
6133 {"target_size_ratio", TARGET_SIZE_RATIO},
6134 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6135 {"dedup_tier", DEDUP_TIER},
6136 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6137 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
6138 };
6139
6140 typedef std::set<osd_pool_get_choices> choices_set_t;
6141
6142 const choices_set_t ONLY_TIER_CHOICES = {
6143 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6144 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6145 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6146 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6147 MIN_READ_RECENCY_FOR_PROMOTE,
6148 MIN_WRITE_RECENCY_FOR_PROMOTE,
6149 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6150 };
6151 const choices_set_t ONLY_ERASURE_CHOICES = {
6152 EC_OVERWRITES, ERASURE_CODE_PROFILE
6153 };
6154
6155 choices_set_t selected_choices;
6156 if (var == "all") {
6157 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6158 it != ALL_CHOICES.end(); ++it) {
6159 selected_choices.insert(it->second);
6160 }
6161
6162 if(!p->is_tier()) {
6163 selected_choices = subtract_second_from_first(selected_choices,
6164 ONLY_TIER_CHOICES);
6165 }
6166
6167 if(!p->is_erasure()) {
6168 selected_choices = subtract_second_from_first(selected_choices,
6169 ONLY_ERASURE_CHOICES);
6170 }
6171 } else /* var != "all" */ {
6172 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6173 osd_pool_get_choices selected = found->second;
6174
6175 if (!p->is_tier() &&
6176 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6177 ss << "pool '" << poolstr
6178 << "' is not a tier pool: variable not applicable";
6179 r = -EACCES;
6180 goto reply;
6181 }
6182
6183 if (!p->is_erasure() &&
6184 ONLY_ERASURE_CHOICES.find(selected)
6185 != ONLY_ERASURE_CHOICES.end()) {
6186 ss << "pool '" << poolstr
6187 << "' is not a erasure pool: variable not applicable";
6188 r = -EACCES;
6189 goto reply;
6190 }
6191
6192 if (pool_opts_t::is_opt_name(var) &&
6193 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6194 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6195 r = -ENOENT;
6196 goto reply;
6197 }
6198
6199 selected_choices.insert(selected);
6200 }
6201
6202 if (f) {
6203 f->open_object_section("pool");
6204 f->dump_string("pool", poolstr);
6205 f->dump_int("pool_id", pool);
6206 for(choices_set_t::const_iterator it = selected_choices.begin();
6207 it != selected_choices.end(); ++it) {
6208 choices_map_t::const_iterator i;
6209 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6210 if (i->second == *it) {
6211 break;
6212 }
6213 }
6214 ceph_assert(i != ALL_CHOICES.end());
6215 switch(*it) {
6216 case PG_NUM:
6217 f->dump_int("pg_num", p->get_pg_num());
6218 break;
6219 case PGP_NUM:
6220 f->dump_int("pgp_num", p->get_pgp_num());
6221 break;
6222 case SIZE:
6223 f->dump_int("size", p->get_size());
6224 break;
6225 case MIN_SIZE:
6226 f->dump_int("min_size", p->get_min_size());
6227 break;
6228 case CRUSH_RULE:
6229 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6230 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6231 p->get_crush_rule()));
6232 } else {
6233 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6234 }
6235 break;
6236 case EC_OVERWRITES:
6237 f->dump_bool("allow_ec_overwrites",
6238 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6239 break;
6240 case PG_AUTOSCALE_MODE:
6241 f->dump_string("pg_autoscale_mode",
6242 pg_pool_t::get_pg_autoscale_mode_name(
6243 p->pg_autoscale_mode));
6244 break;
6245 case HASHPSPOOL:
6246 case NODELETE:
6247 case NOPGCHANGE:
6248 case NOSIZECHANGE:
6249 case WRITE_FADVISE_DONTNEED:
6250 case NOSCRUB:
6251 case NODEEP_SCRUB:
6252 f->dump_bool(i->first.c_str(),
6253 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6254 break;
6255 case HIT_SET_PERIOD:
6256 f->dump_int("hit_set_period", p->hit_set_period);
6257 break;
6258 case HIT_SET_COUNT:
6259 f->dump_int("hit_set_count", p->hit_set_count);
6260 break;
6261 case HIT_SET_TYPE:
6262 f->dump_string("hit_set_type",
6263 HitSet::get_type_name(p->hit_set_params.get_type()));
6264 break;
6265 case HIT_SET_FPP:
6266 {
6267 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6268 BloomHitSet::Params *bloomp =
6269 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6270 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6271 } else if(var != "all") {
6272 f->close_section();
6273 ss << "hit set is not of type Bloom; " <<
6274 "invalid to get a false positive rate!";
6275 r = -EINVAL;
6276 goto reply;
6277 }
6278 }
6279 break;
6280 case USE_GMT_HITSET:
6281 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6282 break;
6283 case TARGET_MAX_OBJECTS:
6284 f->dump_unsigned("target_max_objects", p->target_max_objects);
6285 break;
6286 case TARGET_MAX_BYTES:
6287 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6288 break;
6289 case CACHE_TARGET_DIRTY_RATIO:
6290 f->dump_unsigned("cache_target_dirty_ratio_micro",
6291 p->cache_target_dirty_ratio_micro);
6292 f->dump_float("cache_target_dirty_ratio",
6293 ((float)p->cache_target_dirty_ratio_micro/1000000));
6294 break;
6295 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6296 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6297 p->cache_target_dirty_high_ratio_micro);
6298 f->dump_float("cache_target_dirty_high_ratio",
6299 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6300 break;
6301 case CACHE_TARGET_FULL_RATIO:
6302 f->dump_unsigned("cache_target_full_ratio_micro",
6303 p->cache_target_full_ratio_micro);
6304 f->dump_float("cache_target_full_ratio",
6305 ((float)p->cache_target_full_ratio_micro/1000000));
6306 break;
6307 case CACHE_MIN_FLUSH_AGE:
6308 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6309 break;
6310 case CACHE_MIN_EVICT_AGE:
6311 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6312 break;
6313 case ERASURE_CODE_PROFILE:
6314 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6315 break;
6316 case MIN_READ_RECENCY_FOR_PROMOTE:
6317 f->dump_int("min_read_recency_for_promote",
6318 p->min_read_recency_for_promote);
6319 break;
6320 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6321 f->dump_int("min_write_recency_for_promote",
6322 p->min_write_recency_for_promote);
6323 break;
6324 case FAST_READ:
6325 f->dump_int("fast_read", p->fast_read);
6326 break;
6327 case HIT_SET_GRADE_DECAY_RATE:
6328 f->dump_int("hit_set_grade_decay_rate",
6329 p->hit_set_grade_decay_rate);
6330 break;
6331 case HIT_SET_SEARCH_LAST_N:
6332 f->dump_int("hit_set_search_last_n",
6333 p->hit_set_search_last_n);
6334 break;
6335 case SCRUB_MIN_INTERVAL:
6336 case SCRUB_MAX_INTERVAL:
6337 case DEEP_SCRUB_INTERVAL:
6338 case RECOVERY_PRIORITY:
6339 case RECOVERY_OP_PRIORITY:
6340 case SCRUB_PRIORITY:
6341 case COMPRESSION_MODE:
6342 case COMPRESSION_ALGORITHM:
6343 case COMPRESSION_REQUIRED_RATIO:
6344 case COMPRESSION_MAX_BLOB_SIZE:
6345 case COMPRESSION_MIN_BLOB_SIZE:
6346 case CSUM_TYPE:
6347 case CSUM_MAX_BLOCK:
6348 case CSUM_MIN_BLOCK:
6349 case FINGERPRINT_ALGORITHM:
6350 case PG_NUM_MIN:
6351 case TARGET_SIZE_BYTES:
6352 case TARGET_SIZE_RATIO:
6353 case PG_AUTOSCALE_BIAS:
6354 case DEDUP_TIER:
6355 case DEDUP_CHUNK_ALGORITHM:
6356 case DEDUP_CDC_CHUNK_SIZE:
6357 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6358 if (p->opts.is_set(key)) {
6359 if(*it == CSUM_TYPE) {
6360 int64_t val;
6361 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6362 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6363 } else {
6364 p->opts.dump(i->first, f.get());
6365 }
6366 }
6367 break;
6368 }
6369 }
6370 f->close_section();
6371 f->flush(rdata);
6372 } else /* !f */ {
6373 for(choices_set_t::const_iterator it = selected_choices.begin();
6374 it != selected_choices.end(); ++it) {
6375 choices_map_t::const_iterator i;
6376 switch(*it) {
6377 case PG_NUM:
6378 ss << "pg_num: " << p->get_pg_num() << "\n";
6379 break;
6380 case PGP_NUM:
6381 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6382 break;
6383 case SIZE:
6384 ss << "size: " << p->get_size() << "\n";
6385 break;
6386 case MIN_SIZE:
6387 ss << "min_size: " << p->get_min_size() << "\n";
6388 break;
6389 case CRUSH_RULE:
6390 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6391 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6392 p->get_crush_rule()) << "\n";
6393 } else {
6394 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6395 }
6396 break;
6397 case PG_AUTOSCALE_MODE:
6398 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6399 p->pg_autoscale_mode) <<"\n";
6400 break;
6401 case HIT_SET_PERIOD:
6402 ss << "hit_set_period: " << p->hit_set_period << "\n";
6403 break;
6404 case HIT_SET_COUNT:
6405 ss << "hit_set_count: " << p->hit_set_count << "\n";
6406 break;
6407 case HIT_SET_TYPE:
6408 ss << "hit_set_type: " <<
6409 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6410 break;
6411 case HIT_SET_FPP:
6412 {
6413 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6414 BloomHitSet::Params *bloomp =
6415 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6416 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6417 } else if(var != "all") {
6418 ss << "hit set is not of type Bloom; " <<
6419 "invalid to get a false positive rate!";
6420 r = -EINVAL;
6421 goto reply;
6422 }
6423 }
6424 break;
6425 case USE_GMT_HITSET:
6426 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6427 break;
6428 case TARGET_MAX_OBJECTS:
6429 ss << "target_max_objects: " << p->target_max_objects << "\n";
6430 break;
6431 case TARGET_MAX_BYTES:
6432 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6433 break;
6434 case CACHE_TARGET_DIRTY_RATIO:
6435 ss << "cache_target_dirty_ratio: "
6436 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6437 break;
6438 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6439 ss << "cache_target_dirty_high_ratio: "
6440 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6441 break;
6442 case CACHE_TARGET_FULL_RATIO:
6443 ss << "cache_target_full_ratio: "
6444 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6445 break;
6446 case CACHE_MIN_FLUSH_AGE:
6447 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6448 break;
6449 case CACHE_MIN_EVICT_AGE:
6450 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6451 break;
6452 case ERASURE_CODE_PROFILE:
6453 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6454 break;
6455 case MIN_READ_RECENCY_FOR_PROMOTE:
6456 ss << "min_read_recency_for_promote: " <<
6457 p->min_read_recency_for_promote << "\n";
6458 break;
6459 case HIT_SET_GRADE_DECAY_RATE:
6460 ss << "hit_set_grade_decay_rate: " <<
6461 p->hit_set_grade_decay_rate << "\n";
6462 break;
6463 case HIT_SET_SEARCH_LAST_N:
6464 ss << "hit_set_search_last_n: " <<
6465 p->hit_set_search_last_n << "\n";
6466 break;
6467 case EC_OVERWRITES:
6468 ss << "allow_ec_overwrites: " <<
6469 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6470 "\n";
6471 break;
6472 case HASHPSPOOL:
6473 case NODELETE:
6474 case NOPGCHANGE:
6475 case NOSIZECHANGE:
6476 case WRITE_FADVISE_DONTNEED:
6477 case NOSCRUB:
6478 case NODEEP_SCRUB:
6479 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6480 if (i->second == *it)
6481 break;
6482 }
6483 ceph_assert(i != ALL_CHOICES.end());
6484 ss << i->first << ": " <<
6485 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6486 "true" : "false") << "\n";
6487 break;
6488 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6489 ss << "min_write_recency_for_promote: " <<
6490 p->min_write_recency_for_promote << "\n";
6491 break;
6492 case FAST_READ:
6493 ss << "fast_read: " << p->fast_read << "\n";
6494 break;
6495 case SCRUB_MIN_INTERVAL:
6496 case SCRUB_MAX_INTERVAL:
6497 case DEEP_SCRUB_INTERVAL:
6498 case RECOVERY_PRIORITY:
6499 case RECOVERY_OP_PRIORITY:
6500 case SCRUB_PRIORITY:
6501 case COMPRESSION_MODE:
6502 case COMPRESSION_ALGORITHM:
6503 case COMPRESSION_REQUIRED_RATIO:
6504 case COMPRESSION_MAX_BLOB_SIZE:
6505 case COMPRESSION_MIN_BLOB_SIZE:
6506 case CSUM_TYPE:
6507 case CSUM_MAX_BLOCK:
6508 case CSUM_MIN_BLOCK:
6509 case FINGERPRINT_ALGORITHM:
6510 case PG_NUM_MIN:
6511 case TARGET_SIZE_BYTES:
6512 case TARGET_SIZE_RATIO:
6513 case PG_AUTOSCALE_BIAS:
6514 case DEDUP_TIER:
6515 case DEDUP_CHUNK_ALGORITHM:
6516 case DEDUP_CDC_CHUNK_SIZE:
6517 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6518 if (i->second == *it)
6519 break;
6520 }
6521 ceph_assert(i != ALL_CHOICES.end());
6522 {
6523 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6524 if (p->opts.is_set(key)) {
6525 if(key == pool_opts_t::CSUM_TYPE) {
6526 int64_t val;
6527 p->opts.get(key, &val);
6528 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6529 } else {
6530 ss << i->first << ": " << p->opts.get(key) << "\n";
6531 }
6532 }
6533 }
6534 break;
6535 }
6536 rdata.append(ss.str());
6537 ss.str("");
6538 }
6539 }
6540 r = 0;
6541 } else if (prefix == "osd pool get-quota") {
6542 string pool_name;
6543 cmd_getval(cmdmap, "pool", pool_name);
6544
6545 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6546 if (poolid < 0) {
6547 ceph_assert(poolid == -ENOENT);
6548 ss << "unrecognized pool '" << pool_name << "'";
6549 r = -ENOENT;
6550 goto reply;
6551 }
6552 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6553 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
6554 const object_stat_sum_t& sum = pstat->stats.sum;
6555 if (f) {
6556 f->open_object_section("pool_quotas");
6557 f->dump_string("pool_name", pool_name);
6558 f->dump_unsigned("pool_id", poolid);
6559 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6560 f->dump_int("current_num_objects", sum.num_objects);
6561 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6562 f->dump_int("current_num_bytes", sum.num_bytes);
6563 f->close_section();
6564 f->flush(rdata);
6565 } else {
6566 stringstream rs;
6567 rs << "quotas for pool '" << pool_name << "':\n"
6568 << " max objects: ";
6569 if (p->quota_max_objects == 0)
6570 rs << "N/A";
6571 else {
6572 rs << si_u_t(p->quota_max_objects) << " objects";
6573 rs << " (current num objects: " << sum.num_objects << " objects)";
6574 }
6575 rs << "\n"
6576 << " max bytes : ";
6577 if (p->quota_max_bytes == 0)
6578 rs << "N/A";
6579 else {
6580 rs << byte_u_t(p->quota_max_bytes);
6581 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6582 }
6583 rdata.append(rs.str());
6584 }
6585 rdata.append("\n");
6586 r = 0;
6587 } else if (prefix == "osd crush rule list" ||
6588 prefix == "osd crush rule ls") {
6589 if (f) {
6590 f->open_array_section("rules");
6591 osdmap.crush->list_rules(f.get());
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 ostringstream ss;
6596 osdmap.crush->list_rules(&ss);
6597 rdata.append(ss.str());
6598 }
6599 } else if (prefix == "osd crush rule ls-by-class") {
6600 string class_name;
6601 cmd_getval(cmdmap, "class", class_name);
6602 if (class_name.empty()) {
6603 ss << "no class specified";
6604 r = -EINVAL;
6605 goto reply;
6606 }
6607 set<int> rules;
6608 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6609 if (r < 0) {
6610 ss << "failed to get rules by class '" << class_name << "'";
6611 goto reply;
6612 }
6613 if (f) {
6614 f->open_array_section("rules");
6615 for (auto &rule: rules) {
6616 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6617 }
6618 f->close_section();
6619 f->flush(rdata);
6620 } else {
6621 ostringstream rs;
6622 for (auto &rule: rules) {
6623 rs << osdmap.crush->get_rule_name(rule) << "\n";
6624 }
6625 rdata.append(rs.str());
6626 }
6627 } else if (prefix == "osd crush rule dump") {
6628 string name;
6629 cmd_getval(cmdmap, "name", name);
6630 string format;
6631 cmd_getval(cmdmap, "format", format);
6632 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6633 if (name == "") {
6634 f->open_array_section("rules");
6635 osdmap.crush->dump_rules(f.get());
6636 f->close_section();
6637 } else {
6638 int ruleno = osdmap.crush->get_rule_id(name);
6639 if (ruleno < 0) {
6640 ss << "unknown crush rule '" << name << "'";
6641 r = ruleno;
6642 goto reply;
6643 }
6644 osdmap.crush->dump_rule(ruleno, f.get());
6645 }
6646 ostringstream rs;
6647 f->flush(rs);
6648 rs << "\n";
6649 rdata.append(rs.str());
6650 } else if (prefix == "osd crush dump") {
6651 string format;
6652 cmd_getval(cmdmap, "format", format);
6653 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6654 f->open_object_section("crush_map");
6655 osdmap.crush->dump(f.get());
6656 f->close_section();
6657 ostringstream rs;
6658 f->flush(rs);
6659 rs << "\n";
6660 rdata.append(rs.str());
6661 } else if (prefix == "osd crush show-tunables") {
6662 string format;
6663 cmd_getval(cmdmap, "format", format);
6664 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6665 f->open_object_section("crush_map_tunables");
6666 osdmap.crush->dump_tunables(f.get());
6667 f->close_section();
6668 ostringstream rs;
6669 f->flush(rs);
6670 rs << "\n";
6671 rdata.append(rs.str());
6672 } else if (prefix == "osd crush tree") {
6673 string shadow;
6674 cmd_getval(cmdmap, "shadow", shadow);
6675 bool show_shadow = shadow == "--show-shadow";
6676 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6677 if (f) {
6678 f->open_object_section("crush_tree");
6679 osdmap.crush->dump_tree(nullptr,
6680 f.get(),
6681 osdmap.get_pool_names(),
6682 show_shadow);
6683 f->close_section();
6684 f->flush(rdata);
6685 } else {
6686 ostringstream ss;
6687 osdmap.crush->dump_tree(&ss,
6688 nullptr,
6689 osdmap.get_pool_names(),
6690 show_shadow);
6691 rdata.append(ss.str());
6692 }
6693 } else if (prefix == "osd crush ls") {
6694 string name;
6695 if (!cmd_getval(cmdmap, "node", name)) {
6696 ss << "no node specified";
6697 r = -EINVAL;
6698 goto reply;
6699 }
6700 if (!osdmap.crush->name_exists(name)) {
6701 ss << "node '" << name << "' does not exist";
6702 r = -ENOENT;
6703 goto reply;
6704 }
6705 int id = osdmap.crush->get_item_id(name);
6706 list<int> result;
6707 if (id >= 0) {
6708 result.push_back(id);
6709 } else {
6710 int num = osdmap.crush->get_bucket_size(id);
6711 for (int i = 0; i < num; ++i) {
6712 result.push_back(osdmap.crush->get_bucket_item(id, i));
6713 }
6714 }
6715 if (f) {
6716 f->open_array_section("items");
6717 for (auto i : result) {
6718 f->dump_string("item", osdmap.crush->get_item_name(i));
6719 }
6720 f->close_section();
6721 f->flush(rdata);
6722 } else {
6723 ostringstream ss;
6724 for (auto i : result) {
6725 ss << osdmap.crush->get_item_name(i) << "\n";
6726 }
6727 rdata.append(ss.str());
6728 }
6729 r = 0;
6730 } else if (prefix == "osd crush class ls") {
6731 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6732 f->open_array_section("crush_classes");
6733 for (auto i : osdmap.crush->class_name)
6734 f->dump_string("class", i.second);
6735 f->close_section();
6736 f->flush(rdata);
6737 } else if (prefix == "osd crush class ls-osd") {
6738 string name;
6739 cmd_getval(cmdmap, "class", name);
6740 set<int> osds;
6741 osdmap.crush->get_devices_by_class(name, &osds);
6742 if (f) {
6743 f->open_array_section("osds");
6744 for (auto &osd: osds)
6745 f->dump_int("osd", osd);
6746 f->close_section();
6747 f->flush(rdata);
6748 } else {
6749 bool first = true;
6750 for (auto &osd : osds) {
6751 if (!first)
6752 ds << "\n";
6753 first = false;
6754 ds << osd;
6755 }
6756 rdata.append(ds);
6757 }
6758 } else if (prefix == "osd crush get-device-class") {
6759 vector<string> idvec;
6760 cmd_getval(cmdmap, "ids", idvec);
6761 map<int, string> class_by_osd;
6762 for (auto& id : idvec) {
6763 ostringstream ts;
6764 long osd = parse_osd_id(id.c_str(), &ts);
6765 if (osd < 0) {
6766 ss << "unable to parse osd id:'" << id << "'";
6767 r = -EINVAL;
6768 goto reply;
6769 }
6770 auto device_class = osdmap.crush->get_item_class(osd);
6771 if (device_class)
6772 class_by_osd[osd] = device_class;
6773 else
6774 class_by_osd[osd] = ""; // no class
6775 }
6776 if (f) {
6777 f->open_array_section("osd_device_classes");
6778 for (auto& i : class_by_osd) {
6779 f->open_object_section("osd_device_class");
6780 f->dump_int("osd", i.first);
6781 f->dump_string("device_class", i.second);
6782 f->close_section();
6783 }
6784 f->close_section();
6785 f->flush(rdata);
6786 } else {
6787 if (class_by_osd.size() == 1) {
6788 // for single input, make a clean output
6789 ds << class_by_osd.begin()->second;
6790 } else {
6791 // note that we do not group osds by class here
6792 for (auto it = class_by_osd.begin();
6793 it != class_by_osd.end();
6794 it++) {
6795 ds << "osd." << it->first << ' ' << it->second;
6796 if (next(it) != class_by_osd.end())
6797 ds << '\n';
6798 }
6799 }
6800 rdata.append(ds);
6801 }
6802 } else if (prefix == "osd erasure-code-profile ls") {
6803 const auto &profiles = osdmap.get_erasure_code_profiles();
6804 if (f)
6805 f->open_array_section("erasure-code-profiles");
6806 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6807 if (f)
6808 f->dump_string("profile", i->first.c_str());
6809 else
6810 rdata.append(i->first + "\n");
6811 }
6812 if (f) {
6813 f->close_section();
6814 ostringstream rs;
6815 f->flush(rs);
6816 rs << "\n";
6817 rdata.append(rs.str());
6818 }
6819 } else if (prefix == "osd crush weight-set ls") {
6820 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6821 if (f) {
6822 f->open_array_section("weight_sets");
6823 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6824 f->dump_string("pool", "(compat)");
6825 }
6826 for (auto& i : osdmap.crush->choose_args) {
6827 if (i.first >= 0) {
6828 f->dump_string("pool", osdmap.get_pool_name(i.first));
6829 }
6830 }
6831 f->close_section();
6832 f->flush(rdata);
6833 } else {
6834 ostringstream rs;
6835 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6836 rs << "(compat)\n";
6837 }
6838 for (auto& i : osdmap.crush->choose_args) {
6839 if (i.first >= 0) {
6840 rs << osdmap.get_pool_name(i.first) << "\n";
6841 }
6842 }
6843 rdata.append(rs.str());
6844 }
6845 } else if (prefix == "osd crush weight-set dump") {
6846 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6847 "json-pretty"));
6848 osdmap.crush->dump_choose_args(f.get());
6849 f->flush(rdata);
6850 } else if (prefix == "osd erasure-code-profile get") {
6851 string name;
6852 cmd_getval(cmdmap, "name", name);
6853 if (!osdmap.has_erasure_code_profile(name)) {
6854 ss << "unknown erasure code profile '" << name << "'";
6855 r = -ENOENT;
6856 goto reply;
6857 }
6858 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6859 if (f)
6860 f->open_object_section("profile");
6861 for (map<string,string>::const_iterator i = profile.begin();
6862 i != profile.end();
6863 ++i) {
6864 if (f)
6865 f->dump_string(i->first.c_str(), i->second.c_str());
6866 else
6867 rdata.append(i->first + "=" + i->second + "\n");
6868 }
6869 if (f) {
6870 f->close_section();
6871 ostringstream rs;
6872 f->flush(rs);
6873 rs << "\n";
6874 rdata.append(rs.str());
6875 }
6876 } else if (prefix == "osd pool application get") {
6877 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6878 "json-pretty"));
6879 string pool_name;
6880 cmd_getval(cmdmap, "pool", pool_name);
6881 string app;
6882 cmd_getval(cmdmap, "app", app);
6883 string key;
6884 cmd_getval(cmdmap, "key", key);
6885
6886 if (pool_name.empty()) {
6887 // all
6888 f->open_object_section("pools");
6889 for (const auto &pool : osdmap.pools) {
6890 std::string name("<unknown>");
6891 const auto &pni = osdmap.pool_name.find(pool.first);
6892 if (pni != osdmap.pool_name.end())
6893 name = pni->second;
6894 f->open_object_section(name.c_str());
6895 for (auto &app_pair : pool.second.application_metadata) {
6896 f->open_object_section(app_pair.first.c_str());
6897 for (auto &kv_pair : app_pair.second) {
6898 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6899 }
6900 f->close_section();
6901 }
6902 f->close_section(); // name
6903 }
6904 f->close_section(); // pools
6905 f->flush(rdata);
6906 } else {
6907 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6908 if (pool < 0) {
6909 ss << "unrecognized pool '" << pool_name << "'";
6910 r = -ENOENT;
6911 goto reply;
6912 }
6913 auto p = osdmap.get_pg_pool(pool);
6914 // filter by pool
6915 if (app.empty()) {
6916 f->open_object_section(pool_name.c_str());
6917 for (auto &app_pair : p->application_metadata) {
6918 f->open_object_section(app_pair.first.c_str());
6919 for (auto &kv_pair : app_pair.second) {
6920 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6921 }
6922 f->close_section(); // application
6923 }
6924 f->close_section(); // pool_name
6925 f->flush(rdata);
6926 goto reply;
6927 }
6928
6929 auto app_it = p->application_metadata.find(app);
6930 if (app_it == p->application_metadata.end()) {
6931 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6932 r = -ENOENT;
6933 goto reply;
6934 }
6935 // filter by pool + app
6936 if (key.empty()) {
6937 f->open_object_section(app_it->first.c_str());
6938 for (auto &kv_pair : app_it->second) {
6939 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6940 }
6941 f->close_section(); // application
6942 f->flush(rdata);
6943 goto reply;
6944 }
6945 // filter by pool + app + key
6946 auto key_it = app_it->second.find(key);
6947 if (key_it == app_it->second.end()) {
6948 ss << "application '" << app << "' on pool '" << pool_name
6949 << "' does not have key '" << key << "'";
6950 r = -ENOENT;
6951 goto reply;
6952 }
6953 ss << key_it->second << "\n";
6954 rdata.append(ss.str());
6955 ss.str("");
6956 }
6957 } else if (prefix == "osd get-require-min-compat-client") {
6958 ss << osdmap.require_min_compat_client << std::endl;
6959 rdata.append(ss.str());
6960 ss.str("");
6961 goto reply;
6962 } else if (prefix == "osd pool application enable" ||
6963 prefix == "osd pool application disable" ||
6964 prefix == "osd pool application set" ||
6965 prefix == "osd pool application rm") {
6966 bool changed = false;
6967 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6968 if (r != 0) {
6969 // Error, reply.
6970 goto reply;
6971 } else if (changed) {
6972 // Valid mutation, proceed to prepare phase
6973 return false;
6974 } else {
6975 // Idempotent case, reply
6976 goto reply;
6977 }
6978 } else {
6979 // try prepare update
6980 return false;
6981 }
6982
6983 reply:
6984 string rs;
6985 getline(ss, rs);
6986 mon.reply_command(op, r, rs, rdata, get_last_committed());
6987 return true;
6988 }
6989
6990 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6991 {
6992 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6993 osdmap.get_pg_pool(pool_id));
6994 ceph_assert(pool);
6995 pool->set_flag(flags);
6996 }
6997
6998 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6999 {
7000 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7001 osdmap.get_pg_pool(pool_id));
7002 ceph_assert(pool);
7003 pool->unset_flag(flags);
7004 }
7005
7006 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7007 {
7008 char k[80];
7009 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7010 return k;
7011 }
7012
7013 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7014 {
7015 char k[80];
7016 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7017 (unsigned long long)pool, (unsigned long long)snap);
7018 return k;
7019 }
7020
// Build both the key and the value for a purged-snap interval record
// covering [snap, snap+num).  The value encodes (begin, end, epoch); the
// returned key embeds the *last* snap of the interval.
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
7032
7033
// Find the purged-snap interval (if any) that contains @snap for @pool.
//
// Records are keyed "purged_snap_<pool>_<last-snap-hex>" (see
// make_purged_snap_key), so a lower_bound seek on the key for @snap lands
// on the first interval whose *last* snap is >= @snap; we then verify the
// record belongs to this pool and actually overlaps @snap.
//
// On success, *begin/*end receive the stored [begin,end) interval and 0 is
// returned; -ENOENT means no matching interval exists.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // nothing at or after our key
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // iterator landed on some other record type under OSD_SNAP_PREFIX
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // first record at/after our key belongs to a different pool
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value layout is (begin, end, epoch) -- see make_purged_snap_key_value
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7083
// Record [start,end) as purged for @pool in transaction @t, coalescing with
// any interval already stored on either side.  lookup_purged_snap() on
// start-1 and on end tells us whether an adjacent interval exists before
// and/or after; the four branches handle join-both, join-left, join-right,
// and no-join.  Because keys embed the *last* snap of an interval, growing
// an interval's end means writing a new key; only stale keys are erased.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // adjacent intervals on both sides: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extend the earlier interval forward to cover [start,end)
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extend the later interval backward; its key (last snap) is unchanged
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no neighbors: write a fresh interval record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7139
// Move snaps that the OSDs (via the mgr stat digest) report as fully purged
// out of the removed_snaps_queue, recording them in
// pending_inc.new_purged_snaps.  Work is budgeted to at most
// mon_max_snap_prune_per_epoch snaps per epoch.  Returns true if anything
// was queued for pruning.
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon.mgrstatmon()->is_readable()) {
    // need the mgr stat digest to know what the OSDs have purged
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // 0 means "unlimited"; substitute a large finite cap
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    // track the budget optimistically while scanning this pool's intervals
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7218
// Re-evaluate every pool's quota state against the latest mgr pool stats,
// setting or clearing FLAG_FULL_QUOTA (and FLAG_FULL) in the pending
// incremental as pools cross their quota_max_bytes / quota_max_objects
// limits.  Returns true if any pool's flags were changed.
bool OSDMonitor::update_pools_status()
{
  // pool stats come from the mgr stat monitor; bail until they're usable
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a quota of 0 means "no quota" for both bytes and objects
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // pool currently flagged full-by-quota; clear the flags once it drops
      // back under quota
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // pool not flagged yet; flag it if it has now exceeded a quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7279
7280 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7281 {
7282 op->mark_osdmon_event(__func__);
7283 auto m = op->get_req<MPoolOp>();
7284 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7285 MonSession *session = op->get_session();
7286 if (!session)
7287 return -EPERM;
7288 string erasure_code_profile;
7289 stringstream ss;
7290 string rule_name;
7291 int ret = 0;
7292 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7293 0, 0, 0, 0, 0, 0.0,
7294 erasure_code_profile,
7295 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7296 &ss);
7297
7298 if (ret < 0) {
7299 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7300 }
7301 return ret;
7302 }
7303
7304 int OSDMonitor::crush_rename_bucket(const string& srcname,
7305 const string& dstname,
7306 ostream *ss)
7307 {
7308 int ret;
7309 //
7310 // Avoid creating a pending crush if it does not already exists and
7311 // the rename would fail.
7312 //
7313 if (!_have_pending_crush()) {
7314 ret = _get_stable_crush().can_rename_bucket(srcname,
7315 dstname,
7316 ss);
7317 if (ret)
7318 return ret;
7319 }
7320
7321 CrushWrapper newcrush;
7322 _get_pending_crush(newcrush);
7323
7324 ret = newcrush.rename_bucket(srcname,
7325 dstname,
7326 ss);
7327 if (ret)
7328 return ret;
7329
7330 pending_inc.crush.clear();
7331 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7332 *ss << "renamed bucket " << srcname << " into " << dstname;
7333 return 0;
7334 }
7335
7336 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7337 {
7338 string replacement = "";
7339
7340 if (plugin == "jerasure_generic" ||
7341 plugin == "jerasure_sse3" ||
7342 plugin == "jerasure_sse4" ||
7343 plugin == "jerasure_neon") {
7344 replacement = "jerasure";
7345 } else if (plugin == "shec_generic" ||
7346 plugin == "shec_sse3" ||
7347 plugin == "shec_sse4" ||
7348 plugin == "shec_neon") {
7349 replacement = "shec";
7350 }
7351
7352 if (replacement != "") {
7353 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7354 << plugin << " that has been deprecated. Please use "
7355 << replacement << " instead." << dendl;
7356 }
7357 }
7358
7359 int OSDMonitor::normalize_profile(const string& profilename,
7360 ErasureCodeProfile &profile,
7361 bool force,
7362 ostream *ss)
7363 {
7364 ErasureCodeInterfaceRef erasure_code;
7365 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7366 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7367 check_legacy_ec_plugin(plugin->second, profilename);
7368 int err = instance.factory(plugin->second,
7369 g_conf().get_val<std::string>("erasure_code_dir"),
7370 profile, &erasure_code, ss);
7371 if (err) {
7372 return err;
7373 }
7374
7375 err = erasure_code->init(profile, ss);
7376 if (err) {
7377 return err;
7378 }
7379
7380 auto it = profile.find("stripe_unit");
7381 if (it != profile.end()) {
7382 string err_str;
7383 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7384 if (!err_str.empty()) {
7385 *ss << "could not parse stripe_unit '" << it->second
7386 << "': " << err_str << std::endl;
7387 return -EINVAL;
7388 }
7389 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7390 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7391 if (chunk_size != stripe_unit) {
7392 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7393 << "alignment. Would be padded to " << chunk_size
7394 << std::endl;
7395 return -EINVAL;
7396 }
7397 if ((stripe_unit % 4096) != 0 && !force) {
7398 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7399 << "use --force to override this check" << std::endl;
7400 return -EINVAL;
7401 }
7402 }
7403 return 0;
7404 }
7405
// Create (or locate) a crush rule named @name for erasure profile @profile.
// Return codes encode the caller's retry protocol:
//   -EEXIST   rule already in the committed crush map; *rule is set
//   -EALREADY rule only in the pending crush; caller should wait/retry
//   0         rule created in the pending crush; *rule is set, caller waits
//   other <0  plugin load or rule creation failed (message in *ss)
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  // committed map first
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already staged but not yet proposed/committed
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // create_rule returns the new rule id (>=0) or a negative errno
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    return 0;
  }
}
7442
7443 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7444 ErasureCodeInterfaceRef *erasure_code,
7445 ostream *ss) const
7446 {
7447 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7448 return -EAGAIN;
7449 ErasureCodeProfile profile =
7450 osdmap.get_erasure_code_profile(erasure_code_profile);
7451 ErasureCodeProfile::const_iterator plugin =
7452 profile.find("plugin");
7453 if (plugin == profile.end()) {
7454 *ss << "cannot determine the erasure code plugin"
7455 << " because there is no 'plugin' entry in the erasure_code_profile "
7456 << profile << std::endl;
7457 return -EINVAL;
7458 }
7459 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7460 auto& instance = ErasureCodePluginRegistry::instance();
7461 return instance.factory(plugin->second,
7462 g_conf().get_val<std::string>("erasure_code_dir"),
7463 profile, erasure_code, ss);
7464 }
7465
7466 int OSDMonitor::check_cluster_features(uint64_t features,
7467 stringstream &ss)
7468 {
7469 stringstream unsupported_ss;
7470 int unsupported_count = 0;
7471 if ((mon.get_quorum_con_features() & features) != features) {
7472 unsupported_ss << "the monitor cluster";
7473 ++unsupported_count;
7474 }
7475
7476 set<int32_t> up_osds;
7477 osdmap.get_up_osds(up_osds);
7478 for (set<int32_t>::iterator it = up_osds.begin();
7479 it != up_osds.end(); ++it) {
7480 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7481 if ((xi.features & features) != features) {
7482 if (unsupported_count > 0)
7483 unsupported_ss << ", ";
7484 unsupported_ss << "osd." << *it;
7485 unsupported_count ++;
7486 }
7487 }
7488
7489 if (unsupported_count > 0) {
7490 ss << "features " << features << " unsupported by: "
7491 << unsupported_ss.str();
7492 return -ENOTSUP;
7493 }
7494
7495 // check pending osd state, too!
7496 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7497 pending_inc.new_xinfo.begin();
7498 p != pending_inc.new_xinfo.end(); ++p) {
7499 const osd_xinfo_t &xi = p->second;
7500 if ((xi.features & features) != features) {
7501 dout(10) << __func__ << " pending osd." << p->first
7502 << " features are insufficient; retry" << dendl;
7503 return -EAGAIN;
7504 }
7505 }
7506
7507 return 0;
7508 }
7509
// Check whether installing @newcrush would demand features beyond what the
// configured require_min_compat_client and the current mon/osd population
// support.  The check is performed on a throwaway copy of the osdmap with
// the new crush applied.  Returns false (with an explanation in @ss) if the
// new crush would lock out clients or daemons.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // build a scratch map: pending incremental + the candidate crush
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7543
7544 bool OSDMonitor::erasure_code_profile_in_use(
7545 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7546 const string &profile,
7547 ostream *ss)
7548 {
7549 bool found = false;
7550 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7551 p != pools.end();
7552 ++p) {
7553 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7554 *ss << osdmap.pool_name[p->first] << " ";
7555 found = true;
7556 }
7557 }
7558 if (found) {
7559 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7560 }
7561 return found;
7562 }
7563
// Build *erasure_code_profile_map from the configured default profile
// (osd_pool_default_erasure_code_profile) overlaid with the user-supplied
// "key=value" (or bare "key") items in @erasure_code_profile.  If the user
// names a plugin different from the default's, the defaults are discarded
// and only the user's settings remain.  Legacy "ruleset-*" keys are
// rejected with -EINVAL.
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // seed the map with the default profile from config (parsed as JSON)
  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
				    get_json_str_map,
				    *ss,
				    erasure_code_profile_map,
				    true);
  if (r)
    return r;
  ceph_assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  // user_map tracks only what the user explicitly supplied, so we can
  // replace (rather than overlay) the defaults when the plugin differs
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // bare key: present with an empty value
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      const string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      if (key.find("ruleset-") == 0) {
	*ss << "property '" << key << "' is no longer supported; try "
	    << "'crush-" << key.substr(8) << "' instead";
	return -EINVAL;
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // different plugin: the default profile's settings don't apply
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
7604
7605 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7606 const string &erasure_code_profile,
7607 uint8_t repl_size,
7608 unsigned *size, unsigned *min_size,
7609 ostream *ss)
7610 {
7611 int err = 0;
7612 bool set_min_size = false;
7613 switch (pool_type) {
7614 case pg_pool_t::TYPE_REPLICATED:
7615 if (osdmap.stretch_mode_enabled) {
7616 if (repl_size == 0)
7617 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7618 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7619 *ss << "prepare_pool_size: we are in stretch mode but size "
7620 << repl_size << " does not match!";
7621 return -EINVAL;
7622 }
7623 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7624 set_min_size = true;
7625 }
7626 if (repl_size == 0) {
7627 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7628 }
7629 *size = repl_size;
7630 if (!set_min_size)
7631 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7632 break;
7633 case pg_pool_t::TYPE_ERASURE:
7634 {
7635 if (osdmap.stretch_mode_enabled) {
7636 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7637 return -EINVAL;
7638 }
7639 ErasureCodeInterfaceRef erasure_code;
7640 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7641 if (err == 0) {
7642 *size = erasure_code->get_chunk_count();
7643 *min_size =
7644 erasure_code->get_data_chunk_count() +
7645 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7646 assert(*min_size <= *size);
7647 assert(*min_size >= erasure_code->get_data_chunk_count());
7648 }
7649 }
7650 break;
7651 default:
7652 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7653 err = -EINVAL;
7654 break;
7655 }
7656 return err;
7657 }
7658
// Compute *stripe_width for a new pool.  Replicated pools ignore it; for
// erasure pools it is data_chunks * chunk_size, where the stripe unit comes
// from the profile's "stripe_unit" (already validated by
// normalize_profile, hence the assert) or the
// osd_pool_erasure_code_stripe_unit config default.
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	// the profile's stripe_unit was validated when the profile was set
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7697
// Resolve *crush_rule for a new pool.  If the caller supplied a rule id
// (>= 0) it is only validated; otherwise the rule is looked up (or, for
// erasure pools, created) by name.  Note the remapping of
// crush_rule_create_erasure()'s return codes: a rule that exists only in
// the pending crush (0 or -EALREADY) becomes -EAGAIN so the caller retries
// after the proposal commits, while -EEXIST (already committed) is success.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // caller supplied an explicit rule id; just check that it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7758
7759 int OSDMonitor::get_crush_rule(const string &rule_name,
7760 int *crush_rule,
7761 ostream *ss)
7762 {
7763 int ret;
7764 ret = osdmap.crush->get_rule_id(rule_name);
7765 if (ret != -ENOENT) {
7766 // found it, use it
7767 *crush_rule = ret;
7768 } else {
7769 CrushWrapper newcrush;
7770 _get_pending_crush(newcrush);
7771
7772 ret = newcrush.get_rule_id(rule_name);
7773 if (ret != -ENOENT) {
7774 // found it, wait for it to be proposed
7775 dout(20) << __func__ << ": rule " << rule_name
7776 << " try again" << dendl;
7777 return -EAGAIN;
7778 } else {
7779 // Cannot find it , return error
7780 *ss << "specified rule " << rule_name << " doesn't exist";
7781 return ret;
7782 }
7783 }
7784 return 0;
7785 }
7786
7787 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7788 {
7789 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7790 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7791 auto max_pgs = max_pgs_per_osd * num_osds;
7792 uint64_t projected = 0;
7793 if (pool < 0) {
7794 projected += pg_num * size;
7795 }
7796 for (const auto& i : osdmap.get_pools()) {
7797 if (i.first == pool) {
7798 projected += pg_num * size;
7799 } else {
7800 projected += i.second.get_pg_num_target() * i.second.get_size();
7801 }
7802 }
7803 if (projected > max_pgs) {
7804 if (pool >= 0) {
7805 *ss << "pool id " << pool;
7806 }
7807 *ss << " pg_num " << pg_num << " size " << size
7808 << " would mean " << projected
7809 << " total pgs, which exceeds max " << max_pgs
7810 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7811 << " * num_in_osds " << num_osds << ")";
7812 return -ERANGE;
7813 }
7814 return 0;
7815 }
7816
/**
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min Lower bound on pg_num (0 for none)
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes Expected pool size hint, or 0 for none
 * @param target_size_ratio Expected capacity-ratio hint, or 0 for none
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REPLICATED
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode pg autoscale mode for the new pool, or "" for default
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
7832 int OSDMonitor::prepare_new_pool(string& name,
7833 int crush_rule,
7834 const string &crush_rule_name,
7835 unsigned pg_num, unsigned pgp_num,
7836 unsigned pg_num_min,
7837 const uint64_t repl_size,
7838 const uint64_t target_size_bytes,
7839 const float target_size_ratio,
7840 const string &erasure_code_profile,
7841 const unsigned pool_type,
7842 const uint64_t expected_num_objects,
7843 FastReadType fast_read,
7844 const string& pg_autoscale_mode,
7845 ostream *ss)
7846 {
7847 if (name.length() == 0)
7848 return -EINVAL;
7849 if (pg_num == 0)
7850 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
7851 if (pgp_num == 0)
7852 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
7853 if (!pgp_num)
7854 pgp_num = pg_num;
7855 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
7856 *ss << "'pg_num' must be greater than 0 and less than or equal to "
7857 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
7858 << " (you may adjust 'mon max pool pg num' for higher values)";
7859 return -ERANGE;
7860 }
7861 if (pgp_num > pg_num) {
7862 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7863 << ", which in this case is " << pg_num;
7864 return -ERANGE;
7865 }
7866 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
7867 *ss << "'fast_read' can only apply to erasure coding pool";
7868 return -EINVAL;
7869 }
7870 int r;
7871 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
7872 crush_rule_name, &crush_rule, ss);
7873 if (r) {
7874 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7875 return r;
7876 }
7877 if (g_conf()->mon_osd_crush_smoke_test) {
7878 CrushWrapper newcrush;
7879 _get_pending_crush(newcrush);
7880 ostringstream err;
7881 CrushTester tester(newcrush, err);
7882 tester.set_min_x(0);
7883 tester.set_max_x(50);
7884 tester.set_rule(crush_rule);
7885 auto start = ceph::coarse_mono_clock::now();
7886 r = tester.test_with_fork(g_conf()->mon_lease);
7887 auto duration = ceph::coarse_mono_clock::now() - start;
7888 if (r < 0) {
7889 dout(10) << "tester.test_with_fork returns " << r
7890 << ": " << err.str() << dendl;
7891 *ss << "crush test failed with " << r << ": " << err.str();
7892 return r;
7893 }
7894 dout(10) << __func__ << " crush smoke test duration: "
7895 << duration << dendl;
7896 }
7897 unsigned size, min_size;
7898 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
7899 &size, &min_size, ss);
7900 if (r) {
7901 dout(10) << "prepare_pool_size returns " << r << dendl;
7902 return r;
7903 }
7904 r = check_pg_num(-1, pg_num, size, ss);
7905 if (r) {
7906 dout(10) << "check_pg_num returns " << r << dendl;
7907 return r;
7908 }
7909
7910 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7911 return -EINVAL;
7912 }
7913
7914 uint32_t stripe_width = 0;
7915 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
7916 if (r) {
7917 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7918 return r;
7919 }
7920
7921 bool fread = false;
7922 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7923 switch (fast_read) {
7924 case FAST_READ_OFF:
7925 fread = false;
7926 break;
7927 case FAST_READ_ON:
7928 fread = true;
7929 break;
7930 case FAST_READ_DEFAULT:
7931 fread = g_conf()->osd_pool_default_ec_fast_read;
7932 break;
7933 default:
7934 *ss << "invalid fast_read setting: " << fast_read;
7935 return -EINVAL;
7936 }
7937 }
7938
7939 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
7940 p != pending_inc.new_pool_names.end();
7941 ++p) {
7942 if (p->second == name)
7943 return 0;
7944 }
7945
7946 if (-1 == pending_inc.new_pool_max)
7947 pending_inc.new_pool_max = osdmap.pool_max;
7948 int64_t pool = ++pending_inc.new_pool_max;
7949 pg_pool_t empty;
7950 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
7951 pi->create_time = ceph_clock_now();
7952 pi->type = pool_type;
7953 pi->fast_read = fread;
7954 pi->flags = g_conf()->osd_pool_default_flags;
7955 if (g_conf()->osd_pool_default_flag_hashpspool)
7956 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
7957 if (g_conf()->osd_pool_default_flag_nodelete)
7958 pi->set_flag(pg_pool_t::FLAG_NODELETE);
7959 if (g_conf()->osd_pool_default_flag_nopgchange)
7960 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
7961 if (g_conf()->osd_pool_default_flag_nosizechange)
7962 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
7963 pi->set_flag(pg_pool_t::FLAG_CREATING);
7964 if (g_conf()->osd_pool_use_gmt_hitset)
7965 pi->use_gmt_hitset = true;
7966 else
7967 pi->use_gmt_hitset = false;
7968
7969 pi->size = size;
7970 pi->min_size = min_size;
7971 pi->crush_rule = crush_rule;
7972 pi->expected_num_objects = expected_num_objects;
7973 pi->object_hash = CEPH_STR_HASH_RJENKINS;
7974 if (osdmap.stretch_mode_enabled) {
7975 pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
7976 pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
7977 pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
7978 pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
7979 if (osdmap.degraded_stretch_mode) {
7980 pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
7981 pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
7982 // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
7983 // TODO: drat, we don't record this ^ anywhere, though given that it
7984 // necessarily won't exist elsewhere it likely doesn't matter
7985 pi->min_size = pi->min_size / 2;
7986 pi->size = pi->size / 2; // only support 2 zones now
7987 }
7988 }
7989
7990 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7991 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7992 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7993 pi->pg_autoscale_mode = m;
7994 } else {
7995 pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
7996 }
7997 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7998 pi->set_pg_num(
7999 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
8000 : pg_num);
8001 pi->set_pg_num_pending(pi->get_pg_num());
8002 pi->set_pg_num_target(pg_num);
8003 pi->set_pgp_num(pi->get_pg_num());
8004 pi->set_pgp_num_target(pgp_num);
8005 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
8006 pg_num_min) {
8007 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
8008 }
8009 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
8010 pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8011 pi->pg_autoscale_mode = m;
8012 }
8013
8014 pi->last_change = pending_inc.epoch;
8015 pi->auid = 0;
8016
8017 if (pool_type == pg_pool_t::TYPE_ERASURE) {
8018 pi->erasure_code_profile = erasure_code_profile;
8019 } else {
8020 pi->erasure_code_profile = "";
8021 }
8022 pi->stripe_width = stripe_width;
8023
8024 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
8025 target_size_bytes) {
8026 // only store for nautilus+ because TARGET_SIZE_BYTES may be
8027 // larger than int32_t max.
8028 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
8029 }
8030 if (target_size_ratio > 0.0 &&
8031 osdmap.require_osd_release >= ceph_release_t::nautilus) {
8032 // only store for nautilus+, just to be consistent and tidy.
8033 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
8034 }
8035
8036 pi->cache_target_dirty_ratio_micro =
8037 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
8038 pi->cache_target_dirty_high_ratio_micro =
8039 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
8040 pi->cache_target_full_ratio_micro =
8041 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
8042 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
8043 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
8044
8045 pending_inc.new_pool_names[pool] = name;
8046 return 0;
8047 }
8048
8049 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8050 {
8051 op->mark_osdmon_event(__func__);
8052 ostringstream ss;
8053 if (pending_inc.new_flags < 0)
8054 pending_inc.new_flags = osdmap.get_flags();
8055 pending_inc.new_flags |= flag;
8056 ss << OSDMap::get_flag_string(flag) << " is set";
8057 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8058 get_last_committed() + 1));
8059 return true;
8060 }
8061
8062 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8063 {
8064 op->mark_osdmon_event(__func__);
8065 ostringstream ss;
8066 if (pending_inc.new_flags < 0)
8067 pending_inc.new_flags = osdmap.get_flags();
8068 pending_inc.new_flags &= ~flag;
8069 ss << OSDMap::get_flag_string(flag) << " is unset";
8070 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8071 get_last_committed() + 1));
8072 return true;
8073 }
8074
8075 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8076 stringstream& ss)
8077 {
8078 string poolstr;
8079 cmd_getval(cmdmap, "pool", poolstr);
8080 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8081 if (pool < 0) {
8082 ss << "unrecognized pool '" << poolstr << "'";
8083 return -ENOENT;
8084 }
8085 string var;
8086 cmd_getval(cmdmap, "var", var);
8087
8088 pg_pool_t p = *osdmap.get_pg_pool(pool);
8089 if (pending_inc.new_pools.count(pool))
8090 p = pending_inc.new_pools[pool];
8091
8092 // accept val as a json string in the normal case (current
8093 // generation monitor). parse out int or float values from the
8094 // string as needed. however, if it is not a string, try to pull
8095 // out an int, in case an older monitor with an older json schema is
8096 // forwarding a request.
8097 string val;
8098 string interr, floaterr;
8099 int64_t n = 0;
8100 double f = 0;
8101 int64_t uf = 0; // micro-f
8102 cmd_getval(cmdmap, "val", val);
8103
8104 auto si_options = {
8105 "target_max_objects"
8106 };
8107 auto iec_options = {
8108 "target_max_bytes",
8109 "target_size_bytes",
8110 "compression_max_blob_size",
8111 "compression_min_blob_size",
8112 "csum_max_block",
8113 "csum_min_block",
8114 };
8115 if (count(begin(si_options), end(si_options), var)) {
8116 n = strict_si_cast<int64_t>(val.c_str(), &interr);
8117 } else if (count(begin(iec_options), end(iec_options), var)) {
8118 n = strict_iec_cast<int64_t>(val.c_str(), &interr);
8119 } else {
8120 // parse string as both int and float; different fields use different types.
8121 n = strict_strtoll(val.c_str(), 10, &interr);
8122 f = strict_strtod(val.c_str(), &floaterr);
8123 uf = llrintl(f * (double)1000000.0);
8124 }
8125
8126 if (!p.is_tier() &&
8127 (var == "hit_set_type" || var == "hit_set_period" ||
8128 var == "hit_set_count" || var == "hit_set_fpp" ||
8129 var == "target_max_objects" || var == "target_max_bytes" ||
8130 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8131 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8132 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8133 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8134 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8135 return -EACCES;
8136 }
8137
8138 if (var == "size") {
8139 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8140 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8141 return -EPERM;
8142 }
8143 if (p.type == pg_pool_t::TYPE_ERASURE) {
8144 ss << "can not change the size of an erasure-coded pool";
8145 return -ENOTSUP;
8146 }
8147 if (interr.length()) {
8148 ss << "error parsing integer value '" << val << "': " << interr;
8149 return -EINVAL;
8150 }
8151 if (n <= 0 || n > 10) {
8152 ss << "pool size must be between 1 and 10";
8153 return -EINVAL;
8154 }
8155 if (n == 1) {
8156 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8157 ss << "configuring pool size as 1 is disabled by default.";
8158 return -EPERM;
8159 }
8160 bool sure = false;
8161 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8162 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8163 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8164 "pass the flag --yes-i-really-mean-it.";
8165 return -EPERM;
8166 }
8167 }
8168 if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
8169 return -EINVAL;
8170 }
8171 int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
8172 if (r < 0) {
8173 return r;
8174 }
8175 p.size = n;
8176 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8177 } else if (var == "min_size") {
8178 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8179 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8180 return -EPERM;
8181 }
8182 if (interr.length()) {
8183 ss << "error parsing integer value '" << val << "': " << interr;
8184 return -EINVAL;
8185 }
8186
8187 if (p.type != pg_pool_t::TYPE_ERASURE) {
8188 if (n < 1 || n > p.size) {
8189 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8190 return -EINVAL;
8191 }
8192 } else {
8193 ErasureCodeInterfaceRef erasure_code;
8194 int k;
8195 stringstream tmp;
8196 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8197 if (err == 0) {
8198 k = erasure_code->get_data_chunk_count();
8199 } else {
8200 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8201 return err;
8202 }
8203
8204 if (n < k || n > p.size) {
8205 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8206 return -EINVAL;
8207 }
8208 }
8209 p.min_size = n;
8210 } else if (var == "pg_num_actual") {
8211 if (interr.length()) {
8212 ss << "error parsing integer value '" << val << "': " << interr;
8213 return -EINVAL;
8214 }
8215 if (n == (int)p.get_pg_num()) {
8216 return 0;
8217 }
8218 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8219 ss << "'pg_num' must be greater than 0 and less than or equal to "
8220 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8221 << " (you may adjust 'mon max pool pg num' for higher values)";
8222 return -ERANGE;
8223 }
8224 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8225 ss << "cannot adjust pg_num while initial PGs are being created";
8226 return -EBUSY;
8227 }
8228 if (n > (int)p.get_pg_num()) {
8229 if (p.get_pg_num() != p.get_pg_num_pending()) {
8230 // force pre-nautilus clients to resend their ops, since they
8231 // don't understand pg_num_pending changes form a new interval
8232 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8233 }
8234 p.set_pg_num(n);
8235 } else {
8236 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8237 ss << "nautilus OSDs are required to adjust pg_num_pending";
8238 return -EPERM;
8239 }
8240 if (n < (int)p.get_pgp_num()) {
8241 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8242 return -EINVAL;
8243 }
8244 if (n < (int)p.get_pg_num() - 1) {
8245 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8246 << ") - 1; only single pg decrease is currently supported";
8247 return -EINVAL;
8248 }
8249 p.set_pg_num_pending(n);
8250 // force pre-nautilus clients to resend their ops, since they
8251 // don't understand pg_num_pending changes form a new interval
8252 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8253 }
8254 // force pre-luminous clients to resend their ops, since they
8255 // don't understand that split PGs now form a new interval.
8256 p.last_force_op_resend_preluminous = pending_inc.epoch;
8257 } else if (var == "pg_num") {
8258 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8259 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8260 return -EPERM;
8261 }
8262 if (interr.length()) {
8263 ss << "error parsing integer value '" << val << "': " << interr;
8264 return -EINVAL;
8265 }
8266 if (n == (int)p.get_pg_num_target()) {
8267 return 0;
8268 }
8269 if (n <= 0 || static_cast<uint64_t>(n) >
8270 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8271 ss << "'pg_num' must be greater than 0 and less than or equal to "
8272 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8273 << " (you may adjust 'mon max pool pg num' for higher values)";
8274 return -ERANGE;
8275 }
8276 if (n > (int)p.get_pg_num_target()) {
8277 int r = check_pg_num(pool, n, p.get_size(), &ss);
8278 if (r) {
8279 return r;
8280 }
8281 bool force = false;
8282 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8283 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8284 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8285 return -EPERM;
8286 }
8287 } else {
8288 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8289 ss << "nautilus OSDs are required to decrease pg_num";
8290 return -EPERM;
8291 }
8292 }
8293 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8294 // pre-nautilus osdmap format; increase pg_num directly
8295 assert(n > (int)p.get_pg_num());
8296 // force pre-nautilus clients to resend their ops, since they
8297 // don't understand pg_num_target changes form a new interval
8298 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8299 // force pre-luminous clients to resend their ops, since they
8300 // don't understand that split PGs now form a new interval.
8301 p.last_force_op_resend_preluminous = pending_inc.epoch;
8302 p.set_pg_num(n);
8303 } else {
8304 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8305 // make pgp_num track pg_num if it already matches. if it is set
8306 // differently, leave it different and let the user control it
8307 // manually.
8308 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8309 p.set_pgp_num_target(n);
8310 }
8311 p.set_pg_num_target(n);
8312 }
8313 } else if (var == "pgp_num_actual") {
8314 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8315 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8316 return -EPERM;
8317 }
8318 if (interr.length()) {
8319 ss << "error parsing integer value '" << val << "': " << interr;
8320 return -EINVAL;
8321 }
8322 if (n <= 0) {
8323 ss << "specified pgp_num must > 0, but you set to " << n;
8324 return -EINVAL;
8325 }
8326 if (n > (int)p.get_pg_num()) {
8327 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8328 return -EINVAL;
8329 }
8330 if (n > (int)p.get_pg_num_pending()) {
8331 ss << "specified pgp_num " << n
8332 << " > pg_num_pending " << p.get_pg_num_pending();
8333 return -EINVAL;
8334 }
8335 p.set_pgp_num(n);
8336 } else if (var == "pgp_num") {
8337 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8338 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8339 return -EPERM;
8340 }
8341 if (interr.length()) {
8342 ss << "error parsing integer value '" << val << "': " << interr;
8343 return -EINVAL;
8344 }
8345 if (n <= 0) {
8346 ss << "specified pgp_num must > 0, but you set to " << n;
8347 return -EINVAL;
8348 }
8349 if (n > (int)p.get_pg_num_target()) {
8350 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8351 return -EINVAL;
8352 }
8353 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8354 // pre-nautilus osdmap format; increase pgp_num directly
8355 p.set_pgp_num(n);
8356 } else {
8357 p.set_pgp_num_target(n);
8358 }
8359 } else if (var == "pg_autoscale_mode") {
8360 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8361 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8362 ss << "specified invalid mode " << val;
8363 return -EINVAL;
8364 }
8365 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8366 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8367 return -EINVAL;
8368 }
8369 p.pg_autoscale_mode = m;
8370 } else if (var == "crush_rule") {
8371 int id = osdmap.crush->get_rule_id(val);
8372 if (id == -ENOENT) {
8373 ss << "crush rule " << val << " does not exist";
8374 return -ENOENT;
8375 }
8376 if (id < 0) {
8377 ss << cpp_strerror(id);
8378 return -ENOENT;
8379 }
8380 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
8381 return -EINVAL;
8382 }
8383 p.crush_rule = id;
8384 } else if (var == "nodelete" || var == "nopgchange" ||
8385 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8386 var == "noscrub" || var == "nodeep-scrub") {
8387 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8388 // make sure we only compare against 'n' if we didn't receive a string
8389 if (val == "true" || (interr.empty() && n == 1)) {
8390 p.set_flag(flag);
8391 } else if (val == "false" || (interr.empty() && n == 0)) {
8392 p.unset_flag(flag);
8393 } else {
8394 ss << "expecting value 'true', 'false', '0', or '1'";
8395 return -EINVAL;
8396 }
8397 } else if (var == "hashpspool") {
8398 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8399 bool force = false;
8400 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8401
8402 if (!force) {
8403 ss << "are you SURE? this will remap all placement groups in this pool,"
8404 " this triggers large data movement,"
8405 " pass --yes-i-really-mean-it if you really do.";
8406 return -EPERM;
8407 }
8408 // make sure we only compare against 'n' if we didn't receive a string
8409 if (val == "true" || (interr.empty() && n == 1)) {
8410 p.set_flag(flag);
8411 } else if (val == "false" || (interr.empty() && n == 0)) {
8412 p.unset_flag(flag);
8413 } else {
8414 ss << "expecting value 'true', 'false', '0', or '1'";
8415 return -EINVAL;
8416 }
8417 } else if (var == "hit_set_type") {
8418 if (val == "none")
8419 p.hit_set_params = HitSet::Params();
8420 else {
8421 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8422 if (err)
8423 return err;
8424 if (val == "bloom") {
8425 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8426 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8427 p.hit_set_params = HitSet::Params(bsp);
8428 } else if (val == "explicit_hash")
8429 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8430 else if (val == "explicit_object")
8431 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8432 else {
8433 ss << "unrecognized hit_set type '" << val << "'";
8434 return -EINVAL;
8435 }
8436 }
8437 } else if (var == "hit_set_period") {
8438 if (interr.length()) {
8439 ss << "error parsing integer value '" << val << "': " << interr;
8440 return -EINVAL;
8441 } else if (n < 0) {
8442 ss << "hit_set_period should be non-negative";
8443 return -EINVAL;
8444 }
8445 p.hit_set_period = n;
8446 } else if (var == "hit_set_count") {
8447 if (interr.length()) {
8448 ss << "error parsing integer value '" << val << "': " << interr;
8449 return -EINVAL;
8450 } else if (n < 0) {
8451 ss << "hit_set_count should be non-negative";
8452 return -EINVAL;
8453 }
8454 p.hit_set_count = n;
8455 } else if (var == "hit_set_fpp") {
8456 if (floaterr.length()) {
8457 ss << "error parsing floating point value '" << val << "': " << floaterr;
8458 return -EINVAL;
8459 } else if (f < 0 || f > 1.0) {
8460 ss << "hit_set_fpp should be in the range 0..1";
8461 return -EINVAL;
8462 }
8463 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8464 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8465 return -EINVAL;
8466 }
8467 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8468 bloomp->set_fpp(f);
8469 } else if (var == "use_gmt_hitset") {
8470 if (val == "true" || (interr.empty() && n == 1)) {
8471 p.use_gmt_hitset = true;
8472 } else {
8473 ss << "expecting value 'true' or '1'";
8474 return -EINVAL;
8475 }
8476 } else if (var == "allow_ec_overwrites") {
8477 if (!p.is_erasure()) {
8478 ss << "ec overwrites can only be enabled for an erasure coded pool";
8479 return -EINVAL;
8480 }
8481 stringstream err;
8482 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8483 !is_pool_currently_all_bluestore(pool, p, &err)) {
8484 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8485 return -EINVAL;
8486 }
8487 if (val == "true" || (interr.empty() && n == 1)) {
8488 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8489 } else if (val == "false" || (interr.empty() && n == 0)) {
8490 ss << "ec overwrites cannot be disabled once enabled";
8491 return -EINVAL;
8492 } else {
8493 ss << "expecting value 'true', 'false', '0', or '1'";
8494 return -EINVAL;
8495 }
8496 } else if (var == "target_max_objects") {
8497 if (interr.length()) {
8498 ss << "error parsing int '" << val << "': " << interr;
8499 return -EINVAL;
8500 }
8501 p.target_max_objects = n;
8502 } else if (var == "target_max_bytes") {
8503 if (interr.length()) {
8504 ss << "error parsing int '" << val << "': " << interr;
8505 return -EINVAL;
8506 }
8507 p.target_max_bytes = n;
8508 } else if (var == "cache_target_dirty_ratio") {
8509 if (floaterr.length()) {
8510 ss << "error parsing float '" << val << "': " << floaterr;
8511 return -EINVAL;
8512 }
8513 if (f < 0 || f > 1.0) {
8514 ss << "value must be in the range 0..1";
8515 return -ERANGE;
8516 }
8517 p.cache_target_dirty_ratio_micro = uf;
8518 } else if (var == "cache_target_dirty_high_ratio") {
8519 if (floaterr.length()) {
8520 ss << "error parsing float '" << val << "': " << floaterr;
8521 return -EINVAL;
8522 }
8523 if (f < 0 || f > 1.0) {
8524 ss << "value must be in the range 0..1";
8525 return -ERANGE;
8526 }
8527 p.cache_target_dirty_high_ratio_micro = uf;
8528 } else if (var == "cache_target_full_ratio") {
8529 if (floaterr.length()) {
8530 ss << "error parsing float '" << val << "': " << floaterr;
8531 return -EINVAL;
8532 }
8533 if (f < 0 || f > 1.0) {
8534 ss << "value must be in the range 0..1";
8535 return -ERANGE;
8536 }
8537 p.cache_target_full_ratio_micro = uf;
8538 } else if (var == "cache_min_flush_age") {
8539 if (interr.length()) {
8540 ss << "error parsing int '" << val << "': " << interr;
8541 return -EINVAL;
8542 }
8543 p.cache_min_flush_age = n;
8544 } else if (var == "cache_min_evict_age") {
8545 if (interr.length()) {
8546 ss << "error parsing int '" << val << "': " << interr;
8547 return -EINVAL;
8548 }
8549 p.cache_min_evict_age = n;
8550 } else if (var == "min_read_recency_for_promote") {
8551 if (interr.length()) {
8552 ss << "error parsing integer value '" << val << "': " << interr;
8553 return -EINVAL;
8554 }
8555 p.min_read_recency_for_promote = n;
8556 } else if (var == "hit_set_grade_decay_rate") {
8557 if (interr.length()) {
8558 ss << "error parsing integer value '" << val << "': " << interr;
8559 return -EINVAL;
8560 }
8561 if (n > 100 || n < 0) {
8562 ss << "value out of range,valid range is 0 - 100";
8563 return -EINVAL;
8564 }
8565 p.hit_set_grade_decay_rate = n;
8566 } else if (var == "hit_set_search_last_n") {
8567 if (interr.length()) {
8568 ss << "error parsing integer value '" << val << "': " << interr;
8569 return -EINVAL;
8570 }
8571 if (n > p.hit_set_count || n < 0) {
8572 ss << "value out of range,valid range is 0 - hit_set_count";
8573 return -EINVAL;
8574 }
8575 p.hit_set_search_last_n = n;
8576 } else if (var == "min_write_recency_for_promote") {
8577 if (interr.length()) {
8578 ss << "error parsing integer value '" << val << "': " << interr;
8579 return -EINVAL;
8580 }
8581 p.min_write_recency_for_promote = n;
8582 } else if (var == "fast_read") {
8583 if (p.is_replicated()) {
8584 ss << "fast read is not supported in replication pool";
8585 return -EINVAL;
8586 }
8587 if (val == "true" || (interr.empty() && n == 1)) {
8588 p.fast_read = true;
8589 } else if (val == "false" || (interr.empty() && n == 0)) {
8590 p.fast_read = false;
8591 } else {
8592 ss << "expecting value 'true', 'false', '0', or '1'";
8593 return -EINVAL;
8594 }
8595 } else if (pool_opts_t::is_opt_name(var)) {
8596 bool unset = val == "unset";
8597 if (var == "compression_mode") {
8598 if (!unset) {
8599 auto cmode = Compressor::get_comp_mode_type(val);
8600 if (!cmode) {
8601 ss << "unrecognized compression mode '" << val << "'";
8602 return -EINVAL;
8603 }
8604 }
8605 } else if (var == "compression_algorithm") {
8606 if (!unset) {
8607 auto alg = Compressor::get_comp_alg_type(val);
8608 if (!alg) {
8609 ss << "unrecognized compression_algorithm '" << val << "'";
8610 return -EINVAL;
8611 }
8612 }
8613 } else if (var == "compression_required_ratio") {
8614 if (floaterr.length()) {
8615 ss << "error parsing float value '" << val << "': " << floaterr;
8616 return -EINVAL;
8617 }
8618 if (f < 0 || f > 1) {
8619 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8620 return -EINVAL;
8621 }
8622 } else if (var == "csum_type") {
8623 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8624 if (t < 0 ) {
8625 ss << "unrecognized csum_type '" << val << "'";
8626 return -EINVAL;
8627 }
8628 //preserve csum_type numeric value
8629 n = t;
8630 interr.clear();
8631 } else if (var == "compression_max_blob_size" ||
8632 var == "compression_min_blob_size" ||
8633 var == "csum_max_block" ||
8634 var == "csum_min_block") {
8635 if (interr.length()) {
8636 ss << "error parsing int value '" << val << "': " << interr;
8637 return -EINVAL;
8638 }
8639 } else if (var == "fingerprint_algorithm") {
8640 if (!unset) {
8641 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8642 if (!alg) {
8643 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8644 return -EINVAL;
8645 }
8646 }
8647 } else if (var == "target_size_bytes") {
8648 if (interr.length()) {
8649 ss << "error parsing unit value '" << val << "': " << interr;
8650 return -EINVAL;
8651 }
8652 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8653 ss << "must set require_osd_release to nautilus or "
8654 << "later before setting target_size_bytes";
8655 return -EINVAL;
8656 }
8657 } else if (var == "pg_num_min") {
8658 if (interr.length()) {
8659 ss << "error parsing int value '" << val << "': " << interr;
8660 return -EINVAL;
8661 }
8662 if (n > (int)p.get_pg_num_target()) {
8663 ss << "specified pg_num_min " << n
8664 << " > pg_num " << p.get_pg_num_target();
8665 return -EINVAL;
8666 }
8667 } else if (var == "recovery_priority") {
8668 if (interr.length()) {
8669 ss << "error parsing int value '" << val << "': " << interr;
8670 return -EINVAL;
8671 }
8672 if (!g_conf()->debug_allow_any_pool_priority) {
8673 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8674 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8675 << " and " << OSD_POOL_PRIORITY_MAX;
8676 return -EINVAL;
8677 }
8678 }
8679 } else if (var == "pg_autoscale_bias") {
8680 if (f < 0.0 || f > 1000.0) {
8681 ss << "pg_autoscale_bias must be between 0 and 1000";
8682 return -EINVAL;
8683 }
8684 } else if (var == "dedup_tier") {
8685 if (interr.empty()) {
8686 ss << "expecting value 'pool name'";
8687 return -EINVAL;
8688 }
8689 // Current base tier in dedup does not support ec pool
8690 if (p.is_erasure()) {
8691 ss << "pool '" << poolstr
8692 << "' is an ec pool, which cannot be a base tier";
8693 return -ENOTSUP;
8694 }
8695 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8696 if (lowtierpool_id < 0) {
8697 ss << "unrecognized pool '" << val << "'";
8698 return -ENOENT;
8699 }
8700 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8701 ceph_assert(tp);
8702 n = lowtierpool_id;
8703 // The original input is string (pool name), but we convert it to int64_t.
8704 // So, clear interr
8705 interr.clear();
8706 } else if (var == "dedup_chunk_algorithm") {
8707 if (!unset) {
8708 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8709 if (!alg) {
8710 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8711 return -EINVAL;
8712 }
8713 }
8714 } else if (var == "dedup_cdc_chunk_size") {
8715 if (interr.length()) {
8716 ss << "error parsing int value '" << val << "': " << interr;
8717 return -EINVAL;
8718 }
8719 }
8720
8721 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8722 switch (desc.type) {
8723 case pool_opts_t::STR:
8724 if (unset) {
8725 p.opts.unset(desc.key);
8726 } else {
8727 p.opts.set(desc.key, static_cast<std::string>(val));
8728 }
8729 break;
8730 case pool_opts_t::INT:
8731 if (interr.length()) {
8732 ss << "error parsing integer value '" << val << "': " << interr;
8733 return -EINVAL;
8734 }
8735 if (n == 0) {
8736 p.opts.unset(desc.key);
8737 } else {
8738 p.opts.set(desc.key, static_cast<int64_t>(n));
8739 }
8740 break;
8741 case pool_opts_t::DOUBLE:
8742 if (floaterr.length()) {
8743 ss << "error parsing floating point value '" << val << "': " << floaterr;
8744 return -EINVAL;
8745 }
8746 if (f == 0) {
8747 p.opts.unset(desc.key);
8748 } else {
8749 p.opts.set(desc.key, static_cast<double>(f));
8750 }
8751 break;
8752 default:
8753 ceph_assert(!"unknown type");
8754 }
8755 } else {
8756 ss << "unrecognized variable '" << var << "'";
8757 return -EINVAL;
8758 }
8759 if (val != "unset") {
8760 ss << "set pool " << pool << " " << var << " to " << val;
8761 } else {
8762 ss << "unset pool " << pool << " " << var;
8763 }
8764 p.last_change = pending_inc.epoch;
8765 pending_inc.new_pools[pool] = p;
8766 return 0;
8767 }
8768
8769 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8770 const cmdmap_t& cmdmap,
8771 stringstream& ss)
8772 {
8773 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8774 }
8775
8776 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8777 const cmdmap_t& cmdmap,
8778 stringstream& ss,
8779 bool *modified)
8780 {
8781 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8782 }
8783
8784
8785 /**
8786 * Common logic for preprocess and prepare phases of pool application
8787 * tag commands. In preprocess mode we're only detecting invalid
8788 * commands, and determining whether it was a modification or a no-op.
8789 * In prepare mode we're actually updating the pending state.
8790 */
8791 int OSDMonitor::_command_pool_application(const string &prefix,
8792 const cmdmap_t& cmdmap,
8793 stringstream& ss,
8794 bool *modified,
8795 bool preparing)
8796 {
8797 string pool_name;
8798 cmd_getval(cmdmap, "pool", pool_name);
8799 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8800 if (pool < 0) {
8801 ss << "unrecognized pool '" << pool_name << "'";
8802 return -ENOENT;
8803 }
8804
8805 pg_pool_t p = *osdmap.get_pg_pool(pool);
8806 if (preparing) {
8807 if (pending_inc.new_pools.count(pool)) {
8808 p = pending_inc.new_pools[pool];
8809 }
8810 }
8811
8812 string app;
8813 cmd_getval(cmdmap, "app", app);
8814 bool app_exists = (p.application_metadata.count(app) > 0);
8815
8816 string key;
8817 cmd_getval(cmdmap, "key", key);
8818 if (key == "all") {
8819 ss << "key cannot be 'all'";
8820 return -EINVAL;
8821 }
8822
8823 string value;
8824 cmd_getval(cmdmap, "value", value);
8825 if (value == "all") {
8826 ss << "value cannot be 'all'";
8827 return -EINVAL;
8828 }
8829
8830 if (boost::algorithm::ends_with(prefix, "enable")) {
8831 if (app.empty()) {
8832 ss << "application name must be provided";
8833 return -EINVAL;
8834 }
8835
8836 if (p.is_tier()) {
8837 ss << "application must be enabled on base tier";
8838 return -EINVAL;
8839 }
8840
8841 bool force = false;
8842 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8843
8844 if (!app_exists && !p.application_metadata.empty() && !force) {
8845 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8846 << "application; pass --yes-i-really-mean-it to proceed anyway";
8847 return -EPERM;
8848 }
8849
8850 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8851 ss << "too many enabled applications on pool '" << pool_name << "'; "
8852 << "max " << MAX_POOL_APPLICATIONS;
8853 return -EINVAL;
8854 }
8855
8856 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8857 ss << "application name '" << app << "' too long; max length "
8858 << MAX_POOL_APPLICATION_LENGTH;
8859 return -EINVAL;
8860 }
8861
8862 if (!app_exists) {
8863 p.application_metadata[app] = {};
8864 }
8865 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8866
8867 } else if (boost::algorithm::ends_with(prefix, "disable")) {
8868 bool force = false;
8869 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8870
8871 if (!force) {
8872 ss << "Are you SURE? Disabling an application within a pool might result "
8873 << "in loss of application functionality; pass "
8874 << "--yes-i-really-mean-it to proceed anyway";
8875 return -EPERM;
8876 }
8877
8878 if (!app_exists) {
8879 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8880 << "'";
8881 return 0; // idempotent
8882 }
8883
8884 p.application_metadata.erase(app);
8885 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8886
8887 } else if (boost::algorithm::ends_with(prefix, "set")) {
8888 if (p.is_tier()) {
8889 ss << "application metadata must be set on base tier";
8890 return -EINVAL;
8891 }
8892
8893 if (!app_exists) {
8894 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8895 << "'";
8896 return -ENOENT;
8897 }
8898
8899 string key;
8900 cmd_getval(cmdmap, "key", key);
8901
8902 if (key.empty()) {
8903 ss << "key must be provided";
8904 return -EINVAL;
8905 }
8906
8907 auto &app_keys = p.application_metadata[app];
8908 if (app_keys.count(key) == 0 &&
8909 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8910 ss << "too many keys set for application '" << app << "' on pool '"
8911 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8912 return -EINVAL;
8913 }
8914
8915 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8916 ss << "key '" << app << "' too long; max length "
8917 << MAX_POOL_APPLICATION_LENGTH;
8918 return -EINVAL;
8919 }
8920
8921 string value;
8922 cmd_getval(cmdmap, "value", value);
8923 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
8924 ss << "value '" << value << "' too long; max length "
8925 << MAX_POOL_APPLICATION_LENGTH;
8926 return -EINVAL;
8927 }
8928
8929 p.application_metadata[app][key] = value;
8930 ss << "set application '" << app << "' key '" << key << "' to '"
8931 << value << "' on pool '" << pool_name << "'";
8932 } else if (boost::algorithm::ends_with(prefix, "rm")) {
8933 if (!app_exists) {
8934 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8935 << "'";
8936 return -ENOENT;
8937 }
8938
8939 string key;
8940 cmd_getval(cmdmap, "key", key);
8941 auto it = p.application_metadata[app].find(key);
8942 if (it == p.application_metadata[app].end()) {
8943 ss << "application '" << app << "' on pool '" << pool_name
8944 << "' does not have key '" << key << "'";
8945 return 0; // idempotent
8946 }
8947
8948 p.application_metadata[app].erase(it);
8949 ss << "removed application '" << app << "' key '" << key << "' on pool '"
8950 << pool_name << "'";
8951 } else {
8952 ceph_abort();
8953 }
8954
8955 if (preparing) {
8956 p.last_change = pending_inc.epoch;
8957 pending_inc.new_pools[pool] = p;
8958 }
8959
8960 // Because we fell through this far, we didn't hit no-op cases,
8961 // so pool was definitely modified
8962 if (modified != nullptr) {
8963 *modified = true;
8964 }
8965
8966 return 0;
8967 }
8968
8969 int OSDMonitor::_prepare_command_osd_crush_remove(
8970 CrushWrapper &newcrush,
8971 int32_t id,
8972 int32_t ancestor,
8973 bool has_ancestor,
8974 bool unlink_only)
8975 {
8976 int err = 0;
8977
8978 if (has_ancestor) {
8979 err = newcrush.remove_item_under(cct, id, ancestor,
8980 unlink_only);
8981 } else {
8982 err = newcrush.remove_item(cct, id, unlink_only);
8983 }
8984 return err;
8985 }
8986
// Stage the (already modified) crush map into the pending incremental,
// replacing any crush blob staged earlier in this proposal.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
}
8992
8993 int OSDMonitor::prepare_command_osd_crush_remove(
8994 CrushWrapper &newcrush,
8995 int32_t id,
8996 int32_t ancestor,
8997 bool has_ancestor,
8998 bool unlink_only)
8999 {
9000 int err = _prepare_command_osd_crush_remove(
9001 newcrush, id, ancestor,
9002 has_ancestor, unlink_only);
9003
9004 if (err < 0)
9005 return err;
9006
9007 ceph_assert(err == 0);
9008 do_osd_crush_remove(newcrush);
9009
9010 return 0;
9011 }
9012
9013 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9014 {
9015 if (osdmap.is_up(id)) {
9016 return -EBUSY;
9017 }
9018
9019 pending_inc.new_state[id] = osdmap.get_state(id);
9020 pending_inc.new_uuid[id] = uuid_d();
9021 pending_metadata_rm.insert(id);
9022 pending_metadata.erase(id);
9023
9024 return 0;
9025 }
9026
9027 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9028 {
9029 ceph_assert(existing_id);
9030 *existing_id = -1;
9031
9032 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9033 if (!osdmap.exists(i) &&
9034 pending_inc.new_up_client.count(i) == 0 &&
9035 (pending_inc.new_state.count(i) == 0 ||
9036 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9037 *existing_id = i;
9038 return -1;
9039 }
9040 }
9041
9042 if (pending_inc.new_max_osd < 0) {
9043 return osdmap.get_max_osd();
9044 }
9045 return pending_inc.new_max_osd;
9046 }
9047
/**
 * Stage creation of an osd in the pending incremental.
 *
 * Validation is presumed to have been performed beforehand (see
 * validate_osd_create()); invariant violations abort via asserts.
 *
 * @param id           requested osd id, or -1 to pick one
 * @param uuid         osd uuid; may be zero (legacy `osd create`)
 * @param device_class if non-empty, set this device class in crush
 * @param[out] new_id  the id actually used for the osd
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // a known uuid either pins us to its existing id or, combined with
    // an explicit id, lets us create that id directly
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycled a free slot below max_osd
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  // optionally stage the crush device class for the new id
  if (device_class.size()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9138
/**
 * Validate whether a new osd can be created with the given id/uuid.
 *
 * Return-code convention (callers depend on the sign):
 *   0        -- creation may proceed (or nothing to validate)
 *   EEXIST   -- (positive!) the osd already exists with a matching
 *               id/uuid; *existing_id is set, operation is idempotent
 *   -EAGAIN  -- an osd with this uuid/id is pending in the incremental
 *   -EEXIST  -- uuid already bound to a *different* id
 *   -EINVAL  -- id already in use under a different uuid
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9209
9210 int OSDMonitor::prepare_command_osd_create(
9211 const int32_t id,
9212 const uuid_d& uuid,
9213 int32_t* existing_id,
9214 stringstream& ss)
9215 {
9216 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9217 ceph_assert(existing_id);
9218 if (osdmap.is_destroyed(id)) {
9219 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9220 "instead.";
9221 return -EINVAL;
9222 }
9223
9224 if (uuid.is_zero()) {
9225 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9226 }
9227
9228 return validate_osd_create(id, uuid, true, existing_id, ss);
9229 }
9230
/**
 * Handle the `osd new` command: create a brand new osd or recreate a
 * previously destroyed one, optionally registering cephx/lockbox
 * secrets and a dm-crypt key with the auth and kv monitors.
 *
 * @param op      originating monitor command request
 * @param cmdmap  parsed command arguments (uuid required, id optional)
 * @param params  options supplied via `-i` (cephx_secret,
 *                cephx_lockbox_secret, dmcrypt_key, crush_device_class)
 * @param ss      human-readable result/error message
 * @param f       optional formatter for structured output
 * @return 0 on success, EEXIST (positive) when the command is a fully
 *         idempotent no-op, negative errno on error
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  // positive EEXIST from validate_osd_create means "already exists with
  // matching id/uuid", i.e. potentially idempotent
  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the two was supplied; they must come together
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    // reuse the destroyed id: toggle DESTROYED off (and NEW on) via the
    // pending state bits
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9494
9495 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9496 {
9497 op->mark_osdmon_event(__func__);
9498 auto m = op->get_req<MMonCommand>();
9499 stringstream ss;
9500 cmdmap_t cmdmap;
9501 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9502 string rs = ss.str();
9503 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9504 return true;
9505 }
9506
9507 MonSession *session = op->get_session();
9508 if (!session) {
9509 derr << __func__ << " no session" << dendl;
9510 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9511 return true;
9512 }
9513
9514 return prepare_command_impl(op, cmdmap);
9515 }
9516
9517 static int parse_reweights(CephContext *cct,
9518 const cmdmap_t& cmdmap,
9519 const OSDMap& osdmap,
9520 map<int32_t, uint32_t>* weights)
9521 {
9522 string weights_str;
9523 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9524 return -EINVAL;
9525 }
9526 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9527 json_spirit::mValue json_value;
9528 if (!json_spirit::read(weights_str, json_value)) {
9529 return -EINVAL;
9530 }
9531 if (json_value.type() != json_spirit::obj_type) {
9532 return -EINVAL;
9533 }
9534 const auto obj = json_value.get_obj();
9535 try {
9536 for (auto& osd_weight : obj) {
9537 auto osd_id = std::stoi(osd_weight.first);
9538 if (!osdmap.exists(osd_id)) {
9539 return -ENOENT;
9540 }
9541 if (osd_weight.second.type() != json_spirit::str_type) {
9542 return -EINVAL;
9543 }
9544 auto weight = std::stoul(osd_weight.second.get_str());
9545 weights->insert({osd_id, weight});
9546 }
9547 } catch (const std::logic_error& e) {
9548 return -EINVAL;
9549 }
9550 return 0;
9551 }
9552
/**
 * Stage destruction of an osd: wipe its auth secrets and config-keys
 * and mark it DESTROYED in the pending incremental. The caller is
 * responsible for proposing the pending map (see note at the end).
 *
 * @return 0 on success (including the already-destroyed idempotent
 *         case), -ENOENT if the osd does not exist, or a negative
 *         errno from auth validation.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // -ENOENT from either service means its part of the destroy already
  // happened (e.g. a replay); treat that as idempotent, not an error.
  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage DESTROYED state and clear the uuid in the pending incremental
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9624
/**
 * Stage a full purge of an osd: remove it from crush, destroy it
 * (auth/config-key cleanup), and remove it from the osdmap. The osd
 * must not be up; the caller is responsible for proposing.
 *
 * @return 0 on success, -ENOENT when the purge is a complete no-op
 *         (osd already gone), or a negative errno on failure.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, we this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: try the crush removal on a scratch map; -ENOENT means the
  // osd was already removed from crush (possible idempotent replay)
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: commit the delayed crush update into the pending incremental
  do_osd_crush_remove(newcrush);
  return 0;
}
9693
9694 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9695 const cmdmap_t& cmdmap)
9696 {
9697 op->mark_osdmon_event(__func__);
9698 auto m = op->get_req<MMonCommand>();
9699 bool ret = false;
9700 stringstream ss;
9701 string rs;
9702 bufferlist rdata;
9703 int err = 0;
9704
9705 string format;
9706 cmd_getval(cmdmap, "format", format, string("plain"));
9707 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9708
9709 string prefix;
9710 cmd_getval(cmdmap, "prefix", prefix);
9711
9712 int64_t osdid;
9713 string osd_name;
9714 bool osdid_present = false;
9715 if (prefix != "osd pg-temp" &&
9716 prefix != "osd pg-upmap" &&
9717 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9718 osdid_present = cmd_getval(cmdmap, "id", osdid);
9719 }
9720 if (osdid_present) {
9721 ostringstream oss;
9722 oss << "osd." << osdid;
9723 osd_name = oss.str();
9724 }
9725
9726 // Even if there's a pending state with changes that could affect
9727 // a command, considering that said state isn't yet committed, we
9728 // just don't care about those changes if the command currently being
9729 // handled acts as a no-op against the current committed state.
9730 // In a nutshell, we assume this command happens *before*.
9731 //
9732 // Let me make this clearer:
9733 //
9734 // - If we have only one client, and that client issues some
9735 // operation that would conflict with this operation but is
9736 // still on the pending state, then we would be sure that said
9737 // operation wouldn't have returned yet, so the client wouldn't
9738 // issue this operation (unless the client didn't wait for the
9739 // operation to finish, and that would be the client's own fault).
9740 //
9741 // - If we have more than one client, each client will observe
9742 // whatever is the state at the moment of the commit. So, if we
9743 // have two clients, one issuing an unlink and another issuing a
9744 // link, and if the link happens while the unlink is still on the
9745 // pending state, from the link's point-of-view this is a no-op.
9746 // If different clients are issuing conflicting operations and
9747 // they care about that, then the clients should make sure they
9748 // enforce some kind of concurrency mechanism -- from our
9749 // perspective that's what Douglas Adams would call an SEP.
9750 //
9751 // This should be used as a general guideline for most commands handled
9752 // in this function. Adapt as you see fit, but please bear in mind that
9753 // this is the expected behavior.
9754
9755
9756 if (prefix == "osd setcrushmap" ||
9757 (prefix == "osd crush set" && !osdid_present)) {
9758 if (pending_inc.crush.length()) {
9759 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9760 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9761 return true;
9762 }
9763 dout(10) << "prepare_command setting new crush map" << dendl;
9764 bufferlist data(m->get_data());
9765 CrushWrapper crush;
9766 try {
9767 auto bl = data.cbegin();
9768 crush.decode(bl);
9769 }
9770 catch (const std::exception &e) {
9771 err = -EINVAL;
9772 ss << "Failed to parse crushmap: " << e.what();
9773 goto reply;
9774 }
9775
9776 int64_t prior_version = 0;
9777 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9778 if (prior_version == osdmap.get_crush_version() - 1) {
9779 // see if we are a resend of the last update. this is imperfect
9780 // (multiple racing updaters may not both get reliable success)
9781 // but we expect crush updaters (via this interface) to be rare-ish.
9782 bufferlist current, proposed;
9783 osdmap.crush->encode(current, mon.get_quorum_con_features());
9784 crush.encode(proposed, mon.get_quorum_con_features());
9785 if (current.contents_equal(proposed)) {
9786 dout(10) << __func__
9787 << " proposed matches current and version equals previous"
9788 << dendl;
9789 err = 0;
9790 ss << osdmap.get_crush_version();
9791 goto reply;
9792 }
9793 }
9794 if (prior_version != osdmap.get_crush_version()) {
9795 err = -EPERM;
9796 ss << "prior_version " << prior_version << " != crush version "
9797 << osdmap.get_crush_version();
9798 goto reply;
9799 }
9800 }
9801
9802 if (crush.has_legacy_rule_ids()) {
9803 err = -EINVAL;
9804 ss << "crush maps with ruleset != ruleid are no longer allowed";
9805 goto reply;
9806 }
9807 if (!validate_crush_against_features(&crush, ss)) {
9808 err = -EINVAL;
9809 goto reply;
9810 }
9811
9812 err = osdmap.validate_crush_rules(&crush, &ss);
9813 if (err < 0) {
9814 goto reply;
9815 }
9816
9817 if (g_conf()->mon_osd_crush_smoke_test) {
9818 // sanity check: test some inputs to make sure this map isn't
9819 // totally broken
9820 dout(10) << " testing map" << dendl;
9821 stringstream ess;
9822 CrushTester tester(crush, ess);
9823 tester.set_min_x(0);
9824 tester.set_max_x(50);
9825 auto start = ceph::coarse_mono_clock::now();
9826 int r = tester.test_with_fork(g_conf()->mon_lease);
9827 auto duration = ceph::coarse_mono_clock::now() - start;
9828 if (r < 0) {
9829 dout(10) << " tester.test_with_fork returns " << r
9830 << ": " << ess.str() << dendl;
9831 ss << "crush smoke test failed with " << r << ": " << ess.str();
9832 err = r;
9833 goto reply;
9834 }
9835 dout(10) << __func__ << " crush somke test duration: "
9836 << duration << ", result: " << ess.str() << dendl;
9837 }
9838
9839 pending_inc.crush = data;
9840 ss << osdmap.get_crush_version() + 1;
9841 goto update;
9842
9843 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9844 CrushWrapper newcrush;
9845 _get_pending_crush(newcrush);
9846 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9847 int bid = -1 - b;
9848 if (newcrush.bucket_exists(bid) &&
9849 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9850 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9851 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9852 }
9853 }
9854 if (!validate_crush_against_features(&newcrush, ss)) {
9855 err = -EINVAL;
9856 goto reply;
9857 }
9858 pending_inc.crush.clear();
9859 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
9860 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9861 get_last_committed() + 1));
9862 return true;
9863 } else if (prefix == "osd crush set-device-class") {
9864 string device_class;
9865 if (!cmd_getval(cmdmap, "class", device_class)) {
9866 err = -EINVAL; // no value!
9867 goto reply;
9868 }
9869
9870 bool stop = false;
9871 vector<string> idvec;
9872 cmd_getval(cmdmap, "ids", idvec);
9873 CrushWrapper newcrush;
9874 _get_pending_crush(newcrush);
9875 set<int> updated;
9876 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9877 set<int> osds;
9878 // wildcard?
9879 if (j == 0 &&
9880 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9881 osdmap.get_all_osds(osds);
9882 stop = true;
9883 } else {
9884 // try traditional single osd way
9885 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9886 if (osd < 0) {
9887 // ss has reason for failure
9888 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9889 err = -EINVAL;
9890 continue;
9891 }
9892 osds.insert(osd);
9893 }
9894
9895 for (auto &osd : osds) {
9896 if (!osdmap.exists(osd)) {
9897 ss << "osd." << osd << " does not exist. ";
9898 continue;
9899 }
9900
9901 ostringstream oss;
9902 oss << "osd." << osd;
9903 string name = oss.str();
9904
9905 if (newcrush.get_max_devices() < osd + 1) {
9906 newcrush.set_max_devices(osd + 1);
9907 }
9908 string action;
9909 if (newcrush.item_exists(osd)) {
9910 action = "updating";
9911 } else {
9912 action = "creating";
9913 newcrush.set_item_name(osd, name);
9914 }
9915
9916 dout(5) << action << " crush item id " << osd << " name '" << name
9917 << "' device_class '" << device_class << "'"
9918 << dendl;
9919 err = newcrush.update_device_class(osd, device_class, name, &ss);
9920 if (err < 0) {
9921 goto reply;
9922 }
9923 if (err == 0 && !_have_pending_crush()) {
9924 if (!stop) {
9925 // for single osd only, wildcard makes too much noise
9926 ss << "set-device-class item id " << osd << " name '" << name
9927 << "' device_class '" << device_class << "': no change. ";
9928 }
9929 } else {
9930 updated.insert(osd);
9931 }
9932 }
9933 }
9934
9935 pending_inc.crush.clear();
9936 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
9937 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9938 getline(ss, rs);
9939 wait_for_finished_proposal(
9940 op,
9941 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9942 return true;
9943 } else if (prefix == "osd crush rm-device-class") {
9944 bool stop = false;
9945 vector<string> idvec;
9946 cmd_getval(cmdmap, "ids", idvec);
9947 CrushWrapper newcrush;
9948 _get_pending_crush(newcrush);
9949 set<int> updated;
9950
9951 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9952 set<int> osds;
9953
9954 // wildcard?
9955 if (j == 0 &&
9956 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9957 osdmap.get_all_osds(osds);
9958 stop = true;
9959 } else {
9960 // try traditional single osd way
9961 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9962 if (osd < 0) {
9963 // ss has reason for failure
9964 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9965 err = -EINVAL;
9966 goto reply;
9967 }
9968 osds.insert(osd);
9969 }
9970
9971 for (auto &osd : osds) {
9972 if (!osdmap.exists(osd)) {
9973 ss << "osd." << osd << " does not exist. ";
9974 continue;
9975 }
9976
9977 auto class_name = newcrush.get_item_class(osd);
9978 if (!class_name) {
9979 ss << "osd." << osd << " belongs to no class, ";
9980 continue;
9981 }
9982 // note that we do not verify if class_is_in_use here
9983 // in case the device is misclassified and user wants
9984 // to forcibly reset it...
9985
9986 err = newcrush.remove_device_class(cct, osd, &ss);
9987 if (err < 0) {
9988 // ss has reason for failure
9989 goto reply;
9990 }
9991 updated.insert(osd);
9992 }
9993 }
9994
9995 pending_inc.crush.clear();
9996 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
9997 ss << "done removing class of osd(s): " << updated;
9998 getline(ss, rs);
9999 wait_for_finished_proposal(
10000 op,
10001 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10002 return true;
10003 } else if (prefix == "osd crush class create") {
10004 string device_class;
10005 if (!cmd_getval(cmdmap, "class", device_class)) {
10006 err = -EINVAL; // no value!
10007 goto reply;
10008 }
10009 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10010 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10011 << "luminous' before using crush device classes";
10012 err = -EPERM;
10013 goto reply;
10014 }
10015 if (!_have_pending_crush() &&
10016 _get_stable_crush().class_exists(device_class)) {
10017 ss << "class '" << device_class << "' already exists";
10018 goto reply;
10019 }
10020 CrushWrapper newcrush;
10021 _get_pending_crush(newcrush);
10022 if (newcrush.class_exists(device_class)) {
10023 ss << "class '" << device_class << "' already exists";
10024 goto update;
10025 }
10026 int class_id = newcrush.get_or_create_class_id(device_class);
10027 pending_inc.crush.clear();
10028 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10029 ss << "created class " << device_class << " with id " << class_id
10030 << " to crush map";
10031 goto update;
10032 } else if (prefix == "osd crush class rm") {
10033 string device_class;
10034 if (!cmd_getval(cmdmap, "class", device_class)) {
10035 err = -EINVAL; // no value!
10036 goto reply;
10037 }
10038 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10039 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10040 << "luminous' before using crush device classes";
10041 err = -EPERM;
10042 goto reply;
10043 }
10044
10045 if (!osdmap.crush->class_exists(device_class)) {
10046 err = 0;
10047 goto reply;
10048 }
10049
10050 CrushWrapper newcrush;
10051 _get_pending_crush(newcrush);
10052 if (!newcrush.class_exists(device_class)) {
10053 err = 0; // make command idempotent
10054 goto wait;
10055 }
10056 int class_id = newcrush.get_class_id(device_class);
10057 stringstream ts;
10058 if (newcrush.class_is_in_use(class_id, &ts)) {
10059 err = -EBUSY;
10060 ss << "class '" << device_class << "' " << ts.str();
10061 goto reply;
10062 }
10063
10064 // check if class is used by any erasure-code-profiles
10065 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10066 osdmap.get_erasure_code_profiles();
10067 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10068 #ifdef HAVE_STDLIB_MAP_SPLICING
10069 ec_profiles.merge(old_ec_profiles);
10070 #else
10071 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10072 make_move_iterator(end(old_ec_profiles)));
10073 #endif
10074 list<string> referenced_by;
10075 for (auto &i: ec_profiles) {
10076 for (auto &j: i.second) {
10077 if ("crush-device-class" == j.first && device_class == j.second) {
10078 referenced_by.push_back(i.first);
10079 }
10080 }
10081 }
10082 if (!referenced_by.empty()) {
10083 err = -EBUSY;
10084 ss << "class '" << device_class
10085 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10086 goto reply;
10087 }
10088
10089 set<int> osds;
10090 newcrush.get_devices_by_class(device_class, &osds);
10091 for (auto& p: osds) {
10092 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
10093 if (err < 0) {
10094 // ss has reason for failure
10095 goto reply;
10096 }
10097 }
10098
10099 if (osds.empty()) {
10100 // empty class, remove directly
10101 err = newcrush.remove_class_name(device_class);
10102 if (err < 0) {
10103 ss << "class '" << device_class << "' cannot be removed '"
10104 << cpp_strerror(err) << "'";
10105 goto reply;
10106 }
10107 }
10108
10109 pending_inc.crush.clear();
10110 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10111 ss << "removed class " << device_class << " with id " << class_id
10112 << " from crush map";
10113 goto update;
10114 } else if (prefix == "osd crush class rename") {
10115 string srcname, dstname;
10116 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10117 err = -EINVAL;
10118 goto reply;
10119 }
10120 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10121 err = -EINVAL;
10122 goto reply;
10123 }
10124
10125 CrushWrapper newcrush;
10126 _get_pending_crush(newcrush);
10127 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10128 // suppose this is a replay and return success
10129 // so command is idempotent
10130 ss << "already renamed to '" << dstname << "'";
10131 err = 0;
10132 goto reply;
10133 }
10134
10135 err = newcrush.rename_class(srcname, dstname);
10136 if (err < 0) {
10137 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10138 << cpp_strerror(err);
10139 goto reply;
10140 }
10141
10142 pending_inc.crush.clear();
10143 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10144 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10145 goto update;
10146 } else if (prefix == "osd crush add-bucket") {
10147 // osd crush add-bucket <name> <type> [<location> ...]
10148 string name, typestr;
10149 vector<string> argvec;
10150 cmd_getval(cmdmap, "name", name);
10151 cmd_getval(cmdmap, "type", typestr);
10152 cmd_getval(cmdmap, "args", argvec);
10153 map<string,string> loc;
10154 if (!argvec.empty()) {
10155 CrushWrapper::parse_loc_map(argvec, &loc);
10156 dout(0) << "will create and move bucket '" << name
10157 << "' to location " << loc << dendl;
10158 }
10159
10160 if (!_have_pending_crush() &&
10161 _get_stable_crush().name_exists(name)) {
10162 ss << "bucket '" << name << "' already exists";
10163 goto reply;
10164 }
10165
10166 CrushWrapper newcrush;
10167 _get_pending_crush(newcrush);
10168
10169 if (newcrush.name_exists(name)) {
10170 ss << "bucket '" << name << "' already exists";
10171 goto update;
10172 }
10173 int type = newcrush.get_type_id(typestr);
10174 if (type < 0) {
10175 ss << "type '" << typestr << "' does not exist";
10176 err = -EINVAL;
10177 goto reply;
10178 }
10179 if (type == 0) {
10180 ss << "type '" << typestr << "' is for devices, not buckets";
10181 err = -EINVAL;
10182 goto reply;
10183 }
10184 int bucketno;
10185 err = newcrush.add_bucket(0, 0,
10186 CRUSH_HASH_DEFAULT, type, 0, NULL,
10187 NULL, &bucketno);
10188 if (err < 0) {
10189 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10190 goto reply;
10191 }
10192 err = newcrush.set_item_name(bucketno, name);
10193 if (err < 0) {
10194 ss << "error setting bucket name to '" << name << "'";
10195 goto reply;
10196 }
10197
10198 if (!loc.empty()) {
10199 if (!newcrush.check_item_loc(cct, bucketno, loc,
10200 (int *)NULL)) {
10201 err = newcrush.move_bucket(cct, bucketno, loc);
10202 if (err < 0) {
10203 ss << "error moving bucket '" << name << "' to location " << loc;
10204 goto reply;
10205 }
10206 } else {
10207 ss << "no need to move item id " << bucketno << " name '" << name
10208 << "' to location " << loc << " in crush map";
10209 }
10210 }
10211
10212 pending_inc.crush.clear();
10213 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10214 if (loc.empty()) {
10215 ss << "added bucket " << name << " type " << typestr
10216 << " to crush map";
10217 } else {
10218 ss << "added bucket " << name << " type " << typestr
10219 << " to location " << loc;
10220 }
10221 goto update;
10222 } else if (prefix == "osd crush rename-bucket") {
10223 string srcname, dstname;
10224 cmd_getval(cmdmap, "srcname", srcname);
10225 cmd_getval(cmdmap, "dstname", dstname);
10226
10227 err = crush_rename_bucket(srcname, dstname, &ss);
10228 if (err == -EALREADY) // equivalent to success for idempotency
10229 err = 0;
10230 if (err)
10231 goto reply;
10232 else
10233 goto update;
10234 } else if (prefix == "osd crush weight-set create" ||
10235 prefix == "osd crush weight-set create-compat") {
10236 CrushWrapper newcrush;
10237 _get_pending_crush(newcrush);
10238 int64_t pool;
10239 int positions;
10240 if (newcrush.has_non_straw2_buckets()) {
10241 ss << "crush map contains one or more bucket(s) that are not straw2";
10242 err = -EPERM;
10243 goto reply;
10244 }
10245 if (prefix == "osd crush weight-set create") {
10246 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10247 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10248 ss << "require_min_compat_client "
10249 << osdmap.require_min_compat_client
10250 << " < luminous, which is required for per-pool weight-sets. "
10251 << "Try 'ceph osd set-require-min-compat-client luminous' "
10252 << "before using the new interface";
10253 err = -EPERM;
10254 goto reply;
10255 }
10256 string poolname, mode;
10257 cmd_getval(cmdmap, "pool", poolname);
10258 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10259 if (pool < 0) {
10260 ss << "pool '" << poolname << "' not found";
10261 err = -ENOENT;
10262 goto reply;
10263 }
10264 cmd_getval(cmdmap, "mode", mode);
10265 if (mode != "flat" && mode != "positional") {
10266 ss << "unrecognized weight-set mode '" << mode << "'";
10267 err = -EINVAL;
10268 goto reply;
10269 }
10270 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10271 } else {
10272 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10273 positions = 1;
10274 }
10275 if (!newcrush.create_choose_args(pool, positions)) {
10276 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10277 ss << "compat weight-set already created";
10278 } else {
10279 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10280 << "' already created";
10281 }
10282 goto reply;
10283 }
10284 pending_inc.crush.clear();
10285 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10286 goto update;
10287
10288 } else if (prefix == "osd crush weight-set rm" ||
10289 prefix == "osd crush weight-set rm-compat") {
10290 CrushWrapper newcrush;
10291 _get_pending_crush(newcrush);
10292 int64_t pool;
10293 if (prefix == "osd crush weight-set rm") {
10294 string poolname;
10295 cmd_getval(cmdmap, "pool", poolname);
10296 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10297 if (pool < 0) {
10298 ss << "pool '" << poolname << "' not found";
10299 err = -ENOENT;
10300 goto reply;
10301 }
10302 } else {
10303 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10304 }
10305 newcrush.rm_choose_args(pool);
10306 pending_inc.crush.clear();
10307 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10308 goto update;
10309
10310 } else if (prefix == "osd crush weight-set reweight" ||
10311 prefix == "osd crush weight-set reweight-compat") {
10312 string poolname, item;
10313 vector<double> weight;
10314 cmd_getval(cmdmap, "pool", poolname);
10315 cmd_getval(cmdmap, "item", item);
10316 cmd_getval(cmdmap, "weight", weight);
10317 CrushWrapper newcrush;
10318 _get_pending_crush(newcrush);
10319 int64_t pool;
10320 if (prefix == "osd crush weight-set reweight") {
10321 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10322 if (pool < 0) {
10323 ss << "pool '" << poolname << "' not found";
10324 err = -ENOENT;
10325 goto reply;
10326 }
10327 if (!newcrush.have_choose_args(pool)) {
10328 ss << "no weight-set for pool '" << poolname << "'";
10329 err = -ENOENT;
10330 goto reply;
10331 }
10332 auto arg_map = newcrush.choose_args_get(pool);
10333 int positions = newcrush.get_choose_args_positions(arg_map);
10334 if (weight.size() != (size_t)positions) {
10335 ss << "must specify exact " << positions << " weight values";
10336 err = -EINVAL;
10337 goto reply;
10338 }
10339 } else {
10340 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10341 if (!newcrush.have_choose_args(pool)) {
10342 ss << "no backward-compatible weight-set";
10343 err = -ENOENT;
10344 goto reply;
10345 }
10346 }
10347 if (!newcrush.name_exists(item)) {
10348 ss << "item '" << item << "' does not exist";
10349 err = -ENOENT;
10350 goto reply;
10351 }
10352 err = newcrush.choose_args_adjust_item_weightf(
10353 cct,
10354 newcrush.choose_args_get(pool),
10355 newcrush.get_item_id(item),
10356 weight,
10357 &ss);
10358 if (err < 0) {
10359 goto reply;
10360 }
10361 err = 0;
10362 pending_inc.crush.clear();
10363 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10364 goto update;
10365 } else if (osdid_present &&
10366 (prefix == "osd crush set" || prefix == "osd crush add")) {
10367 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10368 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10369 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10370
10371 if (!osdmap.exists(osdid)) {
10372 err = -ENOENT;
10373 ss << osd_name
10374 << " does not exist. Create it before updating the crush map";
10375 goto reply;
10376 }
10377
10378 double weight;
10379 if (!cmd_getval(cmdmap, "weight", weight)) {
10380 ss << "unable to parse weight value '"
10381 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10382 err = -EINVAL;
10383 goto reply;
10384 }
10385
10386 string args;
10387 vector<string> argvec;
10388 cmd_getval(cmdmap, "args", argvec);
10389 map<string,string> loc;
10390 CrushWrapper::parse_loc_map(argvec, &loc);
10391
10392 if (prefix == "osd crush set"
10393 && !_get_stable_crush().item_exists(osdid)) {
10394 err = -ENOENT;
10395 ss << "unable to set item id " << osdid << " name '" << osd_name
10396 << "' weight " << weight << " at location " << loc
10397 << ": does not exist";
10398 goto reply;
10399 }
10400
10401 dout(5) << "adding/updating crush item id " << osdid << " name '"
10402 << osd_name << "' weight " << weight << " at location "
10403 << loc << dendl;
10404 CrushWrapper newcrush;
10405 _get_pending_crush(newcrush);
10406
10407 string action;
10408 if (prefix == "osd crush set" ||
10409 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10410 action = "set";
10411 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10412 } else {
10413 action = "add";
10414 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10415 if (err == 0)
10416 err = 1;
10417 }
10418
10419 if (err < 0)
10420 goto reply;
10421
10422 if (err == 0 && !_have_pending_crush()) {
10423 ss << action << " item id " << osdid << " name '" << osd_name
10424 << "' weight " << weight << " at location " << loc << ": no change";
10425 goto reply;
10426 }
10427
10428 pending_inc.crush.clear();
10429 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10430 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10431 << weight << " at location " << loc << " to crush map";
10432 getline(ss, rs);
10433 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10434 get_last_committed() + 1));
10435 return true;
10436
10437 } else if (prefix == "osd crush create-or-move") {
10438 do {
10439 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10440 if (!osdmap.exists(osdid)) {
10441 err = -ENOENT;
10442 ss << osd_name
10443 << " does not exist. create it before updating the crush map";
10444 goto reply;
10445 }
10446
10447 double weight;
10448 if (!cmd_getval(cmdmap, "weight", weight)) {
10449 ss << "unable to parse weight value '"
10450 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10451 err = -EINVAL;
10452 goto reply;
10453 }
10454
10455 string args;
10456 vector<string> argvec;
10457 cmd_getval(cmdmap, "args", argvec);
10458 map<string,string> loc;
10459 CrushWrapper::parse_loc_map(argvec, &loc);
10460
10461 dout(0) << "create-or-move crush item name '" << osd_name
10462 << "' initial_weight " << weight << " at location " << loc
10463 << dendl;
10464
10465 CrushWrapper newcrush;
10466 _get_pending_crush(newcrush);
10467
10468 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10469 g_conf()->osd_crush_update_weight_set);
10470 if (err == 0) {
10471 ss << "create-or-move updated item name '" << osd_name
10472 << "' weight " << weight
10473 << " at location " << loc << " to crush map";
10474 break;
10475 }
10476 if (err > 0) {
10477 pending_inc.crush.clear();
10478 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10479 ss << "create-or-move updating item name '" << osd_name
10480 << "' weight " << weight
10481 << " at location " << loc << " to crush map";
10482 getline(ss, rs);
10483 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10484 get_last_committed() + 1));
10485 return true;
10486 }
10487 } while (false);
10488
10489 } else if (prefix == "osd crush move") {
10490 do {
10491 // osd crush move <name> <loc1> [<loc2> ...]
10492 string name;
10493 vector<string> argvec;
10494 cmd_getval(cmdmap, "name", name);
10495 cmd_getval(cmdmap, "args", argvec);
10496 map<string,string> loc;
10497 CrushWrapper::parse_loc_map(argvec, &loc);
10498
10499 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10500 CrushWrapper newcrush;
10501 _get_pending_crush(newcrush);
10502
10503 if (!newcrush.name_exists(name)) {
10504 err = -ENOENT;
10505 ss << "item " << name << " does not exist";
10506 break;
10507 }
10508 int id = newcrush.get_item_id(name);
10509
10510 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10511 if (id >= 0) {
10512 err = newcrush.create_or_move_item(
10513 cct, id, 0, name, loc,
10514 g_conf()->osd_crush_update_weight_set);
10515 } else {
10516 err = newcrush.move_bucket(cct, id, loc);
10517 }
10518 if (err >= 0) {
10519 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10520 pending_inc.crush.clear();
10521 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10522 getline(ss, rs);
10523 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10524 get_last_committed() + 1));
10525 return true;
10526 }
10527 } else {
10528 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10529 err = 0;
10530 }
10531 } while (false);
10532 } else if (prefix == "osd crush swap-bucket") {
10533 string source, dest;
10534 cmd_getval(cmdmap, "source", source);
10535 cmd_getval(cmdmap, "dest", dest);
10536
10537 bool force = false;
10538 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10539
10540 CrushWrapper newcrush;
10541 _get_pending_crush(newcrush);
10542 if (!newcrush.name_exists(source)) {
10543 ss << "source item " << source << " does not exist";
10544 err = -ENOENT;
10545 goto reply;
10546 }
10547 if (!newcrush.name_exists(dest)) {
10548 ss << "dest item " << dest << " does not exist";
10549 err = -ENOENT;
10550 goto reply;
10551 }
10552 int sid = newcrush.get_item_id(source);
10553 int did = newcrush.get_item_id(dest);
10554 int sparent;
10555 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10556 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10557 err = -EPERM;
10558 goto reply;
10559 }
10560 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10561 !force) {
10562 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10563 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10564 << "; pass --yes-i-really-mean-it to proceed anyway";
10565 err = -EPERM;
10566 goto reply;
10567 }
10568 int r = newcrush.swap_bucket(cct, sid, did);
10569 if (r < 0) {
10570 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10571 err = r;
10572 goto reply;
10573 }
10574 ss << "swapped bucket of " << source << " to " << dest;
10575 pending_inc.crush.clear();
10576 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10577 wait_for_finished_proposal(op,
10578 new Monitor::C_Command(mon, op, err, ss.str(),
10579 get_last_committed() + 1));
10580 return true;
10581 } else if (prefix == "osd crush link") {
10582 // osd crush link <name> <loc1> [<loc2> ...]
10583 string name;
10584 cmd_getval(cmdmap, "name", name);
10585 vector<string> argvec;
10586 cmd_getval(cmdmap, "args", argvec);
10587 map<string,string> loc;
10588 CrushWrapper::parse_loc_map(argvec, &loc);
10589
10590 // Need an explicit check for name_exists because get_item_id returns
10591 // 0 when the name is not found.
10592 int id = osdmap.crush->get_item_id(name);
10593 if (!osdmap.crush->name_exists(name)) {
10594 err = -ENOENT;
10595 ss << "item " << name << " does not exist";
10596 goto reply;
10597 } else {
10598 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10599 }
10600 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10601 ss << "no need to move item id " << id << " name '" << name
10602 << "' to location " << loc << " in crush map";
10603 err = 0;
10604 goto reply;
10605 }
10606
10607 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10608 CrushWrapper newcrush;
10609 _get_pending_crush(newcrush);
10610
10611 if (!newcrush.name_exists(name)) {
10612 err = -ENOENT;
10613 ss << "item " << name << " does not exist";
10614 goto reply;
10615 } else {
10616 int id = newcrush.get_item_id(name);
10617 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10618 err = newcrush.link_bucket(cct, id, loc);
10619 if (err >= 0) {
10620 ss << "linked item id " << id << " name '" << name
10621 << "' to location " << loc << " in crush map";
10622 pending_inc.crush.clear();
10623 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10624 } else {
10625 ss << "cannot link item id " << id << " name '" << name
10626 << "' to location " << loc;
10627 goto reply;
10628 }
10629 } else {
10630 ss << "no need to move item id " << id << " name '" << name
10631 << "' to location " << loc << " in crush map";
10632 err = 0;
10633 }
10634 }
10635 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10636 get_last_committed() + 1));
10637 return true;
10638 } else if (prefix == "osd crush rm" ||
10639 prefix == "osd crush remove" ||
10640 prefix == "osd crush unlink") {
10641 do {
10642 // osd crush rm|remove|unlink <name> [ancestor]
10643 CrushWrapper newcrush;
10644 _get_pending_crush(newcrush);
10645
10646 string name;
10647 cmd_getval(cmdmap, "name", name);
10648
10649 if (!osdmap.crush->name_exists(name)) {
10650 err = 0;
10651 ss << "device '" << name << "' does not appear in the crush map";
10652 break;
10653 }
10654 if (!newcrush.name_exists(name)) {
10655 err = 0;
10656 ss << "device '" << name << "' does not appear in the crush map";
10657 getline(ss, rs);
10658 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10659 get_last_committed() + 1));
10660 return true;
10661 }
10662 int id = newcrush.get_item_id(name);
10663 int ancestor = 0;
10664
10665 bool unlink_only = prefix == "osd crush unlink";
10666 string ancestor_str;
10667 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10668 if (!newcrush.name_exists(ancestor_str)) {
10669 err = -ENOENT;
10670 ss << "ancestor item '" << ancestor_str
10671 << "' does not appear in the crush map";
10672 break;
10673 }
10674 ancestor = newcrush.get_item_id(ancestor_str);
10675 }
10676
10677 err = prepare_command_osd_crush_remove(
10678 newcrush,
10679 id, ancestor,
10680 (ancestor < 0), unlink_only);
10681
10682 if (err == -ENOENT) {
10683 ss << "item " << id << " does not appear in that position";
10684 err = 0;
10685 break;
10686 }
10687 if (err == 0) {
10688 if (!unlink_only)
10689 pending_inc.new_crush_node_flags[id] = 0;
10690 ss << "removed item id " << id << " name '" << name << "' from crush map";
10691 getline(ss, rs);
10692 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10693 get_last_committed() + 1));
10694 return true;
10695 }
10696 } while (false);
10697
10698 } else if (prefix == "osd crush reweight-all") {
10699 CrushWrapper newcrush;
10700 _get_pending_crush(newcrush);
10701
10702 newcrush.reweight(cct);
10703 pending_inc.crush.clear();
10704 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10705 ss << "reweighted crush hierarchy";
10706 getline(ss, rs);
10707 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10708 get_last_committed() + 1));
10709 return true;
10710 } else if (prefix == "osd crush reweight") {
10711 // osd crush reweight <name> <weight>
10712 CrushWrapper newcrush;
10713 _get_pending_crush(newcrush);
10714
10715 string name;
10716 cmd_getval(cmdmap, "name", name);
10717 if (!newcrush.name_exists(name)) {
10718 err = -ENOENT;
10719 ss << "device '" << name << "' does not appear in the crush map";
10720 goto reply;
10721 }
10722
10723 int id = newcrush.get_item_id(name);
10724 if (id < 0) {
10725 ss << "device '" << name << "' is not a leaf in the crush map";
10726 err = -EINVAL;
10727 goto reply;
10728 }
10729 double w;
10730 if (!cmd_getval(cmdmap, "weight", w)) {
10731 ss << "unable to parse weight value '"
10732 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10733 err = -EINVAL;
10734 goto reply;
10735 }
10736
10737 err = newcrush.adjust_item_weightf(cct, id, w,
10738 g_conf()->osd_crush_update_weight_set);
10739 if (err < 0)
10740 goto reply;
10741 pending_inc.crush.clear();
10742 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10743 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10744 << " in crush map";
10745 getline(ss, rs);
10746 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10747 get_last_committed() + 1));
10748 return true;
10749 } else if (prefix == "osd crush reweight-subtree") {
10750 // osd crush reweight-subtree <name> <weight>
10751 CrushWrapper newcrush;
10752 _get_pending_crush(newcrush);
10753
10754 string name;
10755 cmd_getval(cmdmap, "name", name);
10756 if (!newcrush.name_exists(name)) {
10757 err = -ENOENT;
10758 ss << "device '" << name << "' does not appear in the crush map";
10759 goto reply;
10760 }
10761
10762 int id = newcrush.get_item_id(name);
10763 if (id >= 0) {
10764 ss << "device '" << name << "' is not a subtree in the crush map";
10765 err = -EINVAL;
10766 goto reply;
10767 }
10768 double w;
10769 if (!cmd_getval(cmdmap, "weight", w)) {
10770 ss << "unable to parse weight value '"
10771 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10772 err = -EINVAL;
10773 goto reply;
10774 }
10775
10776 err = newcrush.adjust_subtree_weightf(cct, id, w,
10777 g_conf()->osd_crush_update_weight_set);
10778 if (err < 0)
10779 goto reply;
10780 pending_inc.crush.clear();
10781 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10782 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10783 << " in crush map";
10784 getline(ss, rs);
10785 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10786 get_last_committed() + 1));
10787 return true;
10788 } else if (prefix == "osd crush tunables") {
10789 CrushWrapper newcrush;
10790 _get_pending_crush(newcrush);
10791
10792 err = 0;
10793 string profile;
10794 cmd_getval(cmdmap, "profile", profile);
10795 if (profile == "legacy" || profile == "argonaut") {
10796 newcrush.set_tunables_legacy();
10797 } else if (profile == "bobtail") {
10798 newcrush.set_tunables_bobtail();
10799 } else if (profile == "firefly") {
10800 newcrush.set_tunables_firefly();
10801 } else if (profile == "hammer") {
10802 newcrush.set_tunables_hammer();
10803 } else if (profile == "jewel") {
10804 newcrush.set_tunables_jewel();
10805 } else if (profile == "optimal") {
10806 newcrush.set_tunables_optimal();
10807 } else if (profile == "default") {
10808 newcrush.set_tunables_default();
10809 } else {
10810 ss << "unrecognized profile '" << profile << "'";
10811 err = -EINVAL;
10812 goto reply;
10813 }
10814
10815 if (!validate_crush_against_features(&newcrush, ss)) {
10816 err = -EINVAL;
10817 goto reply;
10818 }
10819
10820 pending_inc.crush.clear();
10821 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10822 ss << "adjusted tunables profile to " << profile;
10823 getline(ss, rs);
10824 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10825 get_last_committed() + 1));
10826 return true;
10827 } else if (prefix == "osd crush set-tunable") {
10828 CrushWrapper newcrush;
10829 _get_pending_crush(newcrush);
10830
10831 err = 0;
10832 string tunable;
10833 cmd_getval(cmdmap, "tunable", tunable);
10834
10835 int64_t value = -1;
10836 if (!cmd_getval(cmdmap, "value", value)) {
10837 err = -EINVAL;
10838 ss << "failed to parse integer value "
10839 << cmd_vartype_stringify(cmdmap.at("value"));
10840 goto reply;
10841 }
10842
10843 if (tunable == "straw_calc_version") {
10844 if (value != 0 && value != 1) {
10845 ss << "value must be 0 or 1; got " << value;
10846 err = -EINVAL;
10847 goto reply;
10848 }
10849 newcrush.set_straw_calc_version(value);
10850 } else {
10851 ss << "unrecognized tunable '" << tunable << "'";
10852 err = -EINVAL;
10853 goto reply;
10854 }
10855
10856 if (!validate_crush_against_features(&newcrush, ss)) {
10857 err = -EINVAL;
10858 goto reply;
10859 }
10860
10861 pending_inc.crush.clear();
10862 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10863 ss << "adjusted tunable " << tunable << " to " << value;
10864 getline(ss, rs);
10865 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10866 get_last_committed() + 1));
10867 return true;
10868
10869 } else if (prefix == "osd crush rule create-simple") {
10870 string name, root, type, mode;
10871 cmd_getval(cmdmap, "name", name);
10872 cmd_getval(cmdmap, "root", root);
10873 cmd_getval(cmdmap, "type", type);
10874 cmd_getval(cmdmap, "mode", mode);
10875 if (mode == "")
10876 mode = "firstn";
10877
10878 if (osdmap.crush->rule_exists(name)) {
10879 // The name is uniquely associated with a ruleid and the rule it contains.
10880 // From the user's point of view, the rule name is more meaningful.
10881 ss << "rule " << name << " already exists";
10882 err = 0;
10883 goto reply;
10884 }
10885
10886 CrushWrapper newcrush;
10887 _get_pending_crush(newcrush);
10888
10889 if (newcrush.rule_exists(name)) {
10890 // The name is uniquely associated with a ruleid and the rule it contains.
10891 // From the user's point of view, the rule name is more meaningful.
10892 ss << "rule " << name << " already exists";
10893 err = 0;
10894 } else {
10895 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
10896 pg_pool_t::TYPE_REPLICATED, &ss);
10897 if (ruleno < 0) {
10898 err = ruleno;
10899 goto reply;
10900 }
10901
10902 pending_inc.crush.clear();
10903 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10904 }
10905 getline(ss, rs);
10906 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10907 get_last_committed() + 1));
10908 return true;
10909
10910 } else if (prefix == "osd crush rule create-replicated") {
10911 string name, root, type, device_class;
10912 cmd_getval(cmdmap, "name", name);
10913 cmd_getval(cmdmap, "root", root);
10914 cmd_getval(cmdmap, "type", type);
10915 cmd_getval(cmdmap, "class", device_class);
10916
10917 if (osdmap.crush->rule_exists(name)) {
10918 // The name is uniquely associated with a ruleid and the rule it contains.
10919 // From the user's point of view, the rule name is more meaningful.
10920 ss << "rule " << name << " already exists";
10921 err = 0;
10922 goto reply;
10923 }
10924
10925 CrushWrapper newcrush;
10926 _get_pending_crush(newcrush);
10927
10928 if (newcrush.rule_exists(name)) {
10929 // The name is uniquely associated with a ruleid and the rule it contains.
10930 // From the user's point of view, the rule name is more meaningful.
10931 ss << "rule " << name << " already exists";
10932 err = 0;
10933 } else {
10934 int ruleno = newcrush.add_simple_rule(
10935 name, root, type, device_class,
10936 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10937 if (ruleno < 0) {
10938 err = ruleno;
10939 goto reply;
10940 }
10941
10942 pending_inc.crush.clear();
10943 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10944 }
10945 getline(ss, rs);
10946 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10947 get_last_committed() + 1));
10948 return true;
10949
10950 } else if (prefix == "osd erasure-code-profile rm") {
10951 string name;
10952 cmd_getval(cmdmap, "name", name);
10953
10954 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10955 goto wait;
10956
10957 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10958 err = -EBUSY;
10959 goto reply;
10960 }
10961
10962 if (osdmap.has_erasure_code_profile(name) ||
10963 pending_inc.new_erasure_code_profiles.count(name)) {
10964 if (osdmap.has_erasure_code_profile(name)) {
10965 pending_inc.old_erasure_code_profiles.push_back(name);
10966 } else {
10967 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10968 pending_inc.new_erasure_code_profiles.erase(name);
10969 }
10970
10971 getline(ss, rs);
10972 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10973 get_last_committed() + 1));
10974 return true;
10975 } else {
10976 ss << "erasure-code-profile " << name << " does not exist";
10977 err = 0;
10978 goto reply;
10979 }
10980
10981 } else if (prefix == "osd erasure-code-profile set") {
10982 string name;
10983 cmd_getval(cmdmap, "name", name);
10984 vector<string> profile;
10985 cmd_getval(cmdmap, "profile", profile);
10986
10987 bool force = false;
10988 cmd_getval(cmdmap, "force", force);
10989
10990 map<string,string> profile_map;
10991 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10992 if (err)
10993 goto reply;
10994 if (auto found = profile_map.find("crush-failure-domain");
10995 found != profile_map.end()) {
10996 const auto& failure_domain = found->second;
10997 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
10998 if (failure_domain_type < 0) {
10999 ss << "erasure-code-profile " << profile_map
11000 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11001 err = -EINVAL;
11002 goto reply;
11003 }
11004 }
11005
11006 if (profile_map.find("plugin") == profile_map.end()) {
11007 ss << "erasure-code-profile " << profile_map
11008 << " must contain a plugin entry" << std::endl;
11009 err = -EINVAL;
11010 goto reply;
11011 }
11012 string plugin = profile_map["plugin"];
11013
11014 if (pending_inc.has_erasure_code_profile(name)) {
11015 dout(20) << "erasure code profile " << name << " try again" << dendl;
11016 goto wait;
11017 } else {
11018 err = normalize_profile(name, profile_map, force, &ss);
11019 if (err)
11020 goto reply;
11021
11022 if (osdmap.has_erasure_code_profile(name)) {
11023 ErasureCodeProfile existing_profile_map =
11024 osdmap.get_erasure_code_profile(name);
11025 err = normalize_profile(name, existing_profile_map, force, &ss);
11026 if (err)
11027 goto reply;
11028
11029 if (existing_profile_map == profile_map) {
11030 err = 0;
11031 goto reply;
11032 }
11033 if (!force) {
11034 err = -EPERM;
11035 ss << "will not override erasure code profile " << name
11036 << " because the existing profile "
11037 << existing_profile_map
11038 << " is different from the proposed profile "
11039 << profile_map;
11040 goto reply;
11041 }
11042 }
11043
11044 dout(20) << "erasure code profile set " << name << "="
11045 << profile_map << dendl;
11046 pending_inc.set_erasure_code_profile(name, profile_map);
11047 }
11048
11049 getline(ss, rs);
11050 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11051 get_last_committed() + 1));
11052 return true;
11053
11054 } else if (prefix == "osd crush rule create-erasure") {
11055 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11056 if (err == -EAGAIN)
11057 goto wait;
11058 if (err)
11059 goto reply;
11060 string name, poolstr;
11061 cmd_getval(cmdmap, "name", name);
11062 string profile;
11063 cmd_getval(cmdmap, "profile", profile);
11064 if (profile == "")
11065 profile = "default";
11066 if (profile == "default") {
11067 if (!osdmap.has_erasure_code_profile(profile)) {
11068 if (pending_inc.has_erasure_code_profile(profile)) {
11069 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11070 goto wait;
11071 }
11072
11073 map<string,string> profile_map;
11074 err = osdmap.get_erasure_code_profile_default(cct,
11075 profile_map,
11076 &ss);
11077 if (err)
11078 goto reply;
11079 err = normalize_profile(name, profile_map, true, &ss);
11080 if (err)
11081 goto reply;
11082 dout(20) << "erasure code profile set " << profile << "="
11083 << profile_map << dendl;
11084 pending_inc.set_erasure_code_profile(profile, profile_map);
11085 goto wait;
11086 }
11087 }
11088
11089 int rule;
11090 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11091 if (err < 0) {
11092 switch(err) {
11093 case -EEXIST: // return immediately
11094 ss << "rule " << name << " already exists";
11095 err = 0;
11096 goto reply;
11097 break;
11098 case -EALREADY: // wait for pending to be proposed
11099 ss << "rule " << name << " already exists";
11100 err = 0;
11101 break;
11102 default: // non recoverable error
11103 goto reply;
11104 break;
11105 }
11106 } else {
11107 ss << "created rule " << name << " at " << rule;
11108 }
11109
11110 getline(ss, rs);
11111 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11112 get_last_committed() + 1));
11113 return true;
11114
11115 } else if (prefix == "osd crush rule rm") {
11116 string name;
11117 cmd_getval(cmdmap, "name", name);
11118
11119 if (!osdmap.crush->rule_exists(name)) {
11120 ss << "rule " << name << " does not exist";
11121 err = 0;
11122 goto reply;
11123 }
11124
11125 CrushWrapper newcrush;
11126 _get_pending_crush(newcrush);
11127
11128 if (!newcrush.rule_exists(name)) {
11129 ss << "rule " << name << " does not exist";
11130 err = 0;
11131 } else {
11132 int ruleno = newcrush.get_rule_id(name);
11133 ceph_assert(ruleno >= 0);
11134
11135 // make sure it is not in use.
11136 // FIXME: this is ok in some situations, but let's not bother with that
11137 // complexity now.
11138 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
11139 if (osdmap.crush_rule_in_use(ruleset)) {
11140 ss << "crush ruleset " << name << " " << ruleset << " is in use";
11141 err = -EBUSY;
11142 goto reply;
11143 }
11144
11145 err = newcrush.remove_rule(ruleno);
11146 if (err < 0) {
11147 goto reply;
11148 }
11149
11150 pending_inc.crush.clear();
11151 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11152 }
11153 getline(ss, rs);
11154 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11155 get_last_committed() + 1));
11156 return true;
11157
11158 } else if (prefix == "osd crush rule rename") {
11159 string srcname;
11160 string dstname;
11161 cmd_getval(cmdmap, "srcname", srcname);
11162 cmd_getval(cmdmap, "dstname", dstname);
11163 if (srcname.empty() || dstname.empty()) {
11164 ss << "must specify both source rule name and destination rule name";
11165 err = -EINVAL;
11166 goto reply;
11167 }
11168 if (srcname == dstname) {
11169 ss << "destination rule name is equal to source rule name";
11170 err = 0;
11171 goto reply;
11172 }
11173
11174 CrushWrapper newcrush;
11175 _get_pending_crush(newcrush);
11176 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11177 // srcname does not exist and dstname already exists
11178 // suppose this is a replay and return success
11179 // (so this command is idempotent)
11180 ss << "already renamed to '" << dstname << "'";
11181 err = 0;
11182 goto reply;
11183 }
11184
11185 err = newcrush.rename_rule(srcname, dstname, &ss);
11186 if (err < 0) {
11187 // ss has reason for failure
11188 goto reply;
11189 }
11190 pending_inc.crush.clear();
11191 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11192 getline(ss, rs);
11193 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11194 get_last_committed() + 1));
11195 return true;
11196
11197 } else if (prefix == "osd setmaxosd") {
11198 int64_t newmax;
11199 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11200 ss << "unable to parse 'newmax' value '"
11201 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11202 err = -EINVAL;
11203 goto reply;
11204 }
11205
11206 if (newmax > g_conf()->mon_max_osd) {
11207 err = -ERANGE;
11208 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11209 << g_conf()->mon_max_osd << ")";
11210 goto reply;
11211 }
11212
11213 // Don't allow shrinking OSD number as this will cause data loss
11214 // and may cause kernel crashes.
11215 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11216 if (newmax < osdmap.get_max_osd()) {
11217 // Check if the OSDs exist between current max and new value.
11218 // If there are any OSDs exist, then don't allow shrinking number
11219 // of OSDs.
11220 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11221 if (osdmap.exists(i)) {
11222 err = -EBUSY;
11223 ss << "cannot shrink max_osd to " << newmax
11224 << " because osd." << i << " (and possibly others) still in use";
11225 goto reply;
11226 }
11227 }
11228 }
11229
11230 pending_inc.new_max_osd = newmax;
11231 ss << "set new max_osd = " << pending_inc.new_max_osd;
11232 getline(ss, rs);
11233 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11234 get_last_committed() + 1));
11235 return true;
11236
11237 } else if (prefix == "osd set-full-ratio" ||
11238 prefix == "osd set-backfillfull-ratio" ||
11239 prefix == "osd set-nearfull-ratio") {
11240 double n;
11241 if (!cmd_getval(cmdmap, "ratio", n)) {
11242 ss << "unable to parse 'ratio' value '"
11243 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11244 err = -EINVAL;
11245 goto reply;
11246 }
11247 if (prefix == "osd set-full-ratio")
11248 pending_inc.new_full_ratio = n;
11249 else if (prefix == "osd set-backfillfull-ratio")
11250 pending_inc.new_backfillfull_ratio = n;
11251 else if (prefix == "osd set-nearfull-ratio")
11252 pending_inc.new_nearfull_ratio = n;
11253 ss << prefix << " " << n;
11254 getline(ss, rs);
11255 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11256 get_last_committed() + 1));
11257 return true;
11258 } else if (prefix == "osd set-require-min-compat-client") {
11259 string v;
11260 cmd_getval(cmdmap, "version", v);
11261 ceph_release_t vno = ceph_release_from_name(v);
11262 if (!vno) {
11263 ss << "version " << v << " is not recognized";
11264 err = -EINVAL;
11265 goto reply;
11266 }
11267 OSDMap newmap;
11268 newmap.deepish_copy_from(osdmap);
11269 newmap.apply_incremental(pending_inc);
11270 newmap.require_min_compat_client = vno;
11271 auto mvno = newmap.get_min_compat_client();
11272 if (vno < mvno) {
11273 ss << "osdmap current utilizes features that require " << mvno
11274 << "; cannot set require_min_compat_client below that to " << vno;
11275 err = -EPERM;
11276 goto reply;
11277 }
11278 bool sure = false;
11279 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11280 if (!sure) {
11281 FeatureMap m;
11282 mon.get_combined_feature_map(&m);
11283 uint64_t features = ceph_release_features(to_integer<int>(vno));
11284 bool first = true;
11285 bool ok = true;
11286 for (int type : {
11287 CEPH_ENTITY_TYPE_CLIENT,
11288 CEPH_ENTITY_TYPE_MDS,
11289 CEPH_ENTITY_TYPE_MGR }) {
11290 auto p = m.m.find(type);
11291 if (p == m.m.end()) {
11292 continue;
11293 }
11294 for (auto& q : p->second) {
11295 uint64_t missing = ~q.first & features;
11296 if (missing) {
11297 if (first) {
11298 ss << "cannot set require_min_compat_client to " << v << ": ";
11299 } else {
11300 ss << "; ";
11301 }
11302 first = false;
11303 ss << q.second << " connected " << ceph_entity_type_name(type)
11304 << "(s) look like " << ceph_release_name(
11305 ceph_release_from_features(q.first))
11306 << " (missing 0x" << std::hex << missing << std::dec << ")";
11307 ok = false;
11308 }
11309 }
11310 }
11311 if (!ok) {
11312 ss << "; add --yes-i-really-mean-it to do it anyway";
11313 err = -EPERM;
11314 goto reply;
11315 }
11316 }
11317 ss << "set require_min_compat_client to " << vno;
11318 pending_inc.new_require_min_compat_client = vno;
11319 getline(ss, rs);
11320 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11321 get_last_committed() + 1));
11322 return true;
11323 } else if (prefix == "osd pause") {
11324 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11325
11326 } else if (prefix == "osd unpause") {
11327 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11328
11329 } else if (prefix == "osd set") {
11330 bool sure = false;
11331 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11332
11333 string key;
11334 cmd_getval(cmdmap, "key", key);
11335 if (key == "pause")
11336 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11337 else if (key == "noup")
11338 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11339 else if (key == "nodown")
11340 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11341 else if (key == "noout")
11342 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11343 else if (key == "noin")
11344 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11345 else if (key == "nobackfill")
11346 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11347 else if (key == "norebalance")
11348 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11349 else if (key == "norecover")
11350 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11351 else if (key == "noscrub")
11352 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11353 else if (key == "nodeep-scrub")
11354 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11355 else if (key == "notieragent")
11356 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11357 else if (key == "nosnaptrim")
11358 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11359 else if (key == "pglog_hardlimit") {
11360 if (!osdmap.get_num_up_osds() && !sure) {
11361 ss << "Not advisable to continue since no OSDs are up. Pass "
11362 << "--yes-i-really-mean-it if you really wish to continue.";
11363 err = -EPERM;
11364 goto reply;
11365 }
11366 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11367 // we are reusing a jewel feature bit that was retired in luminous.
11368 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11369 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11370 || sure)) {
11371 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11372 } else {
11373 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11374 err = -EPERM;
11375 goto reply;
11376 }
11377 } else {
11378 ss << "unrecognized flag '" << key << "'";
11379 err = -EINVAL;
11380 }
11381
11382 } else if (prefix == "osd unset") {
11383 string key;
11384 cmd_getval(cmdmap, "key", key);
11385 if (key == "pause")
11386 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11387 else if (key == "noup")
11388 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11389 else if (key == "nodown")
11390 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11391 else if (key == "noout")
11392 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11393 else if (key == "noin")
11394 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11395 else if (key == "nobackfill")
11396 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11397 else if (key == "norebalance")
11398 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11399 else if (key == "norecover")
11400 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11401 else if (key == "noscrub")
11402 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11403 else if (key == "nodeep-scrub")
11404 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11405 else if (key == "notieragent")
11406 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11407 else if (key == "nosnaptrim")
11408 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11409 else {
11410 ss << "unrecognized flag '" << key << "'";
11411 err = -EINVAL;
11412 }
11413
11414 } else if (prefix == "osd require-osd-release") {
11415 string release;
11416 cmd_getval(cmdmap, "release", release);
11417 bool sure = false;
11418 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11419 ceph_release_t rel = ceph_release_from_name(release.c_str());
11420 if (!rel) {
11421 ss << "unrecognized release " << release;
11422 err = -EINVAL;
11423 goto reply;
11424 }
11425 if (rel == osdmap.require_osd_release) {
11426 // idempotent
11427 err = 0;
11428 goto reply;
11429 }
11430 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
11431 if (!osdmap.get_num_up_osds() && !sure) {
11432 ss << "Not advisable to continue since no OSDs are up. Pass "
11433 << "--yes-i-really-mean-it if you really wish to continue.";
11434 err = -EPERM;
11435 goto reply;
11436 }
11437 if (rel == ceph_release_t::mimic) {
11438 if (!mon.monmap->get_required_features().contains_all(
11439 ceph::features::mon::FEATURE_MIMIC)) {
11440 ss << "not all mons are mimic";
11441 err = -EPERM;
11442 goto reply;
11443 }
11444 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
11445 && !sure) {
11446 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11447 err = -EPERM;
11448 goto reply;
11449 }
11450 } else if (rel == ceph_release_t::nautilus) {
11451 if (!mon.monmap->get_required_features().contains_all(
11452 ceph::features::mon::FEATURE_NAUTILUS)) {
11453 ss << "not all mons are nautilus";
11454 err = -EPERM;
11455 goto reply;
11456 }
11457 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
11458 && !sure) {
11459 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11460 err = -EPERM;
11461 goto reply;
11462 }
11463 } else if (rel == ceph_release_t::octopus) {
11464 if (!mon.monmap->get_required_features().contains_all(
11465 ceph::features::mon::FEATURE_OCTOPUS)) {
11466 ss << "not all mons are octopus";
11467 err = -EPERM;
11468 goto reply;
11469 }
11470 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11471 && !sure) {
11472 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11473 err = -EPERM;
11474 goto reply;
11475 }
11476 } else if (rel == ceph_release_t::pacific) {
11477 if (!mon.monmap->get_required_features().contains_all(
11478 ceph::features::mon::FEATURE_PACIFIC)) {
11479 ss << "not all mons are pacific";
11480 err = -EPERM;
11481 goto reply;
11482 }
11483 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11484 && !sure) {
11485 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11486 err = -EPERM;
11487 goto reply;
11488 }
11489 } else {
11490 ss << "not supported for this release yet";
11491 err = -EPERM;
11492 goto reply;
11493 }
11494 if (rel < osdmap.require_osd_release) {
11495 ss << "require_osd_release cannot be lowered once it has been set";
11496 err = -EPERM;
11497 goto reply;
11498 }
11499 pending_inc.new_require_osd_release = rel;
11500 goto update;
11501 } else if (prefix == "osd down" ||
11502 prefix == "osd out" ||
11503 prefix == "osd in" ||
11504 prefix == "osd rm" ||
11505 prefix == "osd stop") {
11506
11507 bool any = false;
11508 bool stop = false;
11509 bool verbose = true;
11510 bool definitely_dead = false;
11511
11512 vector<string> idvec;
11513 cmd_getval(cmdmap, "ids", idvec);
11514 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11515 derr << "definitely_dead " << (int)definitely_dead << dendl;
11516 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11517 set<int> osds;
11518
11519 // wildcard?
11520 if (j == 0 &&
11521 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11522 if (prefix == "osd in") {
11523 // touch out osds only
11524 osdmap.get_out_existing_osds(osds);
11525 } else {
11526 osdmap.get_all_osds(osds);
11527 }
11528 stop = true;
11529 verbose = false; // so the output is less noisy.
11530 } else {
11531 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11532 if (osd < 0) {
11533 ss << "invalid osd id" << osd;
11534 err = -EINVAL;
11535 continue;
11536 } else if (!osdmap.exists(osd)) {
11537 ss << "osd." << osd << " does not exist. ";
11538 continue;
11539 }
11540
11541 osds.insert(osd);
11542 }
11543
11544 for (auto &osd : osds) {
11545 if (prefix == "osd down") {
11546 if (osdmap.is_down(osd)) {
11547 if (verbose)
11548 ss << "osd." << osd << " is already down. ";
11549 } else {
11550 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11551 ss << "marked down osd." << osd << ". ";
11552 any = true;
11553 }
11554 if (definitely_dead) {
11555 if (!pending_inc.new_xinfo.count(osd)) {
11556 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11557 }
11558 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11559 any = true;
11560 }
11561 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11562 }
11563 } else if (prefix == "osd out") {
11564 if (osdmap.is_out(osd)) {
11565 if (verbose)
11566 ss << "osd." << osd << " is already out. ";
11567 } else {
11568 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11569 if (osdmap.osd_weight[osd]) {
11570 if (pending_inc.new_xinfo.count(osd) == 0) {
11571 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11572 }
11573 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11574 }
11575 ss << "marked out osd." << osd << ". ";
11576 std::ostringstream msg;
11577 msg << "Client " << op->get_session()->entity_name
11578 << " marked osd." << osd << " out";
11579 if (osdmap.is_up(osd)) {
11580 msg << ", while it was still marked up";
11581 } else {
11582 auto period = ceph_clock_now() - down_pending_out[osd];
11583 msg << ", after it was down for " << int(period.sec())
11584 << " seconds";
11585 }
11586
11587 mon.clog->info() << msg.str();
11588 any = true;
11589 }
11590 } else if (prefix == "osd in") {
11591 if (osdmap.is_in(osd)) {
11592 if (verbose)
11593 ss << "osd." << osd << " is already in. ";
11594 } else {
11595 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11596 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11597 if (pending_inc.new_xinfo.count(osd) == 0) {
11598 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11599 }
11600 pending_inc.new_xinfo[osd].old_weight = 0;
11601 } else {
11602 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11603 }
11604 ss << "marked in osd." << osd << ". ";
11605 any = true;
11606 }
11607 } else if (prefix == "osd rm") {
11608 err = prepare_command_osd_remove(osd);
11609
11610 if (err == -EBUSY) {
11611 if (any)
11612 ss << ", ";
11613 ss << "osd." << osd << " is still up; must be down before removal. ";
11614 } else {
11615 ceph_assert(err == 0);
11616 if (any) {
11617 ss << ", osd." << osd;
11618 } else {
11619 ss << "removed osd." << osd;
11620 }
11621 any = true;
11622 }
11623 } else if (prefix == "osd stop") {
11624 if (osdmap.is_stop(osd)) {
11625 if (verbose)
11626 ss << "osd." << osd << " is already stopped. ";
11627 } else if (osdmap.is_down(osd)) {
11628 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11629 ss << "stop down osd." << osd << ". ";
11630 any = true;
11631 } else {
11632 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11633 ss << "stop osd." << osd << ". ";
11634 any = true;
11635 }
11636 }
11637 }
11638 }
11639 if (any) {
11640 getline(ss, rs);
11641 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11642 get_last_committed() + 1));
11643 return true;
11644 }
11645 } else if (prefix == "osd set-group" ||
11646 prefix == "osd unset-group" ||
11647 prefix == "osd add-noup" ||
11648 prefix == "osd add-nodown" ||
11649 prefix == "osd add-noin" ||
11650 prefix == "osd add-noout" ||
11651 prefix == "osd rm-noup" ||
11652 prefix == "osd rm-nodown" ||
11653 prefix == "osd rm-noin" ||
11654 prefix == "osd rm-noout") {
11655 bool do_set = prefix == "osd set-group" ||
11656 prefix.find("add") != string::npos;
11657 string flag_str;
11658 unsigned flags = 0;
11659 vector<string> who;
11660 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11661 cmd_getval(cmdmap, "flags", flag_str);
11662 cmd_getval(cmdmap, "who", who);
11663 vector<string> raw_flags;
11664 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11665 for (auto& f : raw_flags) {
11666 if (f == "noup")
11667 flags |= CEPH_OSD_NOUP;
11668 else if (f == "nodown")
11669 flags |= CEPH_OSD_NODOWN;
11670 else if (f == "noin")
11671 flags |= CEPH_OSD_NOIN;
11672 else if (f == "noout")
11673 flags |= CEPH_OSD_NOOUT;
11674 else {
11675 ss << "unrecognized flag '" << f << "', must be one of "
11676 << "{noup,nodown,noin,noout}";
11677 err = -EINVAL;
11678 goto reply;
11679 }
11680 }
11681 } else {
11682 cmd_getval(cmdmap, "ids", who);
11683 if (prefix.find("noup") != string::npos)
11684 flags = CEPH_OSD_NOUP;
11685 else if (prefix.find("nodown") != string::npos)
11686 flags = CEPH_OSD_NODOWN;
11687 else if (prefix.find("noin") != string::npos)
11688 flags = CEPH_OSD_NOIN;
11689 else if (prefix.find("noout") != string::npos)
11690 flags = CEPH_OSD_NOOUT;
11691 else
11692 ceph_assert(0 == "Unreachable!");
11693 }
11694 if (flags == 0) {
11695 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11696 err = -EINVAL;
11697 goto reply;
11698 }
11699 if (who.empty()) {
11700 ss << "must specify at least one or more targets to set/unset";
11701 err = -EINVAL;
11702 goto reply;
11703 }
11704 set<int> osds;
11705 set<int> crush_nodes;
11706 set<int> device_classes;
11707 for (auto& w : who) {
11708 if (w == "any" || w == "all" || w == "*") {
11709 osdmap.get_all_osds(osds);
11710 break;
11711 }
11712 std::stringstream ts;
11713 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11714 osds.insert(osd);
11715 } else if (osdmap.crush->name_exists(w)) {
11716 crush_nodes.insert(osdmap.crush->get_item_id(w));
11717 } else if (osdmap.crush->class_exists(w)) {
11718 device_classes.insert(osdmap.crush->get_class_id(w));
11719 } else {
11720 ss << "unable to parse osd id or crush node or device class: "
11721 << "\"" << w << "\". ";
11722 }
11723 }
11724 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11725 // ss has reason for failure
11726 err = -EINVAL;
11727 goto reply;
11728 }
11729 bool any = false;
11730 for (auto osd : osds) {
11731 if (!osdmap.exists(osd)) {
11732 ss << "osd." << osd << " does not exist. ";
11733 continue;
11734 }
11735 if (do_set) {
11736 if (flags & CEPH_OSD_NOUP) {
11737 any |= osdmap.is_noup_by_osd(osd) ?
11738 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11739 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11740 }
11741 if (flags & CEPH_OSD_NODOWN) {
11742 any |= osdmap.is_nodown_by_osd(osd) ?
11743 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11744 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11745 }
11746 if (flags & CEPH_OSD_NOIN) {
11747 any |= osdmap.is_noin_by_osd(osd) ?
11748 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11749 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11750 }
11751 if (flags & CEPH_OSD_NOOUT) {
11752 any |= osdmap.is_noout_by_osd(osd) ?
11753 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11754 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11755 }
11756 } else {
11757 if (flags & CEPH_OSD_NOUP) {
11758 any |= osdmap.is_noup_by_osd(osd) ?
11759 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11760 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11761 }
11762 if (flags & CEPH_OSD_NODOWN) {
11763 any |= osdmap.is_nodown_by_osd(osd) ?
11764 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11765 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11766 }
11767 if (flags & CEPH_OSD_NOIN) {
11768 any |= osdmap.is_noin_by_osd(osd) ?
11769 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11770 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11771 }
11772 if (flags & CEPH_OSD_NOOUT) {
11773 any |= osdmap.is_noout_by_osd(osd) ?
11774 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11775 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11776 }
11777 }
11778 }
11779 for (auto& id : crush_nodes) {
11780 auto old_flags = osdmap.get_crush_node_flags(id);
11781 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11782 pending_flags |= old_flags; // adopt existing flags first!
11783 if (do_set) {
11784 pending_flags |= flags;
11785 } else {
11786 pending_flags &= ~flags;
11787 }
11788 any = true;
11789 }
11790 for (auto& id : device_classes) {
11791 auto old_flags = osdmap.get_device_class_flags(id);
11792 auto& pending_flags = pending_inc.new_device_class_flags[id];
11793 pending_flags |= old_flags;
11794 if (do_set) {
11795 pending_flags |= flags;
11796 } else {
11797 pending_flags &= ~flags;
11798 }
11799 any = true;
11800 }
11801 if (any) {
11802 getline(ss, rs);
11803 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11804 get_last_committed() + 1));
11805 return true;
11806 }
11807 } else if (prefix == "osd pg-temp") {
11808 string pgidstr;
11809 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11810 ss << "unable to parse 'pgid' value '"
11811 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11812 err = -EINVAL;
11813 goto reply;
11814 }
11815 pg_t pgid;
11816 if (!pgid.parse(pgidstr.c_str())) {
11817 ss << "invalid pgid '" << pgidstr << "'";
11818 err = -EINVAL;
11819 goto reply;
11820 }
11821 if (!osdmap.pg_exists(pgid)) {
11822 ss << "pg " << pgid << " does not exist";
11823 err = -ENOENT;
11824 goto reply;
11825 }
11826 if (pending_inc.new_pg_temp.count(pgid)) {
11827 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11828 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11829 return true;
11830 }
11831
11832 vector<int64_t> id_vec;
11833 vector<int32_t> new_pg_temp;
11834 cmd_getval(cmdmap, "id", id_vec);
11835 if (id_vec.empty()) {
11836 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11837 ss << "done cleaning up pg_temp of " << pgid;
11838 goto update;
11839 }
11840 for (auto osd : id_vec) {
11841 if (!osdmap.exists(osd)) {
11842 ss << "osd." << osd << " does not exist";
11843 err = -ENOENT;
11844 goto reply;
11845 }
11846 new_pg_temp.push_back(osd);
11847 }
11848
11849 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11850 if ((int)new_pg_temp.size() < pool_min_size) {
11851 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11852 << pool_min_size << ")";
11853 err = -EINVAL;
11854 goto reply;
11855 }
11856
11857 int pool_size = osdmap.get_pg_pool_size(pgid);
11858 if ((int)new_pg_temp.size() > pool_size) {
11859 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11860 << pool_size << ")";
11861 err = -EINVAL;
11862 goto reply;
11863 }
11864
11865 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11866 new_pg_temp.begin(), new_pg_temp.end());
11867 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11868 goto update;
11869 } else if (prefix == "osd primary-temp") {
11870 string pgidstr;
11871 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11872 ss << "unable to parse 'pgid' value '"
11873 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11874 err = -EINVAL;
11875 goto reply;
11876 }
11877 pg_t pgid;
11878 if (!pgid.parse(pgidstr.c_str())) {
11879 ss << "invalid pgid '" << pgidstr << "'";
11880 err = -EINVAL;
11881 goto reply;
11882 }
11883 if (!osdmap.pg_exists(pgid)) {
11884 ss << "pg " << pgid << " does not exist";
11885 err = -ENOENT;
11886 goto reply;
11887 }
11888
11889 int64_t osd;
11890 if (!cmd_getval(cmdmap, "id", osd)) {
11891 ss << "unable to parse 'id' value '"
11892 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11893 err = -EINVAL;
11894 goto reply;
11895 }
11896 if (osd != -1 && !osdmap.exists(osd)) {
11897 ss << "osd." << osd << " does not exist";
11898 err = -ENOENT;
11899 goto reply;
11900 }
11901
11902 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11903 osdmap.require_min_compat_client < ceph_release_t::firefly) {
11904 ss << "require_min_compat_client "
11905 << osdmap.require_min_compat_client
11906 << " < firefly, which is required for primary-temp";
11907 err = -EPERM;
11908 goto reply;
11909 }
11910
11911 pending_inc.new_primary_temp[pgid] = osd;
11912 ss << "set " << pgid << " primary_temp mapping to " << osd;
11913 goto update;
11914 } else if (prefix == "pg repeer") {
11915 pg_t pgid;
11916 string pgidstr;
11917 cmd_getval(cmdmap, "pgid", pgidstr);
11918 if (!pgid.parse(pgidstr.c_str())) {
11919 ss << "invalid pgid '" << pgidstr << "'";
11920 err = -EINVAL;
11921 goto reply;
11922 }
11923 if (!osdmap.pg_exists(pgid)) {
11924 ss << "pg '" << pgidstr << "' does not exist";
11925 err = -ENOENT;
11926 goto reply;
11927 }
11928 vector<int> acting;
11929 int primary;
11930 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
11931 if (primary < 0) {
11932 err = -EAGAIN;
11933 ss << "pg currently has no primary";
11934 goto reply;
11935 }
11936 if (acting.size() > 1) {
11937 // map to just primary; it will map back to what it wants
11938 pending_inc.new_pg_temp[pgid] = { primary };
11939 } else {
11940 // hmm, pick another arbitrary osd to induce a change. Note
11941 // that this won't work if there is only one suitable OSD in the cluster.
11942 int i;
11943 bool done = false;
11944 for (i = 0; i < osdmap.get_max_osd(); ++i) {
11945 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
11946 continue;
11947 }
11948 pending_inc.new_pg_temp[pgid] = { primary, i };
11949 done = true;
11950 break;
11951 }
11952 if (!done) {
11953 err = -EAGAIN;
11954 ss << "not enough up OSDs in the cluster to force repeer";
11955 goto reply;
11956 }
11957 }
11958 goto update;
11959 } else if (prefix == "osd pg-upmap" ||
11960 prefix == "osd rm-pg-upmap" ||
11961 prefix == "osd pg-upmap-items" ||
11962 prefix == "osd rm-pg-upmap-items") {
11963 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
11964 ss << "min_compat_client "
11965 << osdmap.require_min_compat_client
11966 << " < luminous, which is required for pg-upmap. "
11967 << "Try 'ceph osd set-require-min-compat-client luminous' "
11968 << "before using the new interface";
11969 err = -EPERM;
11970 goto reply;
11971 }
11972 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11973 if (err == -EAGAIN)
11974 goto wait;
11975 if (err < 0)
11976 goto reply;
11977 string pgidstr;
11978 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11979 ss << "unable to parse 'pgid' value '"
11980 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11981 err = -EINVAL;
11982 goto reply;
11983 }
11984 pg_t pgid;
11985 if (!pgid.parse(pgidstr.c_str())) {
11986 ss << "invalid pgid '" << pgidstr << "'";
11987 err = -EINVAL;
11988 goto reply;
11989 }
11990 if (!osdmap.pg_exists(pgid)) {
11991 ss << "pg " << pgid << " does not exist";
11992 err = -ENOENT;
11993 goto reply;
11994 }
11995 if (pending_inc.old_pools.count(pgid.pool())) {
11996 ss << "pool of " << pgid << " is pending removal";
11997 err = -ENOENT;
11998 getline(ss, rs);
11999 wait_for_finished_proposal(op,
12000 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
12001 return true;
12002 }
12003
12004 enum {
12005 OP_PG_UPMAP,
12006 OP_RM_PG_UPMAP,
12007 OP_PG_UPMAP_ITEMS,
12008 OP_RM_PG_UPMAP_ITEMS,
12009 } option;
12010
12011 if (prefix == "osd pg-upmap") {
12012 option = OP_PG_UPMAP;
12013 } else if (prefix == "osd rm-pg-upmap") {
12014 option = OP_RM_PG_UPMAP;
12015 } else if (prefix == "osd pg-upmap-items") {
12016 option = OP_PG_UPMAP_ITEMS;
12017 } else {
12018 option = OP_RM_PG_UPMAP_ITEMS;
12019 }
12020
12021 // check pending upmap changes
12022 switch (option) {
12023 case OP_PG_UPMAP: // fall through
12024 case OP_RM_PG_UPMAP:
12025 if (pending_inc.new_pg_upmap.count(pgid) ||
12026 pending_inc.old_pg_upmap.count(pgid)) {
12027 dout(10) << __func__ << " waiting for pending update on "
12028 << pgid << dendl;
12029 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12030 return true;
12031 }
12032 break;
12033
12034 case OP_PG_UPMAP_ITEMS: // fall through
12035 case OP_RM_PG_UPMAP_ITEMS:
12036 if (pending_inc.new_pg_upmap_items.count(pgid) ||
12037 pending_inc.old_pg_upmap_items.count(pgid)) {
12038 dout(10) << __func__ << " waiting for pending update on "
12039 << pgid << dendl;
12040 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12041 return true;
12042 }
12043 break;
12044
12045 default:
12046 ceph_abort_msg("invalid option");
12047 }
12048
12049 switch (option) {
12050 case OP_PG_UPMAP:
12051 {
12052 vector<int64_t> id_vec;
12053 if (!cmd_getval(cmdmap, "id", id_vec)) {
12054 ss << "unable to parse 'id' value(s) '"
12055 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12056 err = -EINVAL;
12057 goto reply;
12058 }
12059
12060 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12061 if ((int)id_vec.size() < pool_min_size) {
12062 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
12063 << pool_min_size << ")";
12064 err = -EINVAL;
12065 goto reply;
12066 }
12067
12068 int pool_size = osdmap.get_pg_pool_size(pgid);
12069 if ((int)id_vec.size() > pool_size) {
12070 ss << "num of osds (" << id_vec.size() <<") > pool size ("
12071 << pool_size << ")";
12072 err = -EINVAL;
12073 goto reply;
12074 }
12075
12076 vector<int32_t> new_pg_upmap;
12077 for (auto osd : id_vec) {
12078 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
12079 ss << "osd." << osd << " does not exist";
12080 err = -ENOENT;
12081 goto reply;
12082 }
12083 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
12084 if (it != new_pg_upmap.end()) {
12085 ss << "osd." << osd << " already exists, ";
12086 continue;
12087 }
12088 new_pg_upmap.push_back(osd);
12089 }
12090
12091 if (new_pg_upmap.empty()) {
12092 ss << "no valid upmap items(pairs) is specified";
12093 err = -EINVAL;
12094 goto reply;
12095 }
12096
12097 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
12098 new_pg_upmap.begin(), new_pg_upmap.end());
12099 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
12100 }
12101 break;
12102
12103 case OP_RM_PG_UPMAP:
12104 {
12105 pending_inc.old_pg_upmap.insert(pgid);
12106 ss << "clear " << pgid << " pg_upmap mapping";
12107 }
12108 break;
12109
12110 case OP_PG_UPMAP_ITEMS:
12111 {
12112 vector<int64_t> id_vec;
12113 if (!cmd_getval(cmdmap, "id", id_vec)) {
12114 ss << "unable to parse 'id' value(s) '"
12115 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12116 err = -EINVAL;
12117 goto reply;
12118 }
12119
12120 if (id_vec.size() % 2) {
12121 ss << "you must specify pairs of osd ids to be remapped";
12122 err = -EINVAL;
12123 goto reply;
12124 }
12125
12126 int pool_size = osdmap.get_pg_pool_size(pgid);
12127 if ((int)(id_vec.size() / 2) > pool_size) {
12128 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
12129 << pool_size << ")";
12130 err = -EINVAL;
12131 goto reply;
12132 }
12133
12134 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
12135 ostringstream items;
12136 items << "[";
12137 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
12138 int from = *p++;
12139 int to = *p;
12140 if (from == to) {
12141 ss << "from osd." << from << " == to osd." << to << ", ";
12142 continue;
12143 }
12144 if (!osdmap.exists(from)) {
12145 ss << "osd." << from << " does not exist";
12146 err = -ENOENT;
12147 goto reply;
12148 }
12149 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
12150 ss << "osd." << to << " does not exist";
12151 err = -ENOENT;
12152 goto reply;
12153 }
12154 pair<int32_t,int32_t> entry = make_pair(from, to);
12155 auto it = std::find(new_pg_upmap_items.begin(),
12156 new_pg_upmap_items.end(), entry);
12157 if (it != new_pg_upmap_items.end()) {
12158 ss << "osd." << from << " -> osd." << to << " already exists, ";
12159 continue;
12160 }
12161 new_pg_upmap_items.push_back(entry);
12162 items << from << "->" << to << ",";
12163 }
12164 string out(items.str());
12165 out.resize(out.size() - 1); // drop last ','
12166 out += "]";
12167
12168 if (new_pg_upmap_items.empty()) {
12169 ss << "no valid upmap items(pairs) is specified";
12170 err = -EINVAL;
12171 goto reply;
12172 }
12173
12174 pending_inc.new_pg_upmap_items[pgid] =
12175 mempool::osdmap::vector<pair<int32_t,int32_t>>(
12176 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
12177 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
12178 }
12179 break;
12180
12181 case OP_RM_PG_UPMAP_ITEMS:
12182 {
12183 pending_inc.old_pg_upmap_items.insert(pgid);
12184 ss << "clear " << pgid << " pg_upmap_items mapping";
12185 }
12186 break;
12187
12188 default:
12189 ceph_abort_msg("invalid option");
12190 }
12191
12192 goto update;
12193 } else if (prefix == "osd primary-affinity") {
12194 int64_t id;
12195 if (!cmd_getval(cmdmap, "id", id)) {
12196 ss << "invalid osd id value '"
12197 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12198 err = -EINVAL;
12199 goto reply;
12200 }
12201 double w;
12202 if (!cmd_getval(cmdmap, "weight", w)) {
12203 ss << "unable to parse 'weight' value '"
12204 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12205 err = -EINVAL;
12206 goto reply;
12207 }
12208 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12209 if (ww < 0L) {
12210 ss << "weight must be >= 0";
12211 err = -EINVAL;
12212 goto reply;
12213 }
12214 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12215 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12216 ss << "require_min_compat_client "
12217 << osdmap.require_min_compat_client
12218 << " < firefly, which is required for primary-affinity";
12219 err = -EPERM;
12220 goto reply;
12221 }
12222 if (osdmap.exists(id)) {
12223 pending_inc.new_primary_affinity[id] = ww;
12224 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12225 getline(ss, rs);
12226 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12227 get_last_committed() + 1));
12228 return true;
12229 } else {
12230 ss << "osd." << id << " does not exist";
12231 err = -ENOENT;
12232 goto reply;
12233 }
12234 } else if (prefix == "osd reweight") {
12235 int64_t id;
12236 if (!cmd_getval(cmdmap, "id", id)) {
12237 ss << "unable to parse osd id value '"
12238 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12239 err = -EINVAL;
12240 goto reply;
12241 }
12242 double w;
12243 if (!cmd_getval(cmdmap, "weight", w)) {
12244 ss << "unable to parse weight value '"
12245 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12246 err = -EINVAL;
12247 goto reply;
12248 }
12249 long ww = (int)((double)CEPH_OSD_IN*w);
12250 if (ww < 0L) {
12251 ss << "weight must be >= 0";
12252 err = -EINVAL;
12253 goto reply;
12254 }
12255 if (osdmap.exists(id)) {
12256 pending_inc.new_weight[id] = ww;
12257 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12258 getline(ss, rs);
12259 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12260 get_last_committed() + 1));
12261 return true;
12262 } else {
12263 ss << "osd." << id << " does not exist";
12264 err = -ENOENT;
12265 goto reply;
12266 }
12267 } else if (prefix == "osd reweightn") {
12268 map<int32_t, uint32_t> weights;
12269 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12270 if (err) {
12271 ss << "unable to parse 'weights' value '"
12272 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12273 goto reply;
12274 }
12275 pending_inc.new_weight.insert(weights.begin(), weights.end());
12276 wait_for_finished_proposal(
12277 op,
12278 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12279 return true;
12280 } else if (prefix == "osd lost") {
12281 int64_t id;
12282 if (!cmd_getval(cmdmap, "id", id)) {
12283 ss << "unable to parse osd id value '"
12284 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12285 err = -EINVAL;
12286 goto reply;
12287 }
12288 bool sure = false;
12289 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12290 if (!sure) {
12291 ss << "are you SURE? this might mean real, permanent data loss. pass "
12292 "--yes-i-really-mean-it if you really do.";
12293 err = -EPERM;
12294 goto reply;
12295 } else if (!osdmap.exists(id)) {
12296 ss << "osd." << id << " does not exist";
12297 err = -ENOENT;
12298 goto reply;
12299 } else if (!osdmap.is_down(id)) {
12300 ss << "osd." << id << " is not down";
12301 err = -EBUSY;
12302 goto reply;
12303 } else {
12304 epoch_t e = osdmap.get_info(id).down_at;
12305 pending_inc.new_lost[id] = e;
12306 ss << "marked osd lost in epoch " << e;
12307 getline(ss, rs);
12308 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12309 get_last_committed() + 1));
12310 return true;
12311 }
12312
12313 } else if (prefix == "osd destroy-actual" ||
12314 prefix == "osd purge-actual" ||
12315 prefix == "osd purge-new") {
12316 /* Destroying an OSD means that we don't expect to further make use of
12317 * the OSDs data (which may even become unreadable after this operation),
12318 * and that we are okay with scrubbing all its cephx keys and config-key
12319 * data (which may include lockbox keys, thus rendering the osd's data
12320 * unreadable).
12321 *
12322 * The OSD will not be removed. Instead, we will mark it as destroyed,
12323 * such that a subsequent call to `create` will not reuse the osd id.
12324 * This will play into being able to recreate the OSD, at the same
12325 * crush location, with minimal data movement.
12326 */
12327
12328 // make sure authmon is writeable.
12329 if (!mon.authmon()->is_writeable()) {
12330 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12331 << "osd destroy" << dendl;
12332 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12333 return false;
12334 }
12335
12336 int64_t id;
12337 if (!cmd_getval(cmdmap, "id", id)) {
12338 auto p = cmdmap.find("id");
12339 if (p == cmdmap.end()) {
12340 ss << "no osd id specified";
12341 } else {
12342 ss << "unable to parse osd id value '"
12343 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12344 }
12345 err = -EINVAL;
12346 goto reply;
12347 }
12348
12349 bool is_destroy = (prefix == "osd destroy-actual");
12350 if (!is_destroy) {
12351 ceph_assert("osd purge-actual" == prefix ||
12352 "osd purge-new" == prefix);
12353 }
12354
12355 bool sure = false;
12356 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12357 if (!sure) {
12358 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12359 << "This will mean real, permanent data loss, as well "
12360 << "as deletion of cephx and lockbox keys. "
12361 << "Pass --yes-i-really-mean-it if you really do.";
12362 err = -EPERM;
12363 goto reply;
12364 } else if (!osdmap.exists(id)) {
12365 ss << "osd." << id << " does not exist";
12366 err = 0; // idempotent
12367 goto reply;
12368 } else if (osdmap.is_up(id)) {
12369 ss << "osd." << id << " is not `down`.";
12370 err = -EBUSY;
12371 goto reply;
12372 } else if (is_destroy && osdmap.is_destroyed(id)) {
12373 ss << "destroyed osd." << id;
12374 err = 0;
12375 goto reply;
12376 }
12377
12378 if (prefix == "osd purge-new" &&
12379 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12380 ss << "osd." << id << " is not new";
12381 err = -EPERM;
12382 goto reply;
12383 }
12384
12385 bool goto_reply = false;
12386
12387 paxos.plug();
12388 if (is_destroy) {
12389 err = prepare_command_osd_destroy(id, ss);
12390 // we checked above that it should exist.
12391 ceph_assert(err != -ENOENT);
12392 } else {
12393 err = prepare_command_osd_purge(id, ss);
12394 if (err == -ENOENT) {
12395 err = 0;
12396 ss << "osd." << id << " does not exist.";
12397 goto_reply = true;
12398 }
12399 }
12400 paxos.unplug();
12401
12402 if (err < 0 || goto_reply) {
12403 goto reply;
12404 }
12405
12406 if (is_destroy) {
12407 ss << "destroyed osd." << id;
12408 } else {
12409 ss << "purged osd." << id;
12410 }
12411
12412 getline(ss, rs);
12413 wait_for_finished_proposal(op,
12414 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12415 force_immediate_propose();
12416 return true;
12417
12418 } else if (prefix == "osd new") {
12419
12420 // make sure authmon is writeable.
12421 if (!mon.authmon()->is_writeable()) {
12422 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12423 << "osd new" << dendl;
12424 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12425 return false;
12426 }
12427
12428 map<string,string> param_map;
12429
12430 bufferlist bl = m->get_data();
12431 string param_json = bl.to_str();
12432 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12433
12434 err = get_json_str_map(param_json, ss, &param_map);
12435 if (err < 0)
12436 goto reply;
12437
12438 dout(20) << __func__ << " osd new params " << param_map << dendl;
12439
12440 paxos.plug();
12441 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12442 paxos.unplug();
12443
12444 if (err < 0) {
12445 goto reply;
12446 }
12447
12448 if (f) {
12449 f->flush(rdata);
12450 } else {
12451 rdata.append(ss);
12452 }
12453
12454 if (err == EEXIST) {
12455 // idempotent operation
12456 err = 0;
12457 goto reply;
12458 }
12459
12460 wait_for_finished_proposal(op,
12461 new Monitor::C_Command(mon, op, 0, rs, rdata,
12462 get_last_committed() + 1));
12463 force_immediate_propose();
12464 return true;
12465
12466 } else if (prefix == "osd create") {
12467
12468 // optional id provided?
12469 int64_t id = -1, cmd_id = -1;
12470 if (cmd_getval(cmdmap, "id", cmd_id)) {
12471 if (cmd_id < 0) {
12472 ss << "invalid osd id value '" << cmd_id << "'";
12473 err = -EINVAL;
12474 goto reply;
12475 }
12476 dout(10) << " osd create got id " << cmd_id << dendl;
12477 }
12478
12479 uuid_d uuid;
12480 string uuidstr;
12481 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12482 if (!uuid.parse(uuidstr.c_str())) {
12483 ss << "invalid uuid value '" << uuidstr << "'";
12484 err = -EINVAL;
12485 goto reply;
12486 }
12487 // we only care about the id if we also have the uuid, to
12488 // ensure the operation's idempotency.
12489 id = cmd_id;
12490 }
12491
12492 int32_t new_id = -1;
12493 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12494 if (err < 0) {
12495 if (err == -EAGAIN) {
12496 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12497 return true;
12498 }
12499 // a check has failed; reply to the user.
12500 goto reply;
12501
12502 } else if (err == EEXIST) {
12503 // this is an idempotent operation; we can go ahead and reply.
12504 if (f) {
12505 f->open_object_section("created_osd");
12506 f->dump_int("osdid", new_id);
12507 f->close_section();
12508 f->flush(rdata);
12509 } else {
12510 ss << new_id;
12511 rdata.append(ss);
12512 }
12513 err = 0;
12514 goto reply;
12515 }
12516
12517 string empty_device_class;
12518 do_osd_create(id, uuid, empty_device_class, &new_id);
12519
12520 if (f) {
12521 f->open_object_section("created_osd");
12522 f->dump_int("osdid", new_id);
12523 f->close_section();
12524 f->flush(rdata);
12525 } else {
12526 ss << new_id;
12527 rdata.append(ss);
12528 }
12529 wait_for_finished_proposal(op,
12530 new Monitor::C_Command(mon, op, 0, rs, rdata,
12531 get_last_committed() + 1));
12532 return true;
12533
12534 } else if (prefix == "osd blocklist clear" ||
12535 prefix == "osd blacklist clear") {
12536 pending_inc.new_blocklist.clear();
12537 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12538 osdmap.get_blocklist(&blocklist);
12539 for (const auto &entry : blocklist) {
12540 pending_inc.old_blocklist.push_back(entry.first);
12541 }
12542 ss << " removed all blocklist entries";
12543 getline(ss, rs);
12544 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12545 get_last_committed() + 1));
12546 return true;
12547 } else if (prefix == "osd blocklist" ||
12548 prefix == "osd blacklist") {
12549 string addrstr;
12550 cmd_getval(cmdmap, "addr", addrstr);
12551 entity_addr_t addr;
12552 if (!addr.parse(addrstr.c_str(), 0)) {
12553 ss << "unable to parse address " << addrstr;
12554 err = -EINVAL;
12555 goto reply;
12556 }
12557 else {
12558 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12559 // always blocklist type ANY
12560 addr.set_type(entity_addr_t::TYPE_ANY);
12561 } else {
12562 addr.set_type(entity_addr_t::TYPE_LEGACY);
12563 }
12564
12565 string blocklistop;
12566 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12567 cmd_getval(cmdmap, "blacklistop", blocklistop);
12568 }
12569 if (blocklistop == "add") {
12570 utime_t expires = ceph_clock_now();
12571 double d;
12572 // default one hour
12573 cmd_getval(cmdmap, "expire", d,
12574 g_conf()->mon_osd_blocklist_default_expire);
12575 expires += d;
12576
12577 pending_inc.new_blocklist[addr] = expires;
12578
12579 {
12580 // cancel any pending un-blocklisting request too
12581 auto it = std::find(pending_inc.old_blocklist.begin(),
12582 pending_inc.old_blocklist.end(), addr);
12583 if (it != pending_inc.old_blocklist.end()) {
12584 pending_inc.old_blocklist.erase(it);
12585 }
12586 }
12587
12588 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12589 getline(ss, rs);
12590 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12591 get_last_committed() + 1));
12592 return true;
12593 } else if (blocklistop == "rm") {
12594 if (osdmap.is_blocklisted(addr) ||
12595 pending_inc.new_blocklist.count(addr)) {
12596 if (osdmap.is_blocklisted(addr))
12597 pending_inc.old_blocklist.push_back(addr);
12598 else
12599 pending_inc.new_blocklist.erase(addr);
12600 ss << "un-blocklisting " << addr;
12601 getline(ss, rs);
12602 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12603 get_last_committed() + 1));
12604 return true;
12605 }
12606 ss << addr << " isn't blocklisted";
12607 err = 0;
12608 goto reply;
12609 }
12610 }
12611 } else if (prefix == "osd pool mksnap") {
12612 string poolstr;
12613 cmd_getval(cmdmap, "pool", poolstr);
12614 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12615 if (pool < 0) {
12616 ss << "unrecognized pool '" << poolstr << "'";
12617 err = -ENOENT;
12618 goto reply;
12619 }
12620 string snapname;
12621 cmd_getval(cmdmap, "snap", snapname);
12622 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12623 if (p->is_unmanaged_snaps_mode()) {
12624 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12625 err = -EINVAL;
12626 goto reply;
12627 } else if (p->snap_exists(snapname.c_str())) {
12628 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12629 err = 0;
12630 goto reply;
12631 } else if (p->is_tier()) {
12632 ss << "pool " << poolstr << " is a cache tier";
12633 err = -EINVAL;
12634 goto reply;
12635 }
12636 pg_pool_t *pp = 0;
12637 if (pending_inc.new_pools.count(pool))
12638 pp = &pending_inc.new_pools[pool];
12639 if (!pp) {
12640 pp = &pending_inc.new_pools[pool];
12641 *pp = *p;
12642 }
12643 if (pp->snap_exists(snapname.c_str())) {
12644 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12645 } else {
12646 pp->add_snap(snapname.c_str(), ceph_clock_now());
12647 pp->set_snap_epoch(pending_inc.epoch);
12648 ss << "created pool " << poolstr << " snap " << snapname;
12649 }
12650 getline(ss, rs);
12651 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12652 get_last_committed() + 1));
12653 return true;
12654 } else if (prefix == "osd pool rmsnap") {
12655 string poolstr;
12656 cmd_getval(cmdmap, "pool", poolstr);
12657 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12658 if (pool < 0) {
12659 ss << "unrecognized pool '" << poolstr << "'";
12660 err = -ENOENT;
12661 goto reply;
12662 }
12663 string snapname;
12664 cmd_getval(cmdmap, "snap", snapname);
12665 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12666 if (p->is_unmanaged_snaps_mode()) {
12667 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12668 err = -EINVAL;
12669 goto reply;
12670 } else if (!p->snap_exists(snapname.c_str())) {
12671 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12672 err = 0;
12673 goto reply;
12674 }
12675 pg_pool_t *pp = 0;
12676 if (pending_inc.new_pools.count(pool))
12677 pp = &pending_inc.new_pools[pool];
12678 if (!pp) {
12679 pp = &pending_inc.new_pools[pool];
12680 *pp = *p;
12681 }
12682 snapid_t sn = pp->snap_exists(snapname.c_str());
12683 if (sn) {
12684 pp->remove_snap(sn);
12685 pp->set_snap_epoch(pending_inc.epoch);
12686 ss << "removed pool " << poolstr << " snap " << snapname;
12687 } else {
12688 ss << "already removed pool " << poolstr << " snap " << snapname;
12689 }
12690 getline(ss, rs);
12691 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12692 get_last_committed() + 1));
12693 return true;
12694 } else if (prefix == "osd pool create") {
12695 int64_t pg_num, pg_num_min;
12696 int64_t pgp_num;
12697 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12698 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12699 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12700
12701 string pool_type_str;
12702 cmd_getval(cmdmap, "pool_type", pool_type_str);
12703 if (pool_type_str.empty())
12704 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12705
12706 string poolstr;
12707 cmd_getval(cmdmap, "pool", poolstr);
12708 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12709 if (pool_id >= 0) {
12710 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12711 if (pool_type_str != p->get_type_name()) {
12712 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12713 err = -EINVAL;
12714 } else {
12715 ss << "pool '" << poolstr << "' already exists";
12716 err = 0;
12717 }
12718 goto reply;
12719 }
12720
12721 int pool_type;
12722 if (pool_type_str == "replicated") {
12723 pool_type = pg_pool_t::TYPE_REPLICATED;
12724 } else if (pool_type_str == "erasure") {
12725 pool_type = pg_pool_t::TYPE_ERASURE;
12726 } else {
12727 ss << "unknown pool type '" << pool_type_str << "'";
12728 err = -EINVAL;
12729 goto reply;
12730 }
12731
12732 bool implicit_rule_creation = false;
12733 int64_t expected_num_objects = 0;
12734 string rule_name;
12735 cmd_getval(cmdmap, "rule", rule_name);
12736 string erasure_code_profile;
12737 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12738
12739 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12740 if (erasure_code_profile == "")
12741 erasure_code_profile = "default";
12742 //handle the erasure code profile
12743 if (erasure_code_profile == "default") {
12744 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12745 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12746 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12747 goto wait;
12748 }
12749
12750 map<string,string> profile_map;
12751 err = osdmap.get_erasure_code_profile_default(cct,
12752 profile_map,
12753 &ss);
12754 if (err)
12755 goto reply;
12756 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12757 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12758 goto wait;
12759 }
12760 }
12761 if (rule_name == "") {
12762 implicit_rule_creation = true;
12763 if (erasure_code_profile == "default") {
12764 rule_name = "erasure-code";
12765 } else {
12766 dout(1) << "implicitly use rule named after the pool: "
12767 << poolstr << dendl;
12768 rule_name = poolstr;
12769 }
12770 }
12771 cmd_getval(cmdmap, "expected_num_objects",
12772 expected_num_objects, int64_t(0));
12773 } else {
12774 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12775 // and put expected_num_objects to rule field
12776 if (erasure_code_profile != "") { // cmd is from CLI
12777 if (rule_name != "") {
12778 string interr;
12779 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12780 if (interr.length()) {
12781 ss << "error parsing integer value '" << rule_name << "': " << interr;
12782 err = -EINVAL;
12783 goto reply;
12784 }
12785 }
12786 rule_name = erasure_code_profile;
12787 } else { // cmd is well-formed
12788 cmd_getval(cmdmap, "expected_num_objects",
12789 expected_num_objects, int64_t(0));
12790 }
12791 }
12792
12793 if (!implicit_rule_creation && rule_name != "") {
12794 int rule;
12795 err = get_crush_rule(rule_name, &rule, &ss);
12796 if (err == -EAGAIN) {
12797 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12798 return true;
12799 }
12800 if (err)
12801 goto reply;
12802 }
12803
12804 if (expected_num_objects < 0) {
12805 ss << "'expected_num_objects' must be non-negative";
12806 err = -EINVAL;
12807 goto reply;
12808 }
12809
12810 set<int32_t> osds;
12811 osdmap.get_all_osds(osds);
12812 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12813 string type;
12814 if (!get_osd_objectstore_type(osd, &type)) {
12815 return type == "filestore";
12816 } else {
12817 return false;
12818 }
12819 });
12820
12821 if (has_filestore_osd &&
12822 expected_num_objects > 0 &&
12823 cct->_conf->filestore_merge_threshold > 0) {
12824 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12825 err = -EINVAL;
12826 goto reply;
12827 }
12828
12829 if (has_filestore_osd &&
12830 expected_num_objects == 0 &&
12831 cct->_conf->filestore_merge_threshold < 0) {
12832 int osds = osdmap.get_num_osds();
12833 bool sure = false;
12834 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12835 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12836 ss << "For better initial performance on pools expected to store a "
12837 << "large number of objects, consider supplying the "
12838 << "expected_num_objects parameter when creating the pool."
12839 << " Pass --yes-i-really-mean-it to ignore it";
12840 err = -EPERM;
12841 goto reply;
12842 }
12843 }
12844
12845 int64_t fast_read_param;
12846 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12847 FastReadType fast_read = FAST_READ_DEFAULT;
12848 if (fast_read_param == 0)
12849 fast_read = FAST_READ_OFF;
12850 else if (fast_read_param > 0)
12851 fast_read = FAST_READ_ON;
12852
12853 int64_t repl_size = 0;
12854 cmd_getval(cmdmap, "size", repl_size);
12855 int64_t target_size_bytes = 0;
12856 double target_size_ratio = 0.0;
12857 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12858 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12859
12860 string pg_autoscale_mode;
12861 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12862
12863 err = prepare_new_pool(poolstr,
12864 -1, // default crush rule
12865 rule_name,
12866 pg_num, pgp_num, pg_num_min,
12867 repl_size, target_size_bytes, target_size_ratio,
12868 erasure_code_profile, pool_type,
12869 (uint64_t)expected_num_objects,
12870 fast_read,
12871 pg_autoscale_mode,
12872 &ss);
12873 if (err < 0) {
12874 switch(err) {
12875 case -EEXIST:
12876 ss << "pool '" << poolstr << "' already exists";
12877 break;
12878 case -EAGAIN:
12879 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12880 return true;
12881 case -ERANGE:
12882 goto reply;
12883 default:
12884 goto reply;
12885 break;
12886 }
12887 } else {
12888 ss << "pool '" << poolstr << "' created";
12889 }
12890 getline(ss, rs);
12891 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12892 get_last_committed() + 1));
12893 return true;
12894
12895 } else if (prefix == "osd pool delete" ||
12896 prefix == "osd pool rm") {
12897 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12898 string poolstr, poolstr2, sure;
12899 cmd_getval(cmdmap, "pool", poolstr);
12900 cmd_getval(cmdmap, "pool2", poolstr2);
12901 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12902 if (pool < 0) {
12903 ss << "pool '" << poolstr << "' does not exist";
12904 err = 0;
12905 goto reply;
12906 }
12907
12908 bool force_no_fake = false;
12909 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12910 bool force = false;
12911 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12912 if (poolstr2 != poolstr ||
12913 (!force && !force_no_fake)) {
12914 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12915 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12916 << "followed by --yes-i-really-really-mean-it.";
12917 err = -EPERM;
12918 goto reply;
12919 }
12920 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12921 if (err == -EAGAIN) {
12922 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12923 return true;
12924 }
12925 if (err < 0)
12926 goto reply;
12927 goto update;
12928 } else if (prefix == "osd pool rename") {
12929 string srcpoolstr, destpoolstr;
12930 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12931 cmd_getval(cmdmap, "destpool", destpoolstr);
12932 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12933 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12934
12935 if (pool_src < 0) {
12936 if (pool_dst >= 0) {
12937 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12938 // of operations, assume this rename succeeded, as it is not changing
12939 // the current state. Make sure we output something understandable
12940 // for whoever is issuing the command, if they are paying attention,
12941 // in case it was not intentional; or to avoid a "wtf?" and a bug
12942 // report in case it was intentional, while expecting a failure.
12943 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12944 << destpoolstr << "' does -- assuming successful rename";
12945 err = 0;
12946 } else {
12947 ss << "unrecognized pool '" << srcpoolstr << "'";
12948 err = -ENOENT;
12949 }
12950 goto reply;
12951 } else if (pool_dst >= 0) {
12952 // source pool exists and so does the destination pool
12953 ss << "pool '" << destpoolstr << "' already exists";
12954 err = -EEXIST;
12955 goto reply;
12956 }
12957
12958 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12959 if (ret == 0) {
12960 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12961 } else {
12962 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12963 << cpp_strerror(ret);
12964 }
12965 getline(ss, rs);
12966 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12967 get_last_committed() + 1));
12968 return true;
12969
12970 } else if (prefix == "osd pool set") {
12971 err = prepare_command_pool_set(cmdmap, ss);
12972 if (err == -EAGAIN)
12973 goto wait;
12974 if (err < 0)
12975 goto reply;
12976
12977 getline(ss, rs);
12978 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12979 get_last_committed() + 1));
12980 return true;
12981 } else if (prefix == "osd tier add") {
12982 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12983 if (err == -EAGAIN)
12984 goto wait;
12985 if (err)
12986 goto reply;
12987 string poolstr;
12988 cmd_getval(cmdmap, "pool", poolstr);
12989 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12990 if (pool_id < 0) {
12991 ss << "unrecognized pool '" << poolstr << "'";
12992 err = -ENOENT;
12993 goto reply;
12994 }
12995 string tierpoolstr;
12996 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12997 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12998 if (tierpool_id < 0) {
12999 ss << "unrecognized pool '" << tierpoolstr << "'";
13000 err = -ENOENT;
13001 goto reply;
13002 }
13003 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13004 ceph_assert(p);
13005 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13006 ceph_assert(tp);
13007
13008 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13009 goto reply;
13010 }
13011
13012 // make sure new tier is empty
13013 string force_nonempty;
13014 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
13015 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13016 if (pstats && pstats->stats.sum.num_objects != 0 &&
13017 force_nonempty != "--force-nonempty") {
13018 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13019 err = -ENOTEMPTY;
13020 goto reply;
13021 }
13022 if (tp->is_erasure()) {
13023 ss << "tier pool '" << tierpoolstr
13024 << "' is an ec pool, which cannot be a tier";
13025 err = -ENOTSUP;
13026 goto reply;
13027 }
13028 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13029 ((force_nonempty != "--force-nonempty") ||
13030 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
13031 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13032 err = -ENOTEMPTY;
13033 goto reply;
13034 }
13035 // go
13036 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13037 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13038 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13039 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13040 return true;
13041 }
13042 np->tiers.insert(tierpool_id);
13043 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13044 ntp->tier_of = pool_id;
13045 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13046 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13047 get_last_committed() + 1));
13048 return true;
13049 } else if (prefix == "osd tier remove" ||
13050 prefix == "osd tier rm") {
13051 string poolstr;
13052 cmd_getval(cmdmap, "pool", poolstr);
13053 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13054 if (pool_id < 0) {
13055 ss << "unrecognized pool '" << poolstr << "'";
13056 err = -ENOENT;
13057 goto reply;
13058 }
13059 string tierpoolstr;
13060 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13061 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13062 if (tierpool_id < 0) {
13063 ss << "unrecognized pool '" << tierpoolstr << "'";
13064 err = -ENOENT;
13065 goto reply;
13066 }
13067 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13068 ceph_assert(p);
13069 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13070 ceph_assert(tp);
13071
13072 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13073 goto reply;
13074 }
13075
13076 if (p->tiers.count(tierpool_id) == 0) {
13077 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13078 err = 0;
13079 goto reply;
13080 }
13081 if (tp->tier_of != pool_id) {
13082 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13083 << osdmap.get_pool_name(tp->tier_of) << "': "
13084 // be scary about it; this is an inconsistency and bells must go off
13085 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13086 err = -EINVAL;
13087 goto reply;
13088 }
13089 if (p->read_tier == tierpool_id) {
13090 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13091 err = -EBUSY;
13092 goto reply;
13093 }
13094 // go
13095 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13096 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13097 if (np->tiers.count(tierpool_id) == 0 ||
13098 ntp->tier_of != pool_id ||
13099 np->read_tier == tierpool_id) {
13100 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13101 return true;
13102 }
13103 np->tiers.erase(tierpool_id);
13104 ntp->clear_tier();
13105 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13106 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13107 get_last_committed() + 1));
13108 return true;
13109 } else if (prefix == "osd tier set-overlay") {
13110 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13111 if (err == -EAGAIN)
13112 goto wait;
13113 if (err)
13114 goto reply;
13115 string poolstr;
13116 cmd_getval(cmdmap, "pool", poolstr);
13117 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13118 if (pool_id < 0) {
13119 ss << "unrecognized pool '" << poolstr << "'";
13120 err = -ENOENT;
13121 goto reply;
13122 }
13123 string overlaypoolstr;
13124 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13125 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13126 if (overlaypool_id < 0) {
13127 ss << "unrecognized pool '" << overlaypoolstr << "'";
13128 err = -ENOENT;
13129 goto reply;
13130 }
13131 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13132 ceph_assert(p);
13133 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13134 ceph_assert(overlay_p);
13135 if (p->tiers.count(overlaypool_id) == 0) {
13136 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13137 err = -EINVAL;
13138 goto reply;
13139 }
13140 if (p->read_tier == overlaypool_id) {
13141 err = 0;
13142 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13143 goto reply;
13144 }
13145 if (p->has_read_tier()) {
13146 ss << "pool '" << poolstr << "' has overlay '"
13147 << osdmap.get_pool_name(p->read_tier)
13148 << "'; please remove-overlay first";
13149 err = -EINVAL;
13150 goto reply;
13151 }
13152
13153 // go
13154 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13155 np->read_tier = overlaypool_id;
13156 np->write_tier = overlaypool_id;
13157 np->set_last_force_op_resend(pending_inc.epoch);
13158 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13159 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13160 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13161 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13162 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13163 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13164 get_last_committed() + 1));
13165 return true;
13166 } else if (prefix == "osd tier remove-overlay" ||
13167 prefix == "osd tier rm-overlay") {
13168 string poolstr;
13169 cmd_getval(cmdmap, "pool", poolstr);
13170 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13171 if (pool_id < 0) {
13172 ss << "unrecognized pool '" << poolstr << "'";
13173 err = -ENOENT;
13174 goto reply;
13175 }
13176 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13177 ceph_assert(p);
13178 if (!p->has_read_tier()) {
13179 err = 0;
13180 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13181 goto reply;
13182 }
13183
13184 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13185 goto reply;
13186 }
13187
13188 // go
13189 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13190 if (np->has_read_tier()) {
13191 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13192 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13193 nop->set_last_force_op_resend(pending_inc.epoch);
13194 }
13195 if (np->has_write_tier()) {
13196 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13197 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13198 nop->set_last_force_op_resend(pending_inc.epoch);
13199 }
13200 np->clear_read_tier();
13201 np->clear_write_tier();
13202 np->set_last_force_op_resend(pending_inc.epoch);
13203 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13204 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13205 get_last_committed() + 1));
13206 return true;
13207 } else if (prefix == "osd tier cache-mode") {
13208 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13209 if (err == -EAGAIN)
13210 goto wait;
13211 if (err)
13212 goto reply;
13213 string poolstr;
13214 cmd_getval(cmdmap, "pool", poolstr);
13215 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13216 if (pool_id < 0) {
13217 ss << "unrecognized pool '" << poolstr << "'";
13218 err = -ENOENT;
13219 goto reply;
13220 }
13221 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13222 ceph_assert(p);
13223 if (!p->is_tier()) {
13224 ss << "pool '" << poolstr << "' is not a tier";
13225 err = -EINVAL;
13226 goto reply;
13227 }
13228 string modestr;
13229 cmd_getval(cmdmap, "mode", modestr);
13230 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13231 if (int(mode) < 0) {
13232 ss << "'" << modestr << "' is not a valid cache mode";
13233 err = -EINVAL;
13234 goto reply;
13235 }
13236
13237 bool sure = false;
13238 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13239
13240 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13241 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13242 ss << "'" << modestr << "' is no longer a supported cache mode";
13243 err = -EPERM;
13244 goto reply;
13245 }
13246 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13247 mode != pg_pool_t::CACHEMODE_NONE &&
13248 mode != pg_pool_t::CACHEMODE_PROXY &&
13249 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13250 !sure) {
13251 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13252 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13253 err = -EPERM;
13254 goto reply;
13255 }
13256
13257 // pool already has this cache-mode set and there are no pending changes
13258 if (p->cache_mode == mode &&
13259 (pending_inc.new_pools.count(pool_id) == 0 ||
13260 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13261 ss << "set cache-mode for pool '" << poolstr << "'"
13262 << " to " << pg_pool_t::get_cache_mode_name(mode);
13263 err = 0;
13264 goto reply;
13265 }
13266
13267 /* Mode description:
13268 *
13269 * none: No cache-mode defined
13270 * forward: Forward all reads and writes to base pool [removed]
13271 * writeback: Cache writes, promote reads from base pool
13272 * readonly: Forward writes to base pool
13273 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13274 * proxy: Proxy all reads and writes to base pool
13275 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13276 *
13277 * Hence, these are the allowed transitions:
13278 *
13279 * none -> any
13280 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13281 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13282 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13283 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13284 * writeback -> readproxy || proxy
13285 * readonly -> any
13286 */
13287
13288 // We check if the transition is valid against the current pool mode, as
13289 // it is the only committed state thus far. We will blantly squash
13290 // whatever mode is on the pending state.
13291
13292 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13293 (mode != pg_pool_t::CACHEMODE_PROXY &&
13294 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13295 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13296 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13297 << "' pool; only '"
13298 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13299 << "' allowed.";
13300 err = -EINVAL;
13301 goto reply;
13302 }
13303 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13304 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13305 mode != pg_pool_t::CACHEMODE_PROXY &&
13306 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13307
13308 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13309 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13310 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13311
13312 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13313 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13314 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13315
13316 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13317 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13318 mode != pg_pool_t::CACHEMODE_PROXY &&
13319 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13320
13321 const pool_stat_t* pstats =
13322 mon.mgrstatmon()->get_pool_stat(pool_id);
13323
13324 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13325 ss << "unable to set cache-mode '"
13326 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13327 << "': dirty objects found";
13328 err = -EBUSY;
13329 goto reply;
13330 }
13331 }
13332 // go
13333 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13334 np->cache_mode = mode;
13335 // set this both when moving to and from cache_mode NONE. this is to
13336 // capture legacy pools that were set up before this flag existed.
13337 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13338 ss << "set cache-mode for pool '" << poolstr
13339 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13340 if (mode == pg_pool_t::CACHEMODE_NONE) {
13341 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13342 ceph_assert(base_pool);
13343 if (base_pool->read_tier == pool_id ||
13344 base_pool->write_tier == pool_id)
13345 ss <<" (WARNING: pool is still configured as read or write tier)";
13346 }
13347 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13348 get_last_committed() + 1));
13349 return true;
13350 } else if (prefix == "osd tier add-cache") {
13351 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13352 if (err == -EAGAIN)
13353 goto wait;
13354 if (err)
13355 goto reply;
13356 string poolstr;
13357 cmd_getval(cmdmap, "pool", poolstr);
13358 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13359 if (pool_id < 0) {
13360 ss << "unrecognized pool '" << poolstr << "'";
13361 err = -ENOENT;
13362 goto reply;
13363 }
13364 string tierpoolstr;
13365 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13366 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13367 if (tierpool_id < 0) {
13368 ss << "unrecognized pool '" << tierpoolstr << "'";
13369 err = -ENOENT;
13370 goto reply;
13371 }
13372 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13373 ceph_assert(p);
13374 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13375 ceph_assert(tp);
13376
13377 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13378 goto reply;
13379 }
13380
13381 int64_t size = 0;
13382 if (!cmd_getval(cmdmap, "size", size)) {
13383 ss << "unable to parse 'size' value '"
13384 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13385 err = -EINVAL;
13386 goto reply;
13387 }
13388 // make sure new tier is empty
13389 const pool_stat_t *pstats =
13390 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13391 if (pstats && pstats->stats.sum.num_objects != 0) {
13392 ss << "tier pool '" << tierpoolstr << "' is not empty";
13393 err = -ENOTEMPTY;
13394 goto reply;
13395 }
13396 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13397 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13398 if (int(mode) < 0) {
13399 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13400 err = -EINVAL;
13401 goto reply;
13402 }
13403 HitSet::Params hsp;
13404 auto& cache_hit_set_type =
13405 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13406 if (cache_hit_set_type == "bloom") {
13407 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13408 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13409 hsp = HitSet::Params(bsp);
13410 } else if (cache_hit_set_type == "explicit_hash") {
13411 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13412 } else if (cache_hit_set_type == "explicit_object") {
13413 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13414 } else {
13415 ss << "osd tier cache default hit set type '"
13416 << cache_hit_set_type << "' is not a known type";
13417 err = -EINVAL;
13418 goto reply;
13419 }
13420 // go
13421 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13422 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13423 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13424 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13425 return true;
13426 }
13427 np->tiers.insert(tierpool_id);
13428 np->read_tier = np->write_tier = tierpool_id;
13429 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13430 np->set_last_force_op_resend(pending_inc.epoch);
13431 ntp->set_last_force_op_resend(pending_inc.epoch);
13432 ntp->tier_of = pool_id;
13433 ntp->cache_mode = mode;
13434 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13435 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13436 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13437 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13438 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13439 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13440 ntp->hit_set_params = hsp;
13441 ntp->target_max_bytes = size;
13442 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13443 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13444 get_last_committed() + 1));
13445 return true;
13446 } else if (prefix == "osd pool set-quota") {
13447 string poolstr;
13448 cmd_getval(cmdmap, "pool", poolstr);
13449 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13450 if (pool_id < 0) {
13451 ss << "unrecognized pool '" << poolstr << "'";
13452 err = -ENOENT;
13453 goto reply;
13454 }
13455
13456 string field;
13457 cmd_getval(cmdmap, "field", field);
13458 if (field != "max_objects" && field != "max_bytes") {
13459 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13460 err = -EINVAL;
13461 goto reply;
13462 }
13463
13464 // val could contain unit designations, so we treat as a string
13465 string val;
13466 cmd_getval(cmdmap, "val", val);
13467 string tss;
13468 int64_t value;
13469 if (field == "max_objects") {
13470 value = strict_sistrtoll(val.c_str(), &tss);
13471 } else if (field == "max_bytes") {
13472 value = strict_iecstrtoll(val.c_str(), &tss);
13473 } else {
13474 ceph_abort_msg("unrecognized option");
13475 }
13476 if (!tss.empty()) {
13477 ss << "error parsing value '" << val << "': " << tss;
13478 err = -EINVAL;
13479 goto reply;
13480 }
13481
13482 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13483 if (field == "max_objects") {
13484 pi->quota_max_objects = value;
13485 } else if (field == "max_bytes") {
13486 pi->quota_max_bytes = value;
13487 } else {
13488 ceph_abort_msg("unrecognized option");
13489 }
13490 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13491 rs = ss.str();
13492 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13493 get_last_committed() + 1));
13494 return true;
13495 } else if (prefix == "osd pool application enable" ||
13496 prefix == "osd pool application disable" ||
13497 prefix == "osd pool application set" ||
13498 prefix == "osd pool application rm") {
13499 err = prepare_command_pool_application(prefix, cmdmap, ss);
13500 if (err == -EAGAIN) {
13501 goto wait;
13502 } else if (err < 0) {
13503 goto reply;
13504 } else {
13505 goto update;
13506 }
13507 } else if (prefix == "osd force-create-pg") {
13508 pg_t pgid;
13509 string pgidstr;
13510 cmd_getval(cmdmap, "pgid", pgidstr);
13511 if (!pgid.parse(pgidstr.c_str())) {
13512 ss << "invalid pgid '" << pgidstr << "'";
13513 err = -EINVAL;
13514 goto reply;
13515 }
13516 if (!osdmap.pg_exists(pgid)) {
13517 ss << "pg " << pgid << " should not exist";
13518 err = -ENOENT;
13519 goto reply;
13520 }
13521 bool sure = false;
13522 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13523 if (!sure) {
13524 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13525 << "that the cluster will give up ever trying to recover the lost data. Do this "
13526 << "only if you are certain that all copies of the PG are in fact lost and you are "
13527 << "willing to accept that the data is permanently destroyed. Pass "
13528 << "--yes-i-really-mean-it to proceed.";
13529 err = -EPERM;
13530 goto reply;
13531 }
13532 bool creating_now;
13533 {
13534 std::lock_guard<std::mutex> l(creating_pgs_lock);
13535 auto emplaced = creating_pgs.pgs.emplace(
13536 pgid,
13537 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13538 ceph_clock_now()));
13539 creating_now = emplaced.second;
13540 }
13541 if (creating_now) {
13542 ss << "pg " << pgidstr << " now creating, ok";
13543 // set the pool's CREATING flag so that (1) the osd won't ignore our
13544 // create message and (2) we won't propose any future pg_num changes
13545 // until after the PG has been instantiated.
13546 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13547 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13548 }
13549 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13550 err = 0;
13551 goto update;
13552 } else {
13553 ss << "pg " << pgid << " already creating";
13554 err = 0;
13555 goto reply;
13556 }
13557 } else if (prefix == "osd force_healthy_stretch_mode") {
13558 bool sure = false;
13559 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13560 if (!sure) {
13561 ss << "This command will require peering across multiple CRUSH buckets "
13562 "(probably two data centers or availability zones?) and may result in PGs "
13563 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13564 err = -EPERM;
13565 goto reply;
13566 }
13567 try_end_recovery_stretch_mode(true);
13568 ss << "Triggering healthy stretch mode";
13569 err = 0;
13570 goto reply;
13571 } else if (prefix == "osd force_recovery_stretch_mode") {
13572 bool sure = false;
13573 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13574 if (!sure) {
13575 ss << "This command will increase pool sizes to try and spread them "
13576 "across multiple CRUSH buckets (probably two data centers or "
13577 "availability zones?) and should have happened automatically"
13578 "Pass --yes-i-really-mean-it to proceed.";
13579 err = -EPERM;
13580 goto reply;
13581 }
13582 mon.go_recovery_stretch_mode();
13583 ss << "Triggering recovery stretch mode";
13584 err = 0;
13585 goto reply;
13586 } else {
13587 err = -EINVAL;
13588 }
13589
13590 reply:
13591 getline(ss, rs);
13592 if (err < 0 && rs.length() == 0)
13593 rs = cpp_strerror(err);
13594 mon.reply_command(op, err, rs, rdata, get_last_committed());
13595 return ret;
13596
13597 update:
13598 getline(ss, rs);
13599 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13600 get_last_committed() + 1));
13601 return true;
13602
13603 wait:
13604 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13605 return true;
13606 }
13607
13608 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13609 {
13610 op->mark_osdmon_event(__func__);
13611
13612 auto m = op->get_req<MPoolOp>();
13613 MonSession *session = op->get_session();
13614 if (!session) {
13615 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13616 return true;
13617 }
13618
13619 switch (m->op) {
13620 case POOL_OP_CREATE_UNMANAGED_SNAP:
13621 case POOL_OP_DELETE_UNMANAGED_SNAP:
13622 {
13623 const std::string* pool_name = nullptr;
13624 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13625 if (pg_pool != nullptr) {
13626 pool_name = &osdmap.get_pool_name(m->pool);
13627 }
13628
13629 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
13630 session->entity_name, session->caps,
13631 session->get_peer_socket_addr(),
13632 pool_name)) {
13633 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13634 << "privileges. message: " << *m << std::endl
13635 << "caps: " << session->caps << dendl;
13636 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13637 return true;
13638 }
13639 }
13640 break;
13641 default:
13642 if (!session->is_capable("osd", MON_CAP_W)) {
13643 dout(0) << "got pool op from entity with insufficient privileges. "
13644 << "message: " << *m << std::endl
13645 << "caps: " << session->caps << dendl;
13646 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13647 return true;
13648 }
13649 break;
13650 }
13651
13652 return false;
13653 }
13654
// Fast path for pool ops: answer requests that need no osdmap change
// (bad fsid, idempotent snap ops, invalid mode combinations, missing
// pools) without going through paxos.  Returns true if the op was fully
// handled here; false defers it to prepare_pool_op().
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // the caps check replies -EPERM itself when it rejects the op
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  if (m->fsid != mon.monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting a pool that is already gone is an idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps and unmanaged (self-managed) snaps are mutually
    // exclusive, and tiers cannot take pool snaps at all
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already present: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // invalid on a pool-snaps pool; actual creation needs a map change
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already deleted (or never existed): idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // already removed/purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with the requested
    // *name* still exists, and only falls through to prepare_pool_op()
    // otherwise — presumably a name/id consistency shortcut; verify
    // against client expectations before changing.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // no fast path; rejected as unsupported in prepare_pool_op()
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13742
13743 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13744 {
13745 if (!osdmap.have_pg_pool(pool)) {
13746 dout(10) << __func__ << " pool " << pool << " snap " << snap
13747 << " - pool dne" << dendl;
13748 return true;
13749 }
13750 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13751 dout(10) << __func__ << " pool " << pool << " snap " << snap
13752 << " - in osdmap removed_snaps_queue" << dendl;
13753 return true;
13754 }
13755 snapid_t begin, end;
13756 int r = lookup_purged_snap(pool, snap, &begin, &end);
13757 if (r == 0) {
13758 dout(10) << __func__ << " pool " << pool << " snap " << snap
13759 << " - purged, [" << begin << "," << end << ")" << dendl;
13760 return true;
13761 }
13762 return false;
13763 }
13764
13765 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13766 {
13767 if (pending_inc.old_pools.count(pool)) {
13768 dout(10) << __func__ << " pool " << pool << " snap " << snap
13769 << " - pool pending deletion" << dendl;
13770 return true;
13771 }
13772 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13773 dout(10) << __func__ << " pool " << pool << " snap " << snap
13774 << " - in pending new_removed_snaps" << dendl;
13775 return true;
13776 }
13777 return false;
13778 }
13779
13780 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13781 {
13782 op->mark_osdmon_event(__func__);
13783 auto m = op->get_req<MPoolOp>();
13784 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13785 if (pool >= 0) {
13786 _pool_op_reply(op, 0, osdmap.get_epoch());
13787 return true;
13788 }
13789
13790 return false;
13791 }
13792
// Apply a pool op that requires an osdmap change: snap create/delete in
// either pool-snaps or unmanaged-snaps mode, plus pool create/delete
// (delegated).  Idempotent or invalid requests get an immediate reply;
// otherwise the reply is deferred until the pending map change commits.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  // the pool may have vanished since preprocess_pool_op() ran
  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // first switch: validate against the *committed* pool state
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is an
      // idempotent no-op (ret stays 0); otherwise fall out of the
      // switch and apply the change below
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
          || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      // pool-snap ops are invalid on an unmanaged-snaps pool
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any already-pending update in this
  // proposal so we do not clobber earlier changes
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked against the *projected* pool state this time)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // final switch: actually apply the change to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
               << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        pending_inc.new_removed_snaps[m->pool].insert(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus maps also track removed snaps inline in the pool
      uint64_t snapid = pp.add_unmanaged_snap(
        osdmap.require_osd_release < ceph_release_t::octopus);
      encode(snapid, reply_data);  // new snapid is returned to the client
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
        !_is_pending_removed_snap(m->pool, m->snapid)) {
      if (m->snapid > pp.get_snap_seq()) {
        // snapid was never issued: nothing to delete
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
        m->snapid,
        osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auids were removed from Ceph; this op is no longer supported
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13947
13948 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13949 {
13950 op->mark_osdmon_event(__func__);
13951 int err = prepare_new_pool(op);
13952 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13953 return true;
13954 }
13955
13956 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13957 ostream *ss)
13958 {
13959 const string& poolstr = osdmap.get_pool_name(pool_id);
13960
13961 // If the Pool is in use by CephFS, refuse to delete it
13962 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
13963 if (pending_fsmap.pool_in_use(pool_id)) {
13964 *ss << "pool '" << poolstr << "' is in use by CephFS";
13965 return -EBUSY;
13966 }
13967
13968 if (pool.tier_of >= 0) {
13969 *ss << "pool '" << poolstr << "' is a tier of '"
13970 << osdmap.get_pool_name(pool.tier_of) << "'";
13971 return -EBUSY;
13972 }
13973 if (!pool.tiers.empty()) {
13974 *ss << "pool '" << poolstr << "' has tiers";
13975 for(auto tier : pool.tiers) {
13976 *ss << " " << osdmap.get_pool_name(tier);
13977 }
13978 return -EBUSY;
13979 }
13980
13981 if (!g_conf()->mon_allow_pool_delete) {
13982 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13983 return -EPERM;
13984 }
13985
13986 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13987 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13988 return -EPERM;
13989 }
13990
13991 *ss << "pool '" << poolstr << "' removed";
13992 return 0;
13993 }
13994
13995 /**
13996 * Check if it is safe to add a tier to a base pool
13997 *
13998 * @return
13999 * True if the operation should proceed, false if we should abort here
14000 * (abort doesn't necessarily mean error, could be idempotency)
14001 */
14002 bool OSDMonitor::_check_become_tier(
14003 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14004 const int64_t base_pool_id, const pg_pool_t *base_pool,
14005 int *err,
14006 ostream *ss) const
14007 {
14008 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14009 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14010
14011 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14012 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14013 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14014 *err = -EBUSY;
14015 return false;
14016 }
14017
14018 if (base_pool->tiers.count(tier_pool_id)) {
14019 ceph_assert(tier_pool->tier_of == base_pool_id);
14020 *err = 0;
14021 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14022 << base_pool_name << "'";
14023 return false;
14024 }
14025
14026 if (base_pool->is_tier()) {
14027 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14028 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14029 << "multiple tiers are not yet supported.";
14030 *err = -EINVAL;
14031 return false;
14032 }
14033
14034 if (tier_pool->has_tiers()) {
14035 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14036 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14037 it != tier_pool->tiers.end(); ++it)
14038 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14039 *ss << " multiple tiers are not yet supported.";
14040 *err = -EINVAL;
14041 return false;
14042 }
14043
14044 if (tier_pool->is_tier()) {
14045 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14046 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14047 *err = -EINVAL;
14048 return false;
14049 }
14050
14051 *err = 0;
14052 return true;
14053 }
14054
14055
14056 /**
14057 * Check if it is safe to remove a tier from this base pool
14058 *
14059 * @return
14060 * True if the operation should proceed, false if we should abort here
14061 * (abort doesn't necessarily mean error, could be idempotency)
14062 */
14063 bool OSDMonitor::_check_remove_tier(
14064 const int64_t base_pool_id, const pg_pool_t *base_pool,
14065 const pg_pool_t *tier_pool,
14066 int *err, ostream *ss) const
14067 {
14068 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14069
14070 // Apply CephFS-specific checks
14071 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14072 if (pending_fsmap.pool_in_use(base_pool_id)) {
14073 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14074 // If the underlying pool is erasure coded and does not allow EC
14075 // overwrites, we can't permit the removal of the replicated tier that
14076 // CephFS relies on to access it
14077 *ss << "pool '" << base_pool_name <<
14078 "' does not allow EC overwrites and is in use by CephFS"
14079 " via its tier";
14080 *err = -EBUSY;
14081 return false;
14082 }
14083
14084 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14085 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14086 "tier is still in use as a writeback cache. Change the cache "
14087 "mode and flush the cache before removing it";
14088 *err = -EBUSY;
14089 return false;
14090 }
14091 }
14092
14093 *err = 0;
14094 return true;
14095 }
14096
// Queue removal of a pool in pending_inc (or, with mon_fake_pool_delete,
// merely rename it out of the way).  Also scrubs every pg_temp,
// primary_temp, pg_upmap / pg_upmap_items entry and any crush
// choose_args that reference the pool.  Returns 0 on success, a
// negative errno on refusal, or -EAGAIN when the pending state is
// inconsistent and the op should be retried after the next proposal.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // safety checks against the committed pool state
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  if (pending_inc.old_pools.count(pool)) {
    // removal already queued in this proposal: idempotent success
    dout(10) << __func__ << " " << pool << " already pending removal"
             << dendl;
    return 0;
  }

  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    // fake deletion: hide the pool under a .DELETED name instead of
    // actually destroying any data
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
            << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
               << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14212
14213 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14214 {
14215 dout(10) << "_prepare_rename_pool " << pool << dendl;
14216 if (pending_inc.old_pools.count(pool)) {
14217 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14218 return -ENOENT;
14219 }
14220 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14221 p != pending_inc.new_pool_names.end();
14222 ++p) {
14223 if (p->second == newname && p->first != pool) {
14224 return -EEXIST;
14225 }
14226 }
14227
14228 pending_inc.new_pool_names[pool] = newname;
14229 return 0;
14230 }
14231
14232 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14233 {
14234 op->mark_osdmon_event(__func__);
14235 auto m = op->get_req<MPoolOp>();
14236 ostringstream ss;
14237 int ret = _prepare_remove_pool(m->pool, &ss, false);
14238 if (ret == -EAGAIN) {
14239 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14240 return true;
14241 }
14242 if (ret < 0)
14243 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14244 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14245 pending_inc.epoch));
14246 return true;
14247 }
14248
14249 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14250 int ret, epoch_t epoch, bufferlist *blp)
14251 {
14252 op->mark_osdmon_event(__func__);
14253 auto m = op->get_req<MPoolOp>();
14254 dout(20) << "_pool_op_reply " << ret << dendl;
14255 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14256 ret, epoch, get_last_committed(), blp);
14257 mon.send_reply(op, reply);
14258 }
14259
14260 void OSDMonitor::convert_pool_priorities(void)
14261 {
14262 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14263 int64_t max_prio = 0;
14264 int64_t min_prio = 0;
14265 for (const auto &i : osdmap.get_pools()) {
14266 const auto &pool = i.second;
14267
14268 if (pool.opts.is_set(key)) {
14269 int64_t prio = 0;
14270 pool.opts.get(key, &prio);
14271 if (prio > max_prio)
14272 max_prio = prio;
14273 if (prio < min_prio)
14274 min_prio = prio;
14275 }
14276 }
14277 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14278 dout(20) << __func__ << " nothing to fix" << dendl;
14279 return;
14280 }
14281 // Current pool priorities exceeds new maximum
14282 for (const auto &i : osdmap.get_pools()) {
14283 const auto pool_id = i.first;
14284 pg_pool_t pool = i.second;
14285
14286 int64_t prio = 0;
14287 pool.opts.get(key, &prio);
14288 int64_t n;
14289
14290 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14291 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14292 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14293 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14294 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14295 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14296 } else {
14297 continue;
14298 }
14299 if (n == 0) {
14300 pool.opts.unset(key);
14301 } else {
14302 pool.opts.set(key, static_cast<int64_t>(n));
14303 }
14304 dout(10) << __func__ << " pool " << pool_id
14305 << " recovery_priority adjusted "
14306 << prio << " to " << n << dendl;
14307 pool.last_change = pending_inc.epoch;
14308 pending_inc.new_pools[pool_id] = pool;
14309 }
14310 }
14311
14312 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14313 int *errcode,
14314 set<pg_pool_t*>* pools,
14315 const string& new_crush_rule)
14316 {
14317 dout(20) << __func__ << dendl;
14318 *okay = false;
14319 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14320 if (new_crush_rule_result < 0) {
14321 ss << "unrecognized crush rule " << new_crush_rule_result;
14322 *errcode = new_crush_rule_result;
14323 return;
14324 }
14325 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14326 for (const auto& pooli : osdmap.pools) {
14327 int64_t poolid = pooli.first;
14328 const pg_pool_t *p = &pooli.second;
14329 if (!p->is_replicated()) {
14330 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14331 *errcode = -EINVAL;
14332 return;
14333 }
14334 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14335 if ((p->get_size() != default_size ||
14336 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14337 (p->get_crush_rule() != new_rule)) {
14338 ss << "we currently require stretch mode pools start out with the"
14339 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14340 *errcode = -EINVAL;
14341 return;
14342 }
14343 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14344 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14345 // the attempt may fail and then we have these pool updates...but they won't do anything
14346 // if there is a failure, so if it's hard to change the interface, no need to bother
14347 pools->insert(pp);
14348 }
14349 *okay = true;
14350 return;
14351 }
14352
14353 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14354 int *errcode, bool commit,
14355 const string& dividing_bucket,
14356 uint32_t bucket_count,
14357 const set<pg_pool_t*>& pools,
14358 const string& new_crush_rule)
14359 {
14360 dout(20) << __func__ << dendl;
14361 *okay = false;
14362 CrushWrapper crush;
14363 _get_pending_crush(crush);
14364 int dividing_id;
14365 int retval = crush.get_validated_type_id(dividing_bucket, &dividing_id);
14366 if (retval == -1) {
14367 ss << dividing_bucket << " is not a valid crush bucket type";
14368 *errcode = -ENOENT;
14369 ceph_assert(!commit || retval != -1);
14370 return;
14371 }
14372 vector<int> subtrees;
14373 crush.get_subtree_of_type(dividing_id, &subtrees);
14374 if (subtrees.size() != 2) {
14375 ss << "there are " << subtrees.size() << dividing_bucket
14376 << "'s in the cluster but stretch mode currently only works with 2!";
14377 *errcode = -EINVAL;
14378 ceph_assert(!commit || subtrees.size() == 2);
14379 return;
14380 }
14381
14382 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14383 if (new_crush_rule_result < 0) {
14384 ss << "unrecognized crush rule " << new_crush_rule;
14385 *errcode = new_crush_rule_result;
14386 ceph_assert(!commit || (new_crush_rule_result > 0));
14387 return;
14388 }
14389 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14390
14391 int weight1 = crush.get_item_weight(subtrees[0]);
14392 int weight2 = crush.get_item_weight(subtrees[1]);
14393 if (weight1 != weight2) {
14394 // TODO: I'm really not sure this is a good idea?
14395 ss << "the 2 " << dividing_bucket
14396 << "instances in the cluster have differing weights "
14397 << weight1 << " and " << weight2
14398 <<" but stretch mode currently requires they be the same!";
14399 *errcode = -EINVAL;
14400 ceph_assert(!commit || (weight1 == weight2));
14401 return;
14402 }
14403 if (bucket_count != 2) {
14404 ss << "currently we only support 2-site stretch clusters!";
14405 *errcode = -EINVAL;
14406 ceph_assert(!commit || bucket_count == 2);
14407 return;
14408 }
14409 // TODO: check CRUSH rules for pools so that we are appropriately divided
14410 if (commit) {
14411 for (auto pool : pools) {
14412 pool->crush_rule = new_rule;
14413 pool->peering_crush_bucket_count = bucket_count;
14414 pool->peering_crush_bucket_target = bucket_count;
14415 pool->peering_crush_bucket_barrier = dividing_id;
14416 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14417 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14418 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14419 }
14420 pending_inc.change_stretch_mode = true;
14421 pending_inc.stretch_mode_enabled = true;
14422 pending_inc.new_stretch_bucket_count = bucket_count;
14423 pending_inc.new_degraded_stretch_mode = 0;
14424 pending_inc.new_stretch_mode_bucket = dividing_id;
14425 }
14426 *okay = true;
14427 return;
14428 }
14429
14430 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14431 set<int> *really_down_buckets,
14432 set<string> *really_down_mons)
14433 {
14434 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14435 ceph_assert(is_readable());
14436 if (dead_buckets.empty()) return false;
14437 set<int> down_cache;
14438 bool really_down = false;
14439 for (auto dbi : dead_buckets) {
14440 const string& bucket_name = dbi.first;
14441 ceph_assert(osdmap.crush->name_exists(bucket_name));
14442 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14443 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14444 << " to see if OSDs are also down" << dendl;
14445 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14446 if (subtree_down) {
14447 dout(20) << "subtree is down!" << dendl;
14448 really_down = true;
14449 really_down_buckets->insert(bucket_id);
14450 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14451 }
14452 }
14453 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14454 << " and mons " << *really_down_mons << " are really down" << dendl;
14455 return really_down;
14456 }
14457
14458 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14459 const set<string>& live_zones)
14460 {
14461 dout(20) << __func__ << dendl;
14462 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14463 // update the general OSDMap changes
14464 pending_inc.change_stretch_mode = true;
14465 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14466 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14467 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14468 ceph_assert(new_site_count == 1); // stretch count 2!
14469 pending_inc.new_degraded_stretch_mode = new_site_count;
14470 pending_inc.new_recovering_stretch_mode = 0;
14471 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14472
14473 // and then apply them to all the pg_pool_ts
14474 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14475 const string& remaining_site_name = *(live_zones.begin());
14476 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14477 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14478 for (auto pgi : osdmap.pools) {
14479 if (pgi.second.peering_crush_bucket_count) {
14480 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14481 newp.peering_crush_bucket_count = new_site_count;
14482 newp.peering_crush_mandatory_member = remaining_site;
14483 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14484 newp.last_force_op_resend = pending_inc.epoch;
14485 }
14486 }
14487 propose_pending();
14488 }
14489
14490 void OSDMonitor::trigger_recovery_stretch_mode()
14491 {
14492 dout(20) << __func__ << dendl;
14493 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14494 pending_inc.change_stretch_mode = true;
14495 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14496 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14497 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14498 pending_inc.new_recovering_stretch_mode = 1;
14499 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14500
14501 for (auto pgi : osdmap.pools) {
14502 if (pgi.second.peering_crush_bucket_count) {
14503 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14504 newp.last_force_op_resend = pending_inc.epoch;
14505 }
14506 }
14507 propose_pending();
14508 }
14509
14510 void OSDMonitor::notify_new_pg_digest()
14511 {
14512 dout(20) << __func__ << dendl;
14513 if (!stretch_recovery_triggered.is_zero()) {
14514 try_end_recovery_stretch_mode(false);
14515 }
14516 }
14517
14518 struct CMonExitRecovery : public Context {
14519 OSDMonitor *m;
14520 bool force;
14521 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14522 void finish(int r) {
14523 m->try_end_recovery_stretch_mode(force);
14524 }
14525 };
14526
// Attempt to leave "recovering" stretch mode and return to healthy
// stretch mode.  Only the leader does this, only while actually in
// degraded+recovering mode, and only once recovery has finished (or
// when force is set).  Defers itself via CMonExitRecovery whenever the
// relevant paxos services are not yet readable.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // retry once the osdmap service is readable again
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // NOTE(review): the comparison below subtracts the configured minimum
  // wait (seconds, as a double) from "now" and requires the result to
  // still be later than the recovery trigger time — i.e. we must have
  // been recovering for at least mon_stretch_recovery_min_wait seconds;
  // relies on utime_t's mixed utime/double arithmetic — confirm.
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // PG stats not readable yet; retry when they are
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced data alone does not block leaving recovery mode
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
14556
14557 void OSDMonitor::trigger_healthy_stretch_mode()
14558 {
14559 ceph_assert(is_writeable());
14560 stretch_recovery_triggered.set_from_double(0);
14561 pending_inc.change_stretch_mode = true;
14562 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14563 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14564 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
14565 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
14566 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14567 for (auto pgi : osdmap.pools) {
14568 if (pgi.second.peering_crush_bucket_count) {
14569 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14570 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
14571 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14572 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14573 newp.last_force_op_resend = pending_inc.epoch;
14574 }
14575 }
14576 propose_pending();
14577 }