]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import ceph 15.2.14
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 #define dout_subsys ceph_subsys_mon
95 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
96 static const string OSD_METADATA_PREFIX("osd_metadata");
97 static const string OSD_SNAP_PREFIX("osd_snap");
98
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
118 - note that the {removed,purged}_snap put the last snap in they key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
133 struct OSDMemCache : public PriorityCache::PriCache {
134 OSDMonitor *osdmon;
135 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
136 int64_t committed_bytes = 0;
137 double cache_ratio = 0;
138
139 OSDMemCache(OSDMonitor *m) : osdmon(m) {};
140
141 virtual uint64_t _get_used_bytes() const = 0;
142
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri, uint64_t total_cache) const {
145 int64_t assigned = get_cache_bytes(pri);
146
147 switch (pri) {
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1:
150 {
151 int64_t request = _get_used_bytes();
152 return (request > assigned) ? request - assigned : 0;
153 }
154 default:
155 break;
156 }
157 return -EOPNOTSUPP;
158 }
159
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
161 return cache_bytes[pri];
162 }
163
164 virtual int64_t get_cache_bytes() const {
165 int64_t total = 0;
166
167 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
168 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
169 total += get_cache_bytes(pri);
170 }
171 return total;
172 }
173
174 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
175 cache_bytes[pri] = bytes;
176 }
177 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
178 cache_bytes[pri] += bytes;
179 }
180 virtual int64_t commit_cache_size(uint64_t total_cache) {
181 committed_bytes = PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache);
183 return committed_bytes;
184 }
185 virtual int64_t get_committed_size() const {
186 return committed_bytes;
187 }
188 virtual double get_cache_ratio() const {
189 return cache_ratio;
190 }
191 virtual void set_cache_ratio(double ratio) {
192 cache_ratio = ratio;
193 }
194 virtual string get_cache_name() const = 0;
195 };
196
197 struct IncCache : public OSDMemCache {
198 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
199
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon->inc_osd_cache.get_bytes();
202 }
203
204 virtual string get_cache_name() const {
205 return "OSDMap Inc Cache";
206 }
207
208 uint64_t _get_num_osdmaps() const {
209 return osdmon->inc_osd_cache.get_size();
210 }
211 };
212
213 struct FullCache : public OSDMemCache {
214 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
215
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon->full_osd_cache.get_bytes();
218 }
219
220 virtual string get_cache_name() const {
221 return "OSDMap Full Cache";
222 }
223
224 uint64_t _get_num_osdmaps() const {
225 return osdmon->full_osd_cache.get_size();
226 }
227 };
228
// Shim instances created in the OSDMonitor constructor and later
// registered with the PriorityCache manager (register_cache_with_pcm).
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits on per-pool application metadata (count of applications,
// keys per application, and key/value length) — presumably enforced
// by the pool-application command handlers later in this file; the
// enforcement sites are outside this view.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
// Decide whether an entity may perform unmanaged-snapshot pool ops.
// Permitted when either:
//  1. its mon caps allow the "osd pool op unmanaged-snap" command
//     (restricted to the pool when pool_name is given; a null
//     pool_name — pool does not exist — requires an unrestricted cap), or
//  2. its OSD caps from the auth db grant write access to all pools or
//     to the named pool (see is_osd_writable()).
// Any failure to fetch/decode/parse the OSD cap data yields false.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
           CommandArgs{} /* pool DNE, require unrestricted cap */ :
           CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // mon caps were not sufficient; fall back to the OSD caps stored in
  // the auth database
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into basic grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
318
319 } // anonymous namespace
320
// Record that pg <ps> of this pool reported <last_epoch_clean>, and
// maintain the derived state: 'floor' (minimum lec across the pgs
// tracked so far; unreported slots hold 0) and 'next_missing' (first
// ps that has not reported yet).
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // the pg that defined the floor moved past it — rescan for the
      // new minimum (any still-unreported slot keeps the floor at 0)
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported (non-zero)
  // slots starting at this ps
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Drop all last-epoch-clean bookkeeping for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
// Route a pg's last_epoch_clean report to its pool's tracker, creating
// the per-pool entry on first report.
void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
{
  auto& lec = report_by_pool[pg.pool()];
  return lec.report(pg.ps(), last_epoch_clean);
}
362
// Compute the cluster-wide lower bound on last_epoch_clean across all
// pools in <latest>.  Returns 0 (meaning "no bound yet") if any pool
// has pgs that have not reported clean; otherwise the minimum floor,
// capped by the latest map epoch.
epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
{
  auto floor = latest.get_epoch();
  for (auto& pool : latest.get_pools()) {
    auto reported = report_by_pool.find(pool.first);
    if (reported == report_by_pool.end()) {
      // no pg of this pool has reported at all
      return 0;
    }
    if (reported->second.next_missing < pool.second.get_pg_num()) {
      // some pg of this pool is still missing a report
      return 0;
    }
    if (reported->second.floor < floor) {
      floor = reported->second.floor;
    }
  }
  return floor;
}
380
381 void LastEpochClean::dump(Formatter *f) const
382 {
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393 }
394
// Completion for the async osdmap->pg mapping job: once the mapping
// for <epoch> finishes, refresh the creating-pgs state and wake any
// pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the job was started, for the timing log line
  epoch_t epoch;  // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 presumably means the job was aborted/superseded — skip the
    // update in that case (TODO confirm against ParallelPGMapper)
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
412
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Debug-log prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD monitor service: size the inc/full osdmap LRU
// caches from mon_osd_cache_size, create the PriorityCache shims, and
// register as a config observer so cache settings can be retuned at
// runtime.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // non-fatal: fall back to the fixed-size caches without priority
    // cache management
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445 const char **OSDMonitor::get_tracked_conf_keys() const
446 {
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454 }
455
// Config-observer callback for the keys listed in
// get_tracked_conf_keys(): toggles cache autotuning and/or applies new
// memory-target / rocksdb cache sizes.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      // keep running with the previous cache settings
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
477
// Bring pcm-driven cache autotuning in line with the current
// mon_memory_autotune config value: tear the manager down when the
// option is disabled, or (re)register the caches when it is enabled.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
499
// Apply new mon_memory_target / rocksdb_cache_size values at runtime:
// validate them, recompute the cache ratios, and — when autotuning is
// active — push new min/target/max levels into pcm and rebalance.
// Returns -EINVAL (rolling the sizes back) on invalid input or if the
// ratios cannot be set.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember the previous sizes so we can roll back on failure
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation (same computation as
  // register_cache_with_pcm())
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
559
// Read the initial cache-related config values.  When autotuning is
// enabled this validates mon_memory_target / mon_osd_cache_size_min
// and seeds the inc/full LRU caches at the minimum size; otherwise the
// fixed-size caches from the constructor are left untouched.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the bluestore-style base/fragmentation knobs for sizing
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
584
// True if the pending incremental already stages a new crush map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
589
// The crush map of the last committed osdmap (ignores any pending
// changes).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
// Decode into <newcrush> the crush map as it will look after the
// pending increment: the staged pending crush if there is one,
// otherwise a copy of the current committed crush.
void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
{
  bufferlist bl;
  if (pending_inc.crush.length())
    bl = pending_inc.crush;
  else
    osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);

  auto p = bl.cbegin();
  newcrush.decode(p);
}
606
// Build osdmap epoch 1 for a brand-new cluster — from an osdmap stashed
// at mkfs time if present, otherwise a simple default map — set the
// initial flags/ratios/release requirements, and stage the encoded map
// as the pending full map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // an osdmap was supplied at mkfs time; adopt it (with our fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // full ratios may be configured as percentages (>1); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
666 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
667 {
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
670 s.insert(OSD_METADATA_PREFIX);
671 s.insert(OSD_SNAP_PREFIX);
672 }
673
// Bring the in-memory osdmap up to date with what paxos has committed:
// repair the on-disk 'full_latest' pointer if needed, fast-forward to
// the newest stashed full map, replay the remaining incrementals
// (persisting regenerated full maps as we go and resetting to the
// primary's canonical map on CRC mismatch), then refresh all state
// derived from the map (down/out tracking, subs, logger, failures,
// messenger features, mapping job).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a newer map is about to land, so any in-flight mapping job is stale
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stashed full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // fast-forward to the newest stashed full map instead of replaying
  // every incremental from our current epoch
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // load the persisted creating-pgs state, if any
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      // the mkfs seed map has been absorbed; drop it
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction periodically so it does not
    // grow without bound while catching up many epochs
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down -> pending-out tracking with the new map
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
// Create a PriorityCache manager and register the rocksdb kv cache and
// our inc/full osdmap caches with it so their memory is balanced
// against a common target.  Returns -EINVAL if the configured sizes
// are invalid, rocksdb exposes no priority cache, or the ratios cannot
// be computed.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
961
// Split the memory budget: the kv (rocksdb) cache gets
// rocksdb_cache_size / mon_memory_target, and the inc/full osdmap
// caches split the remainder evenly.  Fails with -EINVAL (restoring
// the previous kv ratio) if the kv cache alone would consume the whole
// target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
986
// Kick off the background osdmap->pg mapping job for the current
// epoch, canceling any job still running against an older map.  The
// C_UpdateCreatingPGs completion fires when the job finishes.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
                                       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
             << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    // no pools means nothing to map
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
1007
1008 void OSDMonitor::update_msgr_features()
1009 {
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025 }
1026
// Called when the service (re)enters the active state.  On the leader:
// log the osdmap and run the one-time pool priority conversion.  On a
// peon: re-dispatch any failure reports we were holding so they can be
// forwarded.  Both sides then start a fresh mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
1050
void OSDMonitor::on_restart()
{
  // Drop cached per-OSD report state; presumably these are timestamps of the
  // most recent reports, which would be stale after a restart — verify
  // against last_osd_report's readers.
  last_osd_report.clear();
}
1055
1056 void OSDMonitor::on_shutdown()
1057 {
1058 dout(10) << __func__ << dendl;
1059 if (mapping_job) {
1060 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1061 << dendl;
1062 mapping_job->abort();
1063 }
1064
1065 // discard failure info, waiters
1066 list<MonOpRequestRef> ls;
1067 take_all_failures(ls);
1068 ls.clear();
1069 }
1070
1071 void OSDMonitor::update_logger()
1072 {
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079 }
1080
1081 void OSDMonitor::create_pending()
1082 {
1083 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1084 pending_inc.fsid = mon->monmap->fsid;
1085 pending_metadata.clear();
1086 pending_metadata_rm.clear();
1087 pending_pseudo_purged_snaps.clear();
1088
1089 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1090
1091 // safety checks (this shouldn't really happen)
1092 {
1093 if (osdmap.backfillfull_ratio <= 0) {
1094 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1095 if (pending_inc.new_backfillfull_ratio > 1.0)
1096 pending_inc.new_backfillfull_ratio /= 100;
1097 dout(1) << __func__ << " setting backfillfull_ratio = "
1098 << pending_inc.new_backfillfull_ratio << dendl;
1099 }
1100 if (osdmap.full_ratio <= 0) {
1101 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1102 if (pending_inc.new_full_ratio > 1.0)
1103 pending_inc.new_full_ratio /= 100;
1104 dout(1) << __func__ << " setting full_ratio = "
1105 << pending_inc.new_full_ratio << dendl;
1106 }
1107 if (osdmap.nearfull_ratio <= 0) {
1108 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1109 if (pending_inc.new_nearfull_ratio > 1.0)
1110 pending_inc.new_nearfull_ratio /= 100;
1111 dout(1) << __func__ << " setting nearfull_ratio = "
1112 << pending_inc.new_nearfull_ratio << dendl;
1113 }
1114 }
1115
1116 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1117 // structure.
1118 if (osdmap.crush->has_legacy_rule_ids()) {
1119 CrushWrapper newcrush;
1120 _get_pending_crush(newcrush);
1121
1122 // First, for all pools, work out which rule they really used
1123 // by resolving ruleset to rule.
1124 for (const auto &i : osdmap.get_pools()) {
1125 const auto pool_id = i.first;
1126 const auto &pool = i.second;
1127 int new_rule_id = newcrush.find_rule(pool.crush_rule,
1128 pool.type, pool.size);
1129
1130 dout(1) << __func__ << " rewriting pool "
1131 << osdmap.get_pool_name(pool_id) << " crush ruleset "
1132 << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
1133 if (pending_inc.new_pools.count(pool_id) == 0) {
1134 pending_inc.new_pools[pool_id] = pool;
1135 }
1136 pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
1137 }
1138
1139 // Now, go ahead and renumber all the rules so that their
1140 // rule_id field corresponds to their position in the array
1141 auto old_to_new = newcrush.renumber_rules();
1142 dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
1143 for (const auto &i : old_to_new) {
1144 dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
1145 }
1146 pending_inc.crush.clear();
1147 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
1148 }
1149 }
1150
// Produce the updated creating_pgs_t that should be persisted alongside the
// pending incremental `inc`.  `nextmap` is the map that results from applying
// `inc`.  Works on a private copy of creating_pgs (taken under
// creating_pgs_lock) so the live structure is never mutated here; the caller
// is responsible for persisting/installing the returned value.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools newly created by this
    // incremental for pgs that still need to be created
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // drop creating state for pools deleted by this incremental
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue
  // drain the per-pool queue into concrete pg entries, but cap the number of
  // concurrently-creating pgs at mon_osd_max_creating_pgs (minimum 1)
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's [start, end) range as the cap allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	// existing entry: detect whether the osdmap -> nextmap transition
	// starts a new peering interval and, if so, record it in the pg's
	// history and remember the new up/acting sets
	std::stringstream debug;
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
// Pre-populate pg_temp mappings for pgs whose acting set will change with the
// pending incremental, so data access can continue with the old acting set
// while peering/backfill catches up.  Either primes every pg ("all") or only
// the pgs touched by interesting OSDs, whichever looks cheaper; work is
// bounded by mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // a new crush map can remap anything, so consider every pg
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  // newly-up osds can pull pgs from anywhere, so also consider every pg
  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds going down (state bit CEPH_OSD_UP flips an osd that is currently up)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  // weight reductions shed pgs from just that osd; a weight increase can
  // attract pgs from anywhere, which forces the "all" path
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs on the first interesting osd times the number
    // of interesting osds; if that exceeds the configured fraction of all
    // pgs, the full scan is cheaper
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // build the post-incremental map so we can compare old vs new mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // parallel scan over all pgs, abandoned if it exceeds the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // serial scan over only the interesting osds' pgs, checking the clock
    // every `chunk` pgs to honor the time budget
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  // already handled via another interesting osd
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1440
// Consider priming a pg_temp entry for `pgid` so that, after the pending
// incremental takes effect (producing map `next`), clients keep using the
// current acting set until the new one is ready.  Adds an entry to
// pending_inc.new_pg_temp when it would help; otherwise does nothing.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    // pg is still being created; nothing to preserve
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // mapping under the current (pre-incremental) osdmap
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the post-incremental map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  // an empty pg_temp vector is the "remove pg_temp" sentinel
  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace leaves any existing new_pg_temp entry for pgid untouched)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489 /**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494 {
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
1506 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
1520 if (g_conf()->mon_osd_prime_pg_temp) {
1521 maybe_prime_pg_temp();
1522 }
1523 }
1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
1541 ++p;
1542 }
1543 }
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
1548 if (i.first >= osdmap.max_osd) {
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
1552 break;
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
1557 break;
1558 }
1559 }
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1616 }
1617 }
1618
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
1628 &nearfull_pool_ids);
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
1669 }
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1704 }
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1748 }
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1793 }
1794 }
1795
1796 // min_compat_client?
1797 if (!tmp.require_min_compat_client) {
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
1800 << "required " << mv << dendl;
1801 mon->clog->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv;
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
1816 }
1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1818 }
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
1826 }
1827 }
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
1911 }
1912
1913 // tell me about it
1914 for (auto i = pending_inc.new_state.begin();
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
1918 if (s & CEPH_OSD_UP) {
1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
1920 // Reset laggy parameters if failure interval exceeds a threshold.
1921 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1922 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1923 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1924 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1925 set_default_laggy_params(i->first);
1926 }
1927 }
1928 }
1929 if (s & CEPH_OSD_EXISTS)
1930 dout(2) << " osd." << i->first << " DNE" << dendl;
1931 }
1932 for (auto i = pending_inc.new_up_client.begin();
1933 i != pending_inc.new_up_client.end();
1934 ++i) {
1935 //FIXME: insert cluster addresses too
1936 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1937 }
1938 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1939 i != pending_inc.new_weight.end();
1940 ++i) {
1941 if (i->second == CEPH_OSD_OUT) {
1942 dout(2) << " osd." << i->first << " OUT" << dendl;
1943 } else if (i->second == CEPH_OSD_IN) {
1944 dout(2) << " osd." << i->first << " IN" << dendl;
1945 } else {
1946 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1947 }
1948 }
1949
1950 // features for osdmap and its incremental
1951 uint64_t features;
1952
1953 // encode full map and determine its crc
1954 OSDMap tmp;
1955 {
1956 tmp.deepish_copy_from(osdmap);
1957 tmp.apply_incremental(pending_inc);
1958
1959 // determine appropriate features
1960 features = tmp.get_encoding_features();
1961 dout(10) << __func__ << " encoding full map with "
1962 << tmp.require_osd_release
1963 << " features " << features << dendl;
1964
1965 // the features should be a subset of the mon quorum's features!
1966 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1967
1968 bufferlist fullbl;
1969 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1970 pending_inc.full_crc = tmp.get_crc();
1971
1972 // include full map in the txn. note that old monitors will
1973 // overwrite this. new ones will now skip the local full map
1974 // encode and reload from this.
1975 put_version_full(t, pending_inc.epoch, fullbl);
1976 }
1977
1978 // encode
1979 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1980 bufferlist bl;
1981 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1982
1983 dout(20) << " full_crc " << tmp.get_crc()
1984 << " inc_crc " << pending_inc.inc_crc << dendl;
1985
1986 /* put everything in the transaction */
1987 put_version(t, pending_inc.epoch, bl);
1988 put_last_committed(t, pending_inc.epoch);
1989
1990 // metadata, too!
1991 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1992 p != pending_metadata.end();
1993 ++p)
1994 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1995 for (set<int>::iterator p = pending_metadata_rm.begin();
1996 p != pending_metadata_rm.end();
1997 ++p)
1998 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1999 pending_metadata.clear();
2000 pending_metadata_rm.clear();
2001
2002 // purged_snaps
2003 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2004 !pending_inc.new_purged_snaps.empty()) {
2005 // all snaps purged this epoch (across all pools)
2006 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2007 bufferlist v;
2008 encode(pending_inc.new_purged_snaps, v);
2009 t->put(OSD_SNAP_PREFIX, k, v);
2010 }
2011 for (auto& i : pending_inc.new_purged_snaps) {
2012 for (auto q = i.second.begin();
2013 q != i.second.end();
2014 ++q) {
2015 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2016 pending_inc.epoch,
2017 t);
2018 }
2019 }
2020 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2021 for (auto snap : snaps) {
2022 insert_purged_snap_update(pool, snap, snap + 1,
2023 pending_inc.epoch,
2024 t);
2025 }
2026 }
2027
2028 // health
2029 health_check_map_t next;
2030 tmp.check_health(cct, &next);
2031 encode_health(next, t);
2032 }
2033
2034 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2035 {
2036 bufferlist bl;
2037 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2038 if (r < 0)
2039 return r;
2040 try {
2041 auto p = bl.cbegin();
2042 decode(m, p);
2043 }
2044 catch (buffer::error& e) {
2045 if (err)
2046 *err << "osd." << osd << " metadata is corrupt";
2047 return -EIO;
2048 }
2049 return 0;
2050 }
2051
2052 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2053 {
2054 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2055 if (osdmap.is_up(osd)) {
2056 map<string,string> meta;
2057 load_metadata(osd, meta, nullptr);
2058 auto p = meta.find(field);
2059 if (p == meta.end()) {
2060 (*out)["unknown"]++;
2061 } else {
2062 (*out)[p->second]++;
2063 }
2064 }
2065 }
2066 }
2067
2068 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2069 {
2070 map<string,int> by_val;
2071 count_metadata(field, &by_val);
2072 f->open_object_section(field.c_str());
2073 for (auto& p : by_val) {
2074 f->dump_int(p.first.c_str(), p.second);
2075 }
2076 f->close_section();
2077 }
2078
2079 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2080 {
2081 map<string, string> metadata;
2082 int r = load_metadata(osd, metadata, nullptr);
2083 if (r < 0)
2084 return r;
2085
2086 auto it = metadata.find("osd_objectstore");
2087 if (it == metadata.end())
2088 return -ENOENT;
2089 *type = it->second;
2090 return 0;
2091 }
2092
2093 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2094 const pg_pool_t &pool,
2095 ostream *err)
2096 {
2097 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2098 // since filestore osds could always join the pool later
2099 set<int> checked_osds;
2100 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2101 vector<int> up, acting;
2102 pg_t pgid(ps, pool_id);
2103 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2104 for (int osd : up) {
2105 if (checked_osds.find(osd) != checked_osds.end())
2106 continue;
2107 string objectstore_type;
2108 int r = get_osd_objectstore_type(osd, &objectstore_type);
2109 // allow with missing metadata, e.g. due to an osd never booting yet
2110 if (r < 0 || objectstore_type == "bluestore") {
2111 checked_osds.insert(osd);
2112 continue;
2113 }
2114 *err << "osd." << osd << " uses " << objectstore_type;
2115 return false;
2116 }
2117 }
2118 return true;
2119 }
2120
2121 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2122 {
2123 map<string,string> m;
2124 if (int r = load_metadata(osd, m, err))
2125 return r;
2126 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2127 f->dump_string(p->first.c_str(), p->second);
2128 return 0;
2129 }
2130
2131 void OSDMonitor::print_nodes(Formatter *f)
2132 {
2133 // group OSDs by their hosts
2134 map<string, list<int> > osds; // hostname => osd
2135 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2136 map<string, string> m;
2137 if (load_metadata(osd, m, NULL)) {
2138 continue;
2139 }
2140 map<string, string>::iterator hostname = m.find("hostname");
2141 if (hostname == m.end()) {
2142 // not likely though
2143 continue;
2144 }
2145 osds[hostname->second].push_back(osd);
2146 }
2147
2148 dump_services(f, osds, "osd");
2149 }
2150
void OSDMonitor::share_map_with_random_osd()
{
  // Proactively push the latest incremental map to one randomly-chosen
  // up osd (with an open session) so a newly committed epoch starts
  // propagating through the cluster.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
           << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon->get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2177
version_t OSDMonitor::get_trim_to() const
{
  // Return the lowest osdmap epoch we can safely trim to, or 0 when no
  // trimming should happen right now (no quorum, pgs still being created,
  // trimming blocked by debug config, or not enough epochs on hand).
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // pending pg creations may still reference older maps; don't trim.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
               " ('mon_debug_block_osdmap_trim' set to 'true')"
            << dendl;
    return 0;
  }

  {
    // never trim past the oldest epoch a pg may still be dirty at
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // operator override used to force trimming
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs committed maps around
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only report a trim target that actually advances past what's stored
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2220
2221 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2222 {
2223 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2224 // also scan osd epochs
2225 // don't trim past the oldest reported osd epoch
2226 for (auto& osd_epoch : osd_epochs) {
2227 if (osd_epoch.second < floor &&
2228 osdmap.is_in(osd_epoch.first)) {
2229 floor = osd_epoch.second;
2230 }
2231 }
2232 return floor;
2233 }
2234
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
                                   version_t first)
{
  // When trimming maps, make sure the new oldest retained epoch (`first`)
  // is stored as a *full* map, so readers never need an older base map.
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if pruning is underway and this trim advanced past pinned maps, let
  // the prune machinery update its on-disk manifest in the same txn.
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2248
2249
2250 /* full osdmap prune
2251 *
2252 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2253 */
2254
2255 void OSDMonitor::load_osdmap_manifest()
2256 {
2257 bool store_has_manifest =
2258 mon->store->exists(get_service_name(), "osdmap_manifest");
2259
2260 if (!store_has_manifest) {
2261 if (!has_osdmap_manifest) {
2262 return;
2263 }
2264
2265 dout(20) << __func__
2266 << " dropping osdmap manifest from memory." << dendl;
2267 osdmap_manifest = osdmap_manifest_t();
2268 has_osdmap_manifest = false;
2269 return;
2270 }
2271
2272 dout(20) << __func__
2273 << " osdmap manifest detected in store; reload." << dendl;
2274
2275 bufferlist manifest_bl;
2276 int r = get_value("osdmap_manifest", manifest_bl);
2277 if (r < 0) {
2278 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2279 ceph_abort_msg("error reading manifest");
2280 }
2281 osdmap_manifest.decode(manifest_bl);
2282 has_osdmap_manifest = true;
2283
2284 dout(10) << __func__ << " store osdmap manifest pinned ("
2285 << osdmap_manifest.get_first_pinned()
2286 << " .. "
2287 << osdmap_manifest.get_last_pinned()
2288 << ")"
2289 << dendl;
2290 }
2291
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass should run now.  Pruning only
  // makes sense once we hold comfortably more epochs than the minimum we
  // must retain, and only in whole `prune_interval`-sized steps.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // newest epoch we would be allowed to pin; everything above it stays as
  // full maps (only used after the first guard below has passed)
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // previous prune passes already covered everything prunable
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one more full interval fits before last_to_pin
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2351
2352 void OSDMonitor::_prune_update_trimmed(
2353 MonitorDBStore::TransactionRef tx,
2354 version_t first)
2355 {
2356 dout(10) << __func__
2357 << " first " << first
2358 << " last_pinned " << osdmap_manifest.get_last_pinned()
2359 << " last_pinned " << osdmap_manifest.get_last_pinned()
2360 << dendl;
2361
2362 osdmap_manifest_t manifest = osdmap_manifest;
2363
2364 if (!manifest.is_pinned(first)) {
2365 manifest.pin(first);
2366 }
2367
2368 set<version_t>::iterator p_end = manifest.pinned.find(first);
2369 set<version_t>::iterator p = manifest.pinned.begin();
2370 manifest.pinned.erase(p, p_end);
2371 ceph_assert(manifest.get_first_pinned() == first);
2372
2373 if (manifest.get_last_pinned() == first+1 ||
2374 manifest.pinned.size() == 1) {
2375 // we reached the end of the line, as pinned maps go; clean up our
2376 // manifest, and let `should_prune()` decide whether we should prune
2377 // again.
2378 tx->erase(get_service_name(), "osdmap_manifest");
2379 return;
2380 }
2381
2382 bufferlist bl;
2383 manifest.encode(bl);
2384 tx->put(get_service_name(), "osdmap_manifest", bl);
2385 }
2386
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed `manifest` with the first version to pin for this prune pass:
  // the first committed epoch when starting fresh, or the last pinned
  // epoch when resuming an on-going prune.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
             << " first_pinned " << osdmap_manifest.get_first_pinned()
             << " last_pinned " << osdmap_manifest.get_last_pinned()
             << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2421
2422 bool OSDMonitor::_prune_sanitize_options() const
2423 {
2424 uint64_t prune_interval =
2425 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2426 uint64_t prune_min =
2427 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2428 uint64_t txsize =
2429 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2430
2431 bool r = true;
2432
2433 if (prune_interval == 0) {
2434 derr << __func__
2435 << " prune is enabled BUT prune interval is zero; abort."
2436 << dendl;
2437 r = false;
2438 } else if (prune_interval == 1) {
2439 derr << __func__
2440 << " prune interval is equal to one, which essentially means"
2441 " no pruning; abort."
2442 << dendl;
2443 r = false;
2444 }
2445 if (prune_min == 0) {
2446 derr << __func__
2447 << " prune is enabled BUT prune min is zero; abort."
2448 << dendl;
2449 r = false;
2450 }
2451 if (prune_interval > prune_min) {
2452 derr << __func__
2453 << " impossible to ascertain proper prune interval because"
2454 << " it is greater than the minimum prune epochs"
2455 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2456 << dendl;
2457 r = false;
2458 }
2459
2460 if (txsize < prune_interval - 1) {
2461 derr << __func__
2462 << "'mon_osdmap_full_prune_txsize' (" << txsize
2463 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2464 << "); abort." << dendl;
2465 r = false;
2466 }
2467 return r;
2468 }
2469
2470 bool OSDMonitor::is_prune_enabled() const {
2471 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2472 }
2473
2474 bool OSDMonitor::is_prune_supported() const {
2475 return mon->get_required_mon_features().contains_any(
2476 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2477 }
2478
2479 /** do_prune
2480 *
2481 * @returns true if has side-effects; false otherwise.
2482 */
2483 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2484 {
2485 bool enabled = is_prune_enabled();
2486
2487 dout(1) << __func__ << " osdmap full prune "
2488 << ( enabled ? "enabled" : "disabled")
2489 << dendl;
2490
2491 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2492 return false;
2493 }
2494
2495 // we are beyond the minimum prune versions, we need to remove maps because
2496 // otherwise the store will grow unbounded and we may end up having issues
2497 // with available disk space or store hangs.
2498
2499 // we will not pin all versions. We will leave a buffer number of versions.
2500 // this allows us the monitor to trim maps without caring too much about
2501 // pinned maps, and then allow us to use another ceph-mon without these
2502 // capabilities, without having to repair the store.
2503
2504 osdmap_manifest_t manifest = osdmap_manifest;
2505
2506 version_t first = get_first_committed();
2507 version_t last = get_last_committed();
2508
2509 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2510 version_t last_pinned = manifest.get_last_pinned();
2511 uint64_t prune_interval =
2512 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2513 uint64_t txsize =
2514 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2515
2516 prune_init(manifest);
2517
2518 // we need to get rid of some osdmaps
2519
2520 dout(5) << __func__
2521 << " lc (" << first << " .. " << last << ")"
2522 << " last_pinned " << last_pinned
2523 << " interval " << prune_interval
2524 << " last_to_pin " << last_to_pin
2525 << dendl;
2526
2527 // We will be erasing maps as we go.
2528 //
2529 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2530 //
2531 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2532 // we stop pruning. We could prune the maps between `next_to_pin` and
2533 // `last_to_pin`, but by not doing it we end up with neater pruned
2534 // intervals, aligned with `prune_interval`. Besides, this should not be a
2535 // problem as long as `prune_interval` is set to a sane value, instead of
2536 // hundreds or thousands of maps.
2537
2538 auto map_exists = [this](version_t v) {
2539 string k = mon->store->combine_strings("full", v);
2540 return mon->store->exists(get_service_name(), k);
2541 };
2542
2543 // 'interval' represents the number of maps from the last pinned
2544 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2545 // version 11 next; all intermediate versions will be removed.
2546 //
2547 // 'txsize' represents the maximum number of versions we'll be removing in
2548 // this iteration. If 'txsize' is large enough to perform multiple passes
2549 // pinning and removing maps, we will do so; if not, we'll do at least one
2550 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2551 // ensure that we never go *over* the maximum.
2552
2553 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2554 uint64_t removal_interval = prune_interval - 1;
2555
2556 if (txsize < removal_interval) {
2557 dout(5) << __func__
2558 << " setting txsize to removal interval size ("
2559 << removal_interval << " versions"
2560 << dendl;
2561 txsize = removal_interval;
2562 }
2563 ceph_assert(removal_interval > 0);
2564
2565 uint64_t num_pruned = 0;
2566 while (num_pruned + removal_interval <= txsize) {
2567 last_pinned = manifest.get_last_pinned();
2568
2569 if (last_pinned + prune_interval > last_to_pin) {
2570 break;
2571 }
2572 ceph_assert(last_pinned < last_to_pin);
2573
2574 version_t next_pinned = last_pinned + prune_interval;
2575 ceph_assert(next_pinned <= last_to_pin);
2576 manifest.pin(next_pinned);
2577
2578 dout(20) << __func__
2579 << " last_pinned " << last_pinned
2580 << " next_pinned " << next_pinned
2581 << " num_pruned " << num_pruned
2582 << " removal interval (" << (last_pinned+1)
2583 << ".." << (next_pinned-1) << ")"
2584 << " txsize " << txsize << dendl;
2585
2586 ceph_assert(map_exists(last_pinned));
2587 ceph_assert(map_exists(next_pinned));
2588
2589 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2590 ceph_assert(!manifest.is_pinned(v));
2591
2592 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2593 string full_key = mon->store->combine_strings("full", v);
2594 tx->erase(get_service_name(), full_key);
2595 ++num_pruned;
2596 }
2597 }
2598
2599 ceph_assert(num_pruned > 0);
2600
2601 bufferlist bl;
2602 manifest.encode(bl);
2603 tx->put(get_service_name(), "osdmap_manifest", bl);
2604
2605 return true;
2606 }
2607
2608
2609 // -------------
2610
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // Read-side dispatch for incoming messages.  Returns true when the
  // message was fully handled here; false when it needs a map update and
  // must proceed to prepare_update().
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command payload: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates: these may be answered from the current map without
    // proposing a change
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // a message type we don't handle reaching here is a routing bug
    ceph_abort();
    return true;
  }
}
2666
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Write-side dispatch: apply the message's effect to the pending
  // incremental.  Returns true if a proposal should be triggered.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command payload: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // message types must be filtered by preprocess_query() first
    ceph_abort();
  }

  return false;
}
2718
bool OSDMonitor::should_propose(double& delay)
{
  // Decide whether the pending incremental should be proposed now.
  // May set `delay` to 0 to request an immediate proposal.
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?  only once a weight is queued for every osd.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  // otherwise defer to the generic paxos service policy
  return PaxosService::should_propose(delay);
}
2739
2740
2741
2742 // ---------------------------
2743 // READs
2744
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an MMonGetOSDMap request with the requested ranges of full and
  // incremental maps, bounded by both a map-count and a byte-size
  // throttle so the reply message stays a manageable size.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the peer's connection features when known; fall back to
  // the quorum features for anonymous connections
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps, clamped to what we actually have committed
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // incremental maps, same throttles continue to apply
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2781
2782
2783 // ---------------------------
2784 // UPDATEs
2785
2786 // failure --
2787
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // Validate the sender of an osd-originated message.  Returns true when
  // the message should be REJECTED: no session, missing MON_CAP_X, or an
  // fsid that does not match this cluster.  Returns false when it's ok.
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
            << session->caps << dendl;
    return true;
  }
  if (fsid != mon->monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
            << " != " << mon->monmap->fsid << dendl;
    return true;
  }
  return false;
}
2805
2806
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Filter an incoming MOSDFailure report.  Returns true when it was
  // fully handled here (invalid source, stale, or duplicate); false when
  // it is a new, valid failure report that must go to prepare_failure().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown or out of date; send it newer maps
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    // target is already down; nothing to do, but catch the reporter up
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // reported address doesn't match the map; the report is stale
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    // target came back up since the report's epoch; report is old news
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // e.g. the nodown flag is set; silently drop the report
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2878
// Completion context that acknowledges an MOSDMarkMeDown request once the
// corresponding map update is committed (r == 0), or re-dispatches the op
// when it must be retried (r == -EAGAIN).  Any other result aborts.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // echo the request back as the acknowledgment
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2907
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  // Filter an osd's request to be marked down.  Returns true when handled
  // here (invalid/stale request — acked if requested); false when the
  // request is valid and must proceed to prepare_mark_me_down().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    // requester is unknown, already down, or stale; send newer maps
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even rejected requests get an ack if the sender asked for one, so it
  // does not block waiting forever
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2946
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  // Queue a state change marking the requesting osd down.  Note that
  // new_state holds flag bits to flip: setting CEPH_OSD_UP here results
  // in the osd being reported DOWN when the incremental is encoded.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_down() already validated these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // ack only once the proposal has committed
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
2962
2963 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
2964 {
2965 op->mark_osdmon_event(__func__);
2966 auto m = op->get_req<MOSDMarkMeDead>();
2967 int from = m->target_osd;
2968
2969 // check permissions
2970 if (check_source(op, m->fsid)) {
2971 mon->no_reply(op);
2972 return true;
2973 }
2974
2975 // first, verify the reporting host is valid
2976 if (!m->get_orig_source().is_osd()) {
2977 mon->no_reply(op);
2978 return true;
2979 }
2980
2981 if (!osdmap.exists(from) ||
2982 !osdmap.is_down(from)) {
2983 dout(5) << __func__ << " from nonexistent or up osd." << from
2984 << ", ignoring" << dendl;
2985 send_incremental(op, m->get_epoch()+1);
2986 mon->no_reply(op);
2987 return true;
2988 }
2989
2990 return false;
2991 }
2992
// Record the osd's self-declared "dead" epoch in its extended info and
// propose it.  No reply is sent on success (the osd is shutting down).
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees the osd is already down
  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
		    << m->get_epoch();
  // copy-on-write the xinfo into the pending increment if this epoch has
  // not touched it yet, then stamp the dead epoch
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon->no_reply(op); // ignore on success
	}
      }
      ));
  return true;
}
3018
3019 bool OSDMonitor::can_mark_down(int i)
3020 {
3021 if (osdmap.is_nodown(i)) {
3022 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3023 << "will not mark it down" << dendl;
3024 return false;
3025 }
3026
3027 int num_osds = osdmap.get_num_osds();
3028 if (num_osds == 0) {
3029 dout(5) << __func__ << " no osds" << dendl;
3030 return false;
3031 }
3032 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3033 float up_ratio = (float)up / (float)num_osds;
3034 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3035 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3036 << g_conf()->mon_osd_min_up_ratio
3037 << ", will not mark osd." << i << " down" << dendl;
3038 return false;
3039 }
3040 return true;
3041 }
3042
3043 bool OSDMonitor::can_mark_up(int i)
3044 {
3045 if (osdmap.is_noup(i)) {
3046 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3047 << "will not mark it up" << dendl;
3048 return false;
3049 }
3050
3051 return true;
3052 }
3053
3054 /**
3055 * @note the parameter @p i apparently only exists here so we can output the
3056 * osd's id on messages.
3057 */
3058 bool OSDMonitor::can_mark_out(int i)
3059 {
3060 if (osdmap.is_noout(i)) {
3061 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3062 << "will not mark it out" << dendl;
3063 return false;
3064 }
3065
3066 int num_osds = osdmap.get_num_osds();
3067 if (num_osds == 0) {
3068 dout(5) << __func__ << " no osds" << dendl;
3069 return false;
3070 }
3071 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3072 float in_ratio = (float)in / (float)num_osds;
3073 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3074 if (i >= 0)
3075 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3076 << g_conf()->mon_osd_min_in_ratio
3077 << ", will not mark osd." << i << " out" << dendl;
3078 else
3079 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3080 << g_conf()->mon_osd_min_in_ratio
3081 << ", will not mark osds out" << dendl;
3082 return false;
3083 }
3084
3085 return true;
3086 }
3087
3088 bool OSDMonitor::can_mark_in(int i)
3089 {
3090 if (osdmap.is_noin(i)) {
3091 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3092 << "will not mark it in" << dendl;
3093 return false;
3094 }
3095
3096 return true;
3097 }
3098
// Walk all outstanding failure reports.  Any osd whose reports now meet
// the failure criteria is marked down in pending_inc; records that have
// aged out without succeeding are dropped.  Returns true if at least one
// failure was newly recorded (i.e. the pending map should be proposed).
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // record outlived its usefulness; reporters failed to cancel it
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3120
3121 utime_t OSDMonitor::get_grace_time(utime_t now,
3122 int target_osd,
3123 failure_info_t& fi) const
3124 {
3125 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3126 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3127 return orig_grace;
3128 }
3129 utime_t grace = orig_grace;
3130 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3131 double decay_k = ::log(.5) / halflife;
3132
3133 // scale grace period based on historical probability of 'lagginess'
3134 // (false positive failures due to slowness).
3135 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3136 const utime_t failed_for = now - fi.get_failed_since();
3137 double decay = exp((double)failed_for * decay_k);
3138 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3139 << " failed_for " << failed_for << " decay " << decay << dendl;
3140 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3141 grace += my_grace;
3142
3143 // consider the peers reporting a failure a proxy for a potential
3144 // 'subcluster' over the overall cluster that is similarly
3145 // laggy. this is clearly not true in all cases, but will sometimes
3146 // help us localize the grace correction to a subset of the system
3147 // (say, a rack with a bad switch) that is unhappy.
3148 double peer_grace = 0;
3149 for (auto& [reporter, report] : fi.reporters) {
3150 if (osdmap.exists(reporter)) {
3151 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3152 utime_t elapsed = now - xi.down_stamp;
3153 double decay = exp((double)elapsed * decay_k);
3154 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3155 }
3156 }
3157 peer_grace /= (double)fi.reporters.size();
3158 grace += peer_grace;
3159 dout(10) << " osd." << target_osd << " has "
3160 << fi.reporters.size() << " reporters, "
3161 << grace << " grace (" << orig_grace << " + " << my_grace
3162 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3163 << dendl;
3164
3165 return grace;
3166 }
3167
3168 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3169 {
3170 // already pending failure?
3171 if (pending_inc.new_state.count(target_osd) &&
3172 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3173 dout(10) << " already pending failure" << dendl;
3174 return true;
3175 }
3176
3177 set<string> reporters_by_subtree;
3178 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3179 ceph_assert(fi.reporters.size());
3180 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3181 // get the parent bucket whose type matches with "reporter_subtree_level".
3182 // fall back to OSD if the level doesn't exist.
3183 if (osdmap.exists(p->first)) {
3184 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3185 if (auto iter = reporter_loc.find(reporter_subtree_level);
3186 iter == reporter_loc.end()) {
3187 reporters_by_subtree.insert("osd." + to_string(p->first));
3188 } else {
3189 reporters_by_subtree.insert(iter->second);
3190 }
3191 ++p;
3192 } else {
3193 fi.cancel_report(p->first);;
3194 p = fi.reporters.erase(p);
3195 }
3196 }
3197 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3198 return false;
3199 }
3200 const utime_t failed_for = now - fi.get_failed_since();
3201 const utime_t grace = get_grace_time(now, target_osd, fi);
3202 if (failed_for >= grace) {
3203 dout(1) << " we have enough reporters to mark osd." << target_osd
3204 << " down" << dendl;
3205 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3206
3207 mon->clog->info() << "osd." << target_osd << " failed ("
3208 << osdmap.crush->get_full_location_ordered_string(
3209 target_osd)
3210 << ") ("
3211 << (int)reporters_by_subtree.size()
3212 << " reporters from different "
3213 << reporter_subtree_level << " after "
3214 << failed_for << " >= grace " << grace << ")";
3215 return true;
3216 }
3217 return false;
3218 }
3219
3220 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3221 {
3222 // if it takes too long to either cancel the report to mark the osd down,
3223 // some reporters must have failed to cancel their reports. let's just
3224 // forget these reports.
3225 const utime_t failed_for = now - fi.get_failed_since();
3226 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3227 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3228 return failed_for >= (heartbeat_grace + heartbeat_stale);
3229 }
3230
3231 void OSDMonitor::force_failure(int target_osd, int by)
3232 {
3233 // already pending failure?
3234 if (pending_inc.new_state.count(target_osd) &&
3235 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3236 dout(10) << " already pending failure" << dendl;
3237 return;
3238 }
3239
3240 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3241 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3242 if (!pending_inc.new_xinfo.count(target_osd)) {
3243 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3244 }
3245 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3246
3247 mon->clog->info() << "osd." << target_osd << " failed ("
3248 << osdmap.crush->get_full_location_ordered_string(target_osd)
3249 << ") (connection refused reported by osd." << by << ")";
3250 return;
3251 }
3252
// Apply an MOSDFailure: either register/evaluate a failure report for
// the target osd, or cancel a previously filed report.  Returns true
// when pending_inc was changed and a proposal is needed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess (elsewhere) already verified target is up at these addrs
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  // no direct reply; reporters learn the outcome via the next map
  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time: receipt time minus how long the reporter
    // says the target had already been unresponsive
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // connection refused: skip grace/reporter thresholds entirely
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    // may mark the osd down right away if thresholds are already met
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3312
3313 void OSDMonitor::process_failures()
3314 {
3315 map<int,failure_info_t>::iterator p = failure_info.begin();
3316 while (p != failure_info.end()) {
3317 if (osdmap.is_up(p->first)) {
3318 ++p;
3319 } else {
3320 dout(10) << "process_failures osd." << p->first << dendl;
3321 list<MonOpRequestRef> ls;
3322 p->second.take_report_messages(ls);
3323 failure_info.erase(p++);
3324
3325 while (!ls.empty()) {
3326 MonOpRequestRef o = ls.front();
3327 if (o) {
3328 o->mark_event(__func__);
3329 MOSDFailure *m = o->get_req<MOSDFailure>();
3330 send_latest(o, m->get_epoch());
3331 mon->no_reply(o);
3332 }
3333 ls.pop_front();
3334 }
3335 }
3336 }
3337 }
3338
3339 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3340 {
3341 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3342
3343 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3344 p != failure_info.end();
3345 ++p) {
3346 p->second.take_report_messages(ls);
3347 }
3348 failure_info.clear();
3349 }
3350
3351 int OSDMonitor::get_grace_interval_threshold()
3352 {
3353 int halflife = g_conf()->mon_osd_laggy_halflife;
3354 // Scale the halflife period (default: 1_hr) by
3355 // a factor (48) to calculate the threshold.
3356 int grace_threshold_factor = 48;
3357 return halflife * grace_threshold_factor;
3358 }
3359
3360 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3361 {
3362 int grace_interval_threshold_secs = get_grace_interval_threshold();
3363 if (last_failed_interval > grace_interval_threshold_secs) {
3364 dout(1) << " last_failed_interval " << last_failed_interval
3365 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3366 << dendl;
3367 return true;
3368 }
3369 return false;
3370 }
3371
3372 void OSDMonitor::set_default_laggy_params(int target_osd)
3373 {
3374 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3375 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3376 }
3377 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3378 xi.down_stamp = pending_inc.modified;
3379 xi.laggy_probability = 0.0;
3380 xi.laggy_interval = 0;
3381 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3382 }
3383
3384
3385 // boot --
3386
// Filter an MOSDBoot without changing the map.  Drops boots with bad
// caps, wrong cluster fsid, blank addresses, missing required features,
// or feature/release skew; short-circuits duplicate or stale boots by
// replying directly.  Returns true when fully handled here; false to
// fall through to prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?  (same osd id, same public and cluster addrs)
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // uuid mismatch: a different daemon claims this osd id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up_from; just send maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3509
// Apply a validated MOSDBoot.  If the osd is still up under its old
// identity, first queue a mark-down and retry the boot after the
// proposal; otherwise stage the osd's addresses, uuid, metadata, clean
// interval, laggy statistics, features, and (possibly) in-weight in
// pending_inc and reply once committed.  Always returns true.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once the pending increment applies (new_state XORs)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the mark-down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: decay on a clean boot (boot_epoch == 0),
    // otherwise fold the downtime into the laggy interval/probability
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval =  g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight it had before being auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3664
3665 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3666 {
3667 op->mark_osdmon_event(__func__);
3668 auto m = op->get_req<MOSDBoot>();
3669 dout(7) << "_booted " << m->get_orig_source_inst()
3670 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3671
3672 if (logit) {
3673 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3674 << " boot";
3675 }
3676
3677 send_latest(op, m->sb.current_epoch+1);
3678 }
3679
3680
3681 // -------------
3682 // full
3683
// Filter an MOSDFull (osd reporting its nearfull/backfillfull/full
// state).  Drops messages with bad caps or from a stale osd instance;
// replies directly when the map already reflects the requested state.
// Returns true when handled here; false to proceed to prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // the sender's addrs must match the osd's most recent (if down) or
  // current (if up) addrs, otherwise this is from an older incarnation
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change: current fullness bits already equal the request
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3734
// Stage the osd's requested fullness bits in pending_inc.  new_state
// entries are XOR masks against the committed map, so the increment is
// computed as (current & mask) ^ wanted.  Replies with the map once the
// proposal commits.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective fullness bits once the pending increment applies
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending fullness flips; we recompute them below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    // a prior proposal already queued this change; just wait for it
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3772
3773 // -------------
3774 // alive
3775
// Filter an MOSDAlive (osd requesting its up_thru be advanced).  Drops
// messages with bad caps or from a down/stale instance; replies directly
// when up_thru is already high enough.  Returns true when handled here;
// false to proceed to prepare_alive().
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3814
// Advance the osd's up_thru in the pending increment and reply with the
// map once the proposal commits.
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  if (0) {  // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version); // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3832
3833 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3834 {
3835 op->mark_osdmon_event(__func__);
3836 dout(7) << "_reply_map " << e
3837 << " from " << op->get_req()->get_orig_source_inst()
3838 << dendl;
3839 send_latest(op, e);
3840 }
3841
3842 // pg_created
3843 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3844 {
3845 op->mark_osdmon_event(__func__);
3846 auto m = op->get_req<MOSDPGCreated>();
3847 dout(10) << __func__ << " " << *m << dendl;
3848 auto session = op->get_session();
3849 mon->no_reply(op);
3850 if (!session) {
3851 dout(10) << __func__ << ": no monitor session!" << dendl;
3852 return true;
3853 }
3854 if (!session->is_capable("osd", MON_CAP_X)) {
3855 derr << __func__ << " received from entity "
3856 << "with insufficient privileges " << session->caps << dendl;
3857 return true;
3858 }
3859 // always forward the "created!" to the leader
3860 return false;
3861 }
3862
3863 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3864 {
3865 op->mark_osdmon_event(__func__);
3866 auto m = op->get_req<MOSDPGCreated>();
3867 dout(10) << __func__ << " " << *m << dendl;
3868 auto src = m->get_orig_source();
3869 auto from = src.num();
3870 if (!src.is_osd() ||
3871 !mon->osdmon()->osdmap.is_up(from) ||
3872 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3873 m->get_orig_source_addrs())) {
3874 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3875 return false;
3876 }
3877 pending_created_pgs.push_back(m->pgid);
3878 return true;
3879 }
3880
// Filter an MOSDPGReadyToMerge.  Drops messages with bad caps, for a
// nonexistent pool, or whose pgid does not match the pool's current
// pg_num / pg_num_pending merge point (stale or racing notifications).
// Returns true when dropped here; false to proceed to
// prepare_pg_ready_to_merge().
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge already happened (pg_num shrank past this pg)
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the highest pg (pg_num - 1) may be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // merge for this pg has not actually been queued (pg_num_pending)
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3920
// Apply a pg merge-readiness report to the pending map: either commit
// the merge (drop the source pg) or back it off, and always force
// pre-nautilus clients to resend ops.  Always returns true (a proposal
// is needed).
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // work on the pending copy of the pool if one exists, else the
  // committed one
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    // pg_num changed underneath us; retry once this proposal lands
    dout(10) << __func__
             << " race with concurrent pg_num[_pending] update, will retry"
             << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // commit the merge: drop the source pg and record the interval
    // bounds reported by the OSD
    p.dec_pg_num(m->pgid,
                 pending_inc.epoch,
                 m->source_version,
                 m->target_version,
                 m->last_epoch_started,
                 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: with the configured probability, bounce pg_num back
  // up via a self-issued "osd pool set" command to abort the merge
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
               osdmap.get_pool_name(m->pgid.pool()) +
               "\", \"var\": \"pg_num_actual\", \"val\": \"" +
               stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3978
3979
3980 // -------------
3981 // pg_temp changes
3982
// Decide whether an MOSDPGTemp message requires a paxos proposal.
// Returns true when the message can be answered or dropped without a
// map change; returns false to forward it to prepare_pgtemp().
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be an up OSD whose address matches the current map
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // a forced request always goes through the proposal path
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    // an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         osdmap.pg_temp->get(p->first) != p->second ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing would change; just tell the OSD about the current epoch
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon->no_reply(op);
  return true;
}
4076
4077 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4078 {
4079 epoch_t old_up_thru = osdmap.get_up_thru(from);
4080 auto ut = pending_inc.new_up_thru.find(from);
4081 if (ut != pending_inc.new_up_thru.end()) {
4082 old_up_thru = ut->second;
4083 }
4084 if (up_thru > old_up_thru) {
4085 // set up_thru too, so the osd doesn't have to ask again
4086 pending_inc.new_up_thru[from] = up_thru;
4087 }
4088 }
4089
4090 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4091 {
4092 op->mark_osdmon_event(__func__);
4093 auto m = op->get_req<MOSDPGTemp>();
4094 int from = m->get_orig_source().num();
4095 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4096 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4097 uint64_t pool = p->first.pool();
4098 if (pending_inc.old_pools.count(pool)) {
4099 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4100 << ": pool pending removal" << dendl;
4101 continue;
4102 }
4103 if (!osdmap.have_pg_pool(pool)) {
4104 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4105 << ": pool has been removed" << dendl;
4106 continue;
4107 }
4108 pending_inc.new_pg_temp[p->first] =
4109 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4110
4111 // unconditionally clear pg_primary (until this message can encode
4112 // a change for that, too.. at which point we need to also fix
4113 // preprocess_pg_temp)
4114 if (osdmap.primary_temp->count(p->first) ||
4115 pending_inc.new_primary_temp.count(p->first))
4116 pending_inc.new_primary_temp[p->first] = -1;
4117 }
4118
4119 // set up_thru too, so the osd doesn't have to ask again
4120 update_up_thru(from, m->map_epoch);
4121
4122 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4123 return true;
4124 }
4125
4126
4127 // ---
4128
// Pre-check an MRemoveSnaps request.  If every listed snap is already
// recorded as removed, ack (octopus+ peers only) and consume the
// message; otherwise return false so prepare_remove_snaps() can queue
// the removals.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        cct,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
        session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // any snap that is newer than the pool's snap_seq or not yet removed
  // requires a map update; forward to the prepare path
  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
               << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      if (*p > pi->get_snap_seq() ||
          !_is_removed_snap(q->first, *p)) {
        return false;
      }
    }
  }

  // everything already removed; octopus+ senders expect an explicit ack
  // echoing the snap list
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon->send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4178
// Queue deletion of the requested snaps: for each snap not already
// removed (in the committed map, the pending pool copy, or the pending
// removed-snaps queue), update the pending pool state and add the snap
// to new_removed_snaps.  Octopus+ senders get an ack once the proposal
// commits.  Always returns true.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
               << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps that are already removed anywhere (committed,
      // pending pool copy, or pending removed-snaps queue)
      if (!_is_removed_snap(pool, s) &&
          (!pending_inc.new_pools.count(pool) ||
           !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
          (!pending_inc.new_removed_snaps.count(pool) ||
           !pending_inc.new_removed_snaps[pool].contains(s))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
        if (osdmap.require_osd_release < ceph_release_t::octopus) {
          // pre-octopus clusters track removed snaps on the pool itself
          newpi->removed_snaps.insert(s);
          dout(10) << " pool " << pool << " removed_snaps added " << s
                   << " (now " << newpi->removed_snaps << ")" << dendl;
        }
        newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
        // keep snap_seq monotonically ahead of any removed snap
        if (s > newpi->get_snap_seq()) {
          dout(10) << " pool " << pool << " snap_seq "
                   << newpi->get_snap_seq() << " -> " << s << dendl;
          newpi->set_snap_seq(s);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
        dout(10) << " added pool " << pool << " snap " << s
                 << " to removed_snaps queue" << dendl;
        pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // ack after the proposal commits (octopus+ peers only)
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4227
4228 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4229 {
4230 op->mark_osdmon_event(__func__);
4231 auto m = op->get_req<MMonGetPurgedSnaps>();
4232 dout(7) << __func__ << " " << *m << dendl;
4233
4234 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4235
4236 string k = make_purged_snap_epoch_key(m->start);
4237 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4238 it->upper_bound(k);
4239 unsigned long epoch = m->last;
4240 while (it->valid()) {
4241 if (it->key().find("purged_epoch_") != 0) {
4242 break;
4243 }
4244 string k = it->key();
4245 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4246 if (n != 1) {
4247 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4248 } else if (epoch > m->last) {
4249 break;
4250 } else {
4251 bufferlist bl = it->value();
4252 auto p = bl.cbegin();
4253 auto &v = r[epoch];
4254 try {
4255 ceph::decode(v, p);
4256 } catch (buffer::error& e) {
4257 derr << __func__ << " unable to parse value for key '" << it->key()
4258 << "': \n";
4259 bl.hexdump(*_dout);
4260 *_dout << dendl;
4261 }
4262 n += 4 + v.size() * 16;
4263 }
4264 if (n > 1048576) {
4265 // impose a semi-arbitrary limit to message size
4266 break;
4267 }
4268 it->next();
4269 }
4270
4271 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4272 reply->purged_snaps.swap(r);
4273 mon->send_reply(op, reply.detach());
4274
4275 return true;
4276 }
4277
4278 // osd beacon
4279 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4280 {
4281 op->mark_osdmon_event(__func__);
4282 // check caps
4283 auto session = op->get_session();
4284 mon->no_reply(op);
4285 if (!session) {
4286 dout(10) << __func__ << " no monitor session!" << dendl;
4287 return true;
4288 }
4289 if (!session->is_capable("osd", MON_CAP_X)) {
4290 derr << __func__ << " received from entity "
4291 << "with insufficient privileges " << session->caps << dendl;
4292 return true;
4293 }
4294 // Always forward the beacon to the leader, even if they are the same as
4295 // the old one. The leader will mark as down osds that haven't sent
4296 // beacon for a few minutes.
4297 return false;
4298 }
4299
// Leader-side handling of an OSD beacon: refresh in-memory liveness and
// epoch bookkeeping for the sender.  Returns true only when the osdmap
// itself must change (a newer last_purged_snaps_scrub stamp needs to be
// persisted), which triggers a proposal.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
           << " from " << src << dendl;
  int from = src.num();

  // accept only from an OSD that is up and whose recorded address
  // matches the sender
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // in-memory bookkeeping only; these do not dirty the pending map
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer last_purged_snaps_scrub stamp via the pending
  // incremental
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4340
4341 // ---------------
4342 // map helpers
4343
4344 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4345 {
4346 op->mark_osdmon_event(__func__);
4347 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4348 << " start " << start << dendl;
4349 if (start == 0)
4350 send_full(op);
4351 else
4352 send_incremental(op, start);
4353 }
4354
4355
4356 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4357 {
4358 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4359 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4360 r->oldest_map = get_first_committed();
4361 r->newest_map = osdmap.get_epoch();
4362 return r;
4363 }
4364
// Build an MOSDMap carrying incrementals for epochs [from..to], encoded
// for 'features'.  If the incremental for some epoch is not stored,
// embed the full map for that epoch instead; aborts if neither exists.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
           << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; the e > 0 guard protects against epoch_t underflow
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental stored; fall back to the full map for this epoch
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
        //else if (get_version("full", e, bl) > 0) {
        dout(20) << "build_incremental full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        ceph_abort(); // we should have all maps.
      }
    }
  }
  return m;
}
4398
4399 void OSDMonitor::send_full(MonOpRequestRef op)
4400 {
4401 op->mark_osdmon_event(__func__);
4402 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4403 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
4404 }
4405
4406 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4407 {
4408 op->mark_osdmon_event(__func__);
4409
4410 MonSession *s = op->get_session();
4411 ceph_assert(s);
4412
4413 if (s->proxy_con) {
4414 // oh, we can tell the other mon to do it
4415 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4416 << first << dendl;
4417 MRoute *r = new MRoute(s->proxy_tid, NULL);
4418 r->send_osdmap_first = first;
4419 s->proxy_con->send_message(r);
4420 op->mark_event("reply: send routed send_osdmap_first reply");
4421 } else {
4422 // do it ourselves
4423 send_incremental(first, s, false, op);
4424 }
4425 }
4426
// Send osdmaps [first..current] to 'session', encoded for the peer's
// features.  When 'req' is given this is a routed reply: exactly one
// message is sent and the loop stops.  Otherwise maps are streamed in
// batches unless 'onetime' is set.  session->osd_epoch tracks what the
// peer has been sent so far.
void OSDMonitor::send_incremental(epoch_t first,
                                  MonSession *session,
                                  bool onetime,
                                  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // don't resend epochs the peer is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
             << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  // if the requested range predates our committed history, start the
  // peer off with the oldest full map we still have
  if (first < get_first_committed()) {
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // routed reply: one message only
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  // stream the remainder in osd_map_message_max-sized batches
  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
                                     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps. it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // 'first' is deliberately not advanced in the req case; the break
    // below ends the loop after the single routed reply
    if (onetime || req)
      break;
  }
}
4489
4490 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4491 {
4492 return get_version(ver, mon->get_quorum_con_features(), bl);
4493 }
4494
// Re-encode an incremental map blob in place so it is encoded with (a
// subset of) the requested feature bits; any embedded full map or crush
// map is re-encoded as well.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
           << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4522
4523 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4524 {
4525 OSDMap m;
4526 auto q = bl.cbegin();
4527 m.decode(q);
4528 // always encode with subset of osdmap's canonical features
4529 uint64_t f = features & m.get_encoding_features();
4530 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4531 << dendl;
4532 bl.clear();
4533 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4534 }
4535
4536 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4537 {
4538 uint64_t significant_features = OSDMap::get_significant_features(features);
4539 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4540 return 0;
4541 }
4542 int ret = PaxosService::get_version(ver, bl);
4543 if (ret < 0) {
4544 return ret;
4545 }
4546 // NOTE: this check is imprecise; the OSDMap encoding features may
4547 // be a subset of the latest mon quorum features, but worst case we
4548 // reencode once and then cache the (identical) result under both
4549 // feature masks.
4550 if (significant_features !=
4551 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4552 reencode_incremental_map(bl, features);
4553 }
4554 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4555 return 0;
4556 }
4557
4558 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4559 {
4560 bufferlist inc_bl;
4561 int err = get_version(ver, inc_bl);
4562 ceph_assert(err == 0);
4563 ceph_assert(inc_bl.length());
4564
4565 auto p = inc_bl.cbegin();
4566 inc.decode(p);
4567 dout(10) << __func__ << " "
4568 << " epoch " << inc.epoch
4569 << " inc_crc " << inc.inc_crc
4570 << " full_crc " << inc.full_crc
4571 << " encode_features " << inc.encode_features << dendl;
4572 return 0;
4573 }
4574
// Rebuild the full osdmap for epoch 'ver' from the closest pinned full
// map at or below it plus the subsequent incrementals.  Used when the
// verbatim full map has been trimmed away per the osdmap manifest.
// Returns 0 with the encoded map in 'bl', or -ENOENT when no pinned map
// is available.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map between the pinned epoch and the target so
  // fewer incrementals need to be applied
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                              &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4674
4675 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4676 {
4677 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4678 }
4679
4680 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4681 bufferlist& bl)
4682 {
4683 uint64_t significant_features = OSDMap::get_significant_features(features);
4684 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4685 return 0;
4686 }
4687 int ret = PaxosService::get_version_full(ver, bl);
4688 if (ret == -ENOENT) {
4689 // build map?
4690 ret = get_full_from_pinned_map(ver, bl);
4691 }
4692 if (ret < 0) {
4693 return ret;
4694 }
4695 // NOTE: this check is imprecise; the OSDMap encoding features may
4696 // be a subset of the latest mon quorum features, but worst case we
4697 // reencode once and then cache the (identical) result under both
4698 // feature masks.
4699 if (significant_features !=
4700 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4701 reencode_full_map(bl, features);
4702 }
4703 full_osd_cache.add_bytes({ver, significant_features}, bl);
4704 return 0;
4705 }
4706
4707 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4708 {
4709 dout(10) << "blacklist " << av << " until " << until << dendl;
4710 for (auto a : av.v) {
4711 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4712 a.set_type(entity_addr_t::TYPE_ANY);
4713 } else {
4714 a.set_type(entity_addr_t::TYPE_LEGACY);
4715 }
4716 pending_inc.new_blacklist[a] = until;
4717 }
4718 return pending_inc.epoch;
4719 }
4720
4721 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4722 {
4723 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4724 a.set_type(entity_addr_t::TYPE_ANY);
4725 } else {
4726 a.set_type(entity_addr_t::TYPE_LEGACY);
4727 }
4728 dout(10) << "blacklist " << a << " until " << until << dendl;
4729 pending_inc.new_blacklist[a] = until;
4730 return pending_inc.epoch;
4731 }
4732
4733
4734 void OSDMonitor::check_osdmap_subs()
4735 {
4736 dout(10) << __func__ << dendl;
4737 if (!osdmap.get_epoch()) {
4738 return;
4739 }
4740 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4741 if (osdmap_subs == mon->session_map.subs.end()) {
4742 return;
4743 }
4744 auto p = osdmap_subs->second->begin();
4745 while (!p.end()) {
4746 auto sub = *p;
4747 ++p;
4748 check_osdmap_sub(sub);
4749 }
4750 }
4751
4752 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4753 {
4754 dout(10) << __func__ << " " << sub << " next " << sub->next
4755 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4756 if (sub->next <= osdmap.get_epoch()) {
4757 if (sub->next >= 1)
4758 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4759 else
4760 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4761 if (sub->onetime)
4762 mon->session_map.remove_sub(sub);
4763 else
4764 sub->next = osdmap.get_epoch() + 1;
4765 }
4766 }
4767
4768 void OSDMonitor::check_pg_creates_subs()
4769 {
4770 if (!osdmap.get_num_up_osds()) {
4771 return;
4772 }
4773 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4774 mon->with_session_map([this](const MonSessionMap& session_map) {
4775 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4776 if (pg_creates_subs == session_map.subs.end()) {
4777 return;
4778 }
4779 for (auto sub : *pg_creates_subs->second) {
4780 check_pg_creates_sub(sub);
4781 }
4782 });
4783 }
4784
4785 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4786 {
4787 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4788 ceph_assert(sub->type == "osd_pg_creates");
4789 // only send these if the OSD is up. we will check_subs() when they do
4790 // come up so they will get the creates then.
4791 if (sub->session->name.is_osd() &&
4792 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4793 sub->next = send_pg_creates(sub->session->name.num(),
4794 sub->session->con.get(),
4795 sub->next);
4796 }
4797 }
4798
// Record (pool, application) metadata on the pending incremental.  With
// an empty app_key only the application name is enabled; otherwise the
// key/value pair is stored under the app, overwriting an existing key
// only when 'force' is set.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
                                       const std::string &app_key,
                                       const std::string &app_value,
                                       bool force)
{
  // caller must have the proposal plugged and the service writeable
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the pending copy of the pool if there is one
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // enable the application with no metadata; map::insert is a no-op
    // if the app is already present
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() will not overwrite an existing application entry
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4832
4833 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4834 pool_opts_t::key_t opt,
4835 pool_opts_t::value_t val)
4836 {
4837 auto p = pending_inc.new_pools.try_emplace(
4838 pool_id, *osdmap.get_pg_pool(pool_id));
4839 p.first->second.opts.set(opt, val);
4840 }
4841
// Scan 'pools' for pools whose pg creations still need to be queued and
// add them to 'creating_pgs'.  Returns the number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    // skip pools whose crush rule cannot be resolved; no mapping means
    // nothing can be created
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
                                         pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // only (re)queue pools that changed since the last scan
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
               << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
               << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
             << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
                              created, modified);
    queued++;
  }
  return queued;
}
4881
// Rebuild creating_pgs_by_osd_epoch (osd -> epoch -> pgs to create) from the
// authoritative creating_pgs set and the current OSDMap mapping.  For each
// creating pg we decide which epoch to advertise in the create message: keep
// the previously advertised epoch if the pg is still targeted at the same
// acting primary, otherwise bump it to the current mapping epoch so the new
// target gets a fresh message.  Holds creating_pgs_lock for the rebuild.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    // drop pgs that no longer exist in the current map (e.g. pool deleted)
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default to the epoch recorded when the pg's creation was queued
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the previously advertised epoch
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    // acting_primary may still be -1 (no primary mapped yet); the pg is then
    // parked under osd -1 until a later mapping pass finds a target
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4929
// Send any queued pg-create messages for 'osd' over connection 'con',
// covering queued epochs >= 'next'.  All creates are batched into a single
// message: MOSDPGCreate2 normally, or the legacy MOSDPGCreate when the
// cluster still allows pre-nautilus OSDs.  Returns the epoch through which
// the osd's subscription is now current (last queued epoch + 1), or 'next'
// unchanged when nothing was sent.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  // only epochs at or beyond the subscriber's cursor ('next') are of interest
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// legacy message, allocated lazily on first use
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.create_epoch, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				     create->second.create_stamp));
	// history/past_intervals are only attached when a created epoch was
	// recorded for the pg (e.g. pgs split from existing ones)
	if (create->second.history.epoch_created) {
	  dout(20) << __func__ << " " << pg << " " << create->second.history
		   << " " << create->second.past_intervals << dendl;
	  m->pg_extra.emplace(pg, make_pair(create->second.history,
					    create->second.past_intervals));
	}
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  // ownership of m/oldm passes to the messenger on send
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5001
5002 // TICK
5003
5004
// Periodic housekeeping for the osd subsystem.  Every monitor refreshes the
// osdmap manifest and, when tcmalloc autotuning is enabled, rebalances the
// priority cache.  Everything else runs on the leader only: beacon-timeout
// marking, failure checks, prune triggering, auto-out of long-down osds,
// blacklist expiry, purged-snap pruning and pool status — ending with a
// paxos proposal if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
		 << " inc cache_bytes: " << inc_cache->get_cache_bytes()
		 << " inc comtd_bytes: " << inc_cache->get_committed_size()
		 << " inc used_bytes: " << inc_cache->_get_used_bytes()
		 << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
		 << dendl;
      dout(10) << "tick balancer "
		 << " full cache_bytes: " << full_cache->get_cache_bytes()
		 << " full comtd_bytes: " << full_cache->get_committed_size()
		 << " full used_bytes: " << full_cache->_get_used_bytes()
		 << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
		 << dendl;
    }
  }

  // only the leader mutates pending_inc / proposes below this point
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if their beacons have timed out
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;   // how long this osd has been down
      ++i;                 // advance before possibly erasing 'o' below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;   // not out yet; keep it in down_pending_out
      }

      // osd is no longer a candidate (back up, already out, or just marked
      // out above): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5169
5170 void OSDMonitor::_set_new_cache_sizes()
5171 {
5172 uint64_t cache_size = 0;
5173 int64_t inc_alloc = 0;
5174 int64_t full_alloc = 0;
5175 int64_t kv_alloc = 0;
5176
5177 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5178 cache_size = pcm->get_tuned_mem();
5179 inc_alloc = inc_cache->get_committed_size();
5180 full_alloc = full_cache->get_committed_size();
5181 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5182 }
5183
5184 inc_osd_cache.set_bytes(inc_alloc);
5185 full_osd_cache.set_bytes(full_alloc);
5186
5187 dout(1) << __func__ << " cache_size:" << cache_size
5188 << " inc_alloc: " << inc_alloc
5189 << " full_alloc: " << full_alloc
5190 << " kv_alloc: " << kv_alloc
5191 << dendl;
5192 }
5193
5194 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5195 std::map<int,utime_t> &last_osd_report)
5196 {
5197 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5198 if (now - mon->get_leader_since() < timeo) {
5199 // We haven't been the leader for long enough to consider OSD timeouts
5200 return false;
5201 }
5202
5203 int max_osd = osdmap.get_max_osd();
5204 bool new_down = false;
5205
5206 for (int i=0; i < max_osd; ++i) {
5207 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5208 if (!osdmap.exists(i)) {
5209 last_osd_report.erase(i); // if any
5210 continue;
5211 }
5212 if (!osdmap.is_up(i))
5213 continue;
5214 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
5215 if (t == last_osd_report.end()) {
5216 // it wasn't in the map; start the timer.
5217 last_osd_report[i] = now;
5218 } else if (can_mark_down(i)) {
5219 utime_t diff = now - t->second;
5220 if (diff > timeo) {
5221 mon->clog->info() << "osd." << i << " marked down after no beacon for "
5222 << diff << " seconds";
5223 derr << "no beacon from osd." << i << " since " << t->second
5224 << ", " << diff << " seconds ago. marking down" << dendl;
5225 pending_inc.new_state[i] = CEPH_OSD_UP;
5226 new_down = true;
5227 }
5228 }
5229 }
5230 return new_down;
5231 }
5232
5233 static void dump_cpu_list(Formatter *f, const char *name,
5234 const string& strlist)
5235 {
5236 cpu_set_t cpu_set;
5237 size_t cpu_set_size;
5238 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5239 return;
5240 }
5241 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5242 f->open_array_section(name);
5243 for (auto cpu : cpus) {
5244 f->dump_int("cpu", cpu);
5245 }
5246 f->close_section();
5247 }
5248
// Dump the monitor's view of the osd subsystem for introspection: the full
// osdmap, per-osd metadata, clean-epoch tracking, commit bounds, the crush
// map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // one entry per existing osd id; per-osd metadata errors are not reported
  // here (NULL error stream)
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  // per-osd view of the last epoch each osd is known to have
  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // only emitted when an osdmap manifest has been loaded
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5297
namespace {
  // Identifiers for the fields understood by "osd pool get"/"osd pool set";
  // used below to validate and dispatch requested pool variables.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the members of 'first' that do not appear in 'second'
  // (set difference: first \ second).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> difference;
    for (const auto& choice : first) {
      if (second.count(choice) == 0) {
	difference.insert(choice);
      }
    }
    return difference;
  }
}
5331
5332
5333 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5334 {
5335 op->mark_osdmon_event(__func__);
5336 auto m = op->get_req<MMonCommand>();
5337 int r = 0;
5338 bufferlist rdata;
5339 stringstream ss, ds;
5340
5341 cmdmap_t cmdmap;
5342 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5343 string rs = ss.str();
5344 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5345 return true;
5346 }
5347
5348 MonSession *session = op->get_session();
5349 if (!session) {
5350 derr << __func__ << " no session" << dendl;
5351 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5352 return true;
5353 }
5354
5355 string prefix;
5356 cmd_getval(cmdmap, "prefix", prefix);
5357
5358 string format;
5359 cmd_getval(cmdmap, "format", format, string("plain"));
5360 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5361
5362 if (prefix == "osd stat") {
5363 if (f) {
5364 f->open_object_section("osdmap");
5365 osdmap.print_summary(f.get(), ds, "", true);
5366 f->close_section();
5367 f->flush(rdata);
5368 } else {
5369 osdmap.print_summary(nullptr, ds, "", true);
5370 rdata.append(ds);
5371 }
5372 }
5373 else if (prefix == "osd dump" ||
5374 prefix == "osd tree" ||
5375 prefix == "osd tree-from" ||
5376 prefix == "osd ls" ||
5377 prefix == "osd getmap" ||
5378 prefix == "osd getcrushmap" ||
5379 prefix == "osd ls-tree" ||
5380 prefix == "osd info") {
5381 string val;
5382
5383 epoch_t epoch = 0;
5384 int64_t epochnum;
5385 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5386 epoch = epochnum;
5387
5388 bufferlist osdmap_bl;
5389 int err = get_version_full(epoch, osdmap_bl);
5390 if (err == -ENOENT) {
5391 r = -ENOENT;
5392 ss << "there is no map for epoch " << epoch;
5393 goto reply;
5394 }
5395 ceph_assert(err == 0);
5396 ceph_assert(osdmap_bl.length());
5397
5398 OSDMap *p;
5399 if (epoch == osdmap.get_epoch()) {
5400 p = &osdmap;
5401 } else {
5402 p = new OSDMap;
5403 p->decode(osdmap_bl);
5404 }
5405
5406 auto sg = make_scope_guard([&] {
5407 if (p != &osdmap) {
5408 delete p;
5409 }
5410 });
5411
5412 if (prefix == "osd dump") {
5413 stringstream ds;
5414 if (f) {
5415 f->open_object_section("osdmap");
5416 p->dump(f.get());
5417 f->close_section();
5418 f->flush(ds);
5419 } else {
5420 p->print(ds);
5421 }
5422 rdata.append(ds);
5423 if (!f)
5424 ds << " ";
5425 } else if (prefix == "osd ls") {
5426 if (f) {
5427 f->open_array_section("osds");
5428 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5429 if (osdmap.exists(i)) {
5430 f->dump_int("osd", i);
5431 }
5432 }
5433 f->close_section();
5434 f->flush(ds);
5435 } else {
5436 bool first = true;
5437 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5438 if (osdmap.exists(i)) {
5439 if (!first)
5440 ds << "\n";
5441 first = false;
5442 ds << i;
5443 }
5444 }
5445 }
5446 rdata.append(ds);
5447 } else if (prefix == "osd info") {
5448 int64_t osd_id;
5449 bool do_single_osd = true;
5450 if (!cmd_getval(cmdmap, "id", osd_id)) {
5451 do_single_osd = false;
5452 }
5453
5454 if (do_single_osd && !osdmap.exists(osd_id)) {
5455 ss << "osd." << osd_id << " does not exist";
5456 r = -EINVAL;
5457 goto reply;
5458 }
5459
5460 if (f) {
5461 if (do_single_osd) {
5462 osdmap.dump_osd(osd_id, f.get());
5463 } else {
5464 osdmap.dump_osds(f.get());
5465 }
5466 f->flush(ds);
5467 } else {
5468 if (do_single_osd) {
5469 osdmap.print_osd(osd_id, ds);
5470 } else {
5471 osdmap.print_osds(ds);
5472 }
5473 }
5474 rdata.append(ds);
5475 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5476 string bucket;
5477 if (prefix == "osd tree-from") {
5478 cmd_getval(cmdmap, "bucket", bucket);
5479 if (!osdmap.crush->name_exists(bucket)) {
5480 ss << "bucket '" << bucket << "' does not exist";
5481 r = -ENOENT;
5482 goto reply;
5483 }
5484 int id = osdmap.crush->get_item_id(bucket);
5485 if (id >= 0) {
5486 ss << "\"" << bucket << "\" is not a bucket";
5487 r = -EINVAL;
5488 goto reply;
5489 }
5490 }
5491
5492 vector<string> states;
5493 cmd_getval(cmdmap, "states", states);
5494 unsigned filter = 0;
5495 for (auto& s : states) {
5496 if (s == "up") {
5497 filter |= OSDMap::DUMP_UP;
5498 } else if (s == "down") {
5499 filter |= OSDMap::DUMP_DOWN;
5500 } else if (s == "in") {
5501 filter |= OSDMap::DUMP_IN;
5502 } else if (s == "out") {
5503 filter |= OSDMap::DUMP_OUT;
5504 } else if (s == "destroyed") {
5505 filter |= OSDMap::DUMP_DESTROYED;
5506 } else {
5507 ss << "unrecognized state '" << s << "'";
5508 r = -EINVAL;
5509 goto reply;
5510 }
5511 }
5512 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5513 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5514 ss << "cannot specify both 'in' and 'out'";
5515 r = -EINVAL;
5516 goto reply;
5517 }
5518 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5519 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5520 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5521 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5522 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5523 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5524 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5525 r = -EINVAL;
5526 goto reply;
5527 }
5528 if (f) {
5529 f->open_object_section("tree");
5530 p->print_tree(f.get(), NULL, filter, bucket);
5531 f->close_section();
5532 f->flush(ds);
5533 } else {
5534 p->print_tree(NULL, &ds, filter, bucket);
5535 }
5536 rdata.append(ds);
5537 } else if (prefix == "osd getmap") {
5538 rdata.append(osdmap_bl);
5539 ss << "got osdmap epoch " << p->get_epoch();
5540 } else if (prefix == "osd getcrushmap") {
5541 p->crush->encode(rdata, mon->get_quorum_con_features());
5542 ss << p->get_crush_version();
5543 } else if (prefix == "osd ls-tree") {
5544 string bucket_name;
5545 cmd_getval(cmdmap, "name", bucket_name);
5546 set<int> osds;
5547 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5548 if (r == -ENOENT) {
5549 ss << "\"" << bucket_name << "\" does not exist";
5550 goto reply;
5551 } else if (r < 0) {
5552 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5553 goto reply;
5554 }
5555
5556 if (f) {
5557 f->open_array_section("osds");
5558 for (auto &i : osds) {
5559 if (osdmap.exists(i)) {
5560 f->dump_int("osd", i);
5561 }
5562 }
5563 f->close_section();
5564 f->flush(ds);
5565 } else {
5566 bool first = true;
5567 for (auto &i : osds) {
5568 if (osdmap.exists(i)) {
5569 if (!first)
5570 ds << "\n";
5571 first = false;
5572 ds << i;
5573 }
5574 }
5575 }
5576
5577 rdata.append(ds);
5578 }
5579 } else if (prefix == "osd getmaxosd") {
5580 if (f) {
5581 f->open_object_section("getmaxosd");
5582 f->dump_unsigned("epoch", osdmap.get_epoch());
5583 f->dump_int("max_osd", osdmap.get_max_osd());
5584 f->close_section();
5585 f->flush(rdata);
5586 } else {
5587 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5588 rdata.append(ds);
5589 }
5590 } else if (prefix == "osd utilization") {
5591 string out;
5592 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5593 if (f)
5594 f->flush(rdata);
5595 else
5596 rdata.append(out);
5597 r = 0;
5598 goto reply;
5599 } else if (prefix == "osd find") {
5600 int64_t osd;
5601 if (!cmd_getval(cmdmap, "id", osd)) {
5602 ss << "unable to parse osd id value '"
5603 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5604 r = -EINVAL;
5605 goto reply;
5606 }
5607 if (!osdmap.exists(osd)) {
5608 ss << "osd." << osd << " does not exist";
5609 r = -ENOENT;
5610 goto reply;
5611 }
5612 string format;
5613 cmd_getval(cmdmap, "format", format);
5614 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5615 f->open_object_section("osd_location");
5616 f->dump_int("osd", osd);
5617 f->dump_object("addrs", osdmap.get_addrs(osd));
5618 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5619
5620 // try to identify host, pod/container name, etc.
5621 map<string,string> m;
5622 load_metadata(osd, m, nullptr);
5623 if (auto p = m.find("hostname"); p != m.end()) {
5624 f->dump_string("host", p->second);
5625 }
5626 for (auto& k : {
5627 "pod_name", "pod_namespace", // set by rook
5628 "container_name" // set by cephadm, ceph-ansible
5629 }) {
5630 if (auto p = m.find(k); p != m.end()) {
5631 f->dump_string(k, p->second);
5632 }
5633 }
5634
5635 // crush is helpful too
5636 f->open_object_section("crush_location");
5637 map<string,string> loc = osdmap.crush->get_full_location(osd);
5638 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5639 f->dump_string(p->first.c_str(), p->second);
5640 f->close_section();
5641 f->close_section();
5642 f->flush(rdata);
5643 } else if (prefix == "osd metadata") {
5644 int64_t osd = -1;
5645 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5646 !cmd_getval(cmdmap, "id", osd)) {
5647 ss << "unable to parse osd id value '"
5648 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5649 r = -EINVAL;
5650 goto reply;
5651 }
5652 if (osd >= 0 && !osdmap.exists(osd)) {
5653 ss << "osd." << osd << " does not exist";
5654 r = -ENOENT;
5655 goto reply;
5656 }
5657 string format;
5658 cmd_getval(cmdmap, "format", format);
5659 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5660 if (osd >= 0) {
5661 f->open_object_section("osd_metadata");
5662 f->dump_unsigned("id", osd);
5663 r = dump_osd_metadata(osd, f.get(), &ss);
5664 if (r < 0)
5665 goto reply;
5666 f->close_section();
5667 } else {
5668 r = 0;
5669 f->open_array_section("osd_metadata");
5670 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5671 if (osdmap.exists(i)) {
5672 f->open_object_section("osd");
5673 f->dump_unsigned("id", i);
5674 r = dump_osd_metadata(i, f.get(), NULL);
5675 if (r == -EINVAL || r == -ENOENT) {
5676 // Drop error, continue to get other daemons' metadata
5677 dout(4) << "No metadata for osd." << i << dendl;
5678 r = 0;
5679 } else if (r < 0) {
5680 // Unexpected error
5681 goto reply;
5682 }
5683 f->close_section();
5684 }
5685 }
5686 f->close_section();
5687 }
5688 f->flush(rdata);
5689 } else if (prefix == "osd versions") {
5690 if (!f)
5691 f.reset(Formatter::create("json-pretty"));
5692 count_metadata("ceph_version", f.get());
5693 f->flush(rdata);
5694 r = 0;
5695 } else if (prefix == "osd count-metadata") {
5696 if (!f)
5697 f.reset(Formatter::create("json-pretty"));
5698 string field;
5699 cmd_getval(cmdmap, "property", field);
5700 count_metadata(field, f.get());
5701 f->flush(rdata);
5702 r = 0;
5703 } else if (prefix == "osd numa-status") {
5704 TextTable tbl;
5705 if (f) {
5706 f->open_array_section("osds");
5707 } else {
5708 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5709 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5710 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5711 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5712 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5713 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5714 }
5715 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5716 if (osdmap.exists(i)) {
5717 map<string,string> m;
5718 ostringstream err;
5719 if (load_metadata(i, m, &err) < 0) {
5720 continue;
5721 }
5722 string host;
5723 auto p = m.find("hostname");
5724 if (p != m.end()) {
5725 host = p->second;
5726 }
5727 if (f) {
5728 f->open_object_section("osd");
5729 f->dump_int("osd", i);
5730 f->dump_string("host", host);
5731 for (auto n : { "network_numa_node", "objectstore_numa_node",
5732 "numa_node" }) {
5733 p = m.find(n);
5734 if (p != m.end()) {
5735 f->dump_int(n, atoi(p->second.c_str()));
5736 }
5737 }
5738 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5739 p = m.find(n);
5740 if (p != m.end()) {
5741 list<string> ls = get_str_list(p->second, ",");
5742 f->open_array_section(n);
5743 for (auto node : ls) {
5744 f->dump_int("node", atoi(node.c_str()));
5745 }
5746 f->close_section();
5747 }
5748 }
5749 for (auto n : { "numa_node_cpus" }) {
5750 p = m.find(n);
5751 if (p != m.end()) {
5752 dump_cpu_list(f.get(), n, p->second);
5753 }
5754 }
5755 f->close_section();
5756 } else {
5757 tbl << i;
5758 tbl << host;
5759 p = m.find("network_numa_nodes");
5760 if (p != m.end()) {
5761 tbl << p->second;
5762 } else {
5763 tbl << "-";
5764 }
5765 p = m.find("objectstore_numa_nodes");
5766 if (p != m.end()) {
5767 tbl << p->second;
5768 } else {
5769 tbl << "-";
5770 }
5771 p = m.find("numa_node");
5772 auto q = m.find("numa_node_cpus");
5773 if (p != m.end() && q != m.end()) {
5774 tbl << p->second;
5775 tbl << q->second;
5776 } else {
5777 tbl << "-";
5778 tbl << "-";
5779 }
5780 tbl << TextTable::endrow;
5781 }
5782 }
5783 }
5784 if (f) {
5785 f->close_section();
5786 f->flush(rdata);
5787 } else {
5788 rdata.append(stringify(tbl));
5789 }
5790 } else if (prefix == "osd map") {
5791 string poolstr, objstr, namespacestr;
5792 cmd_getval(cmdmap, "pool", poolstr);
5793 cmd_getval(cmdmap, "object", objstr);
5794 cmd_getval(cmdmap, "nspace", namespacestr);
5795
5796 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5797 if (pool < 0) {
5798 ss << "pool " << poolstr << " does not exist";
5799 r = -ENOENT;
5800 goto reply;
5801 }
5802 object_locator_t oloc(pool, namespacestr);
5803 object_t oid(objstr);
5804 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5805 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5806 vector<int> up, acting;
5807 int up_p, acting_p;
5808 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5809
5810 string fullobjname;
5811 if (!namespacestr.empty())
5812 fullobjname = namespacestr + string("/") + oid.name;
5813 else
5814 fullobjname = oid.name;
5815 if (f) {
5816 f->open_object_section("osd_map");
5817 f->dump_unsigned("epoch", osdmap.get_epoch());
5818 f->dump_string("pool", poolstr);
5819 f->dump_int("pool_id", pool);
5820 f->dump_stream("objname") << fullobjname;
5821 f->dump_stream("raw_pgid") << pgid;
5822 f->dump_stream("pgid") << mpgid;
5823 f->open_array_section("up");
5824 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5825 f->dump_int("osd", *p);
5826 f->close_section();
5827 f->dump_int("up_primary", up_p);
5828 f->open_array_section("acting");
5829 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5830 f->dump_int("osd", *p);
5831 f->close_section();
5832 f->dump_int("acting_primary", acting_p);
5833 f->close_section(); // osd_map
5834 f->flush(rdata);
5835 } else {
5836 ds << "osdmap e" << osdmap.get_epoch()
5837 << " pool '" << poolstr << "' (" << pool << ")"
5838 << " object '" << fullobjname << "' ->"
5839 << " pg " << pgid << " (" << mpgid << ")"
5840 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5841 << pg_vector_string(acting) << ", p" << acting_p << ")";
5842 rdata.append(ds);
5843 }
5844
5845 } else if (prefix == "pg map") {
5846 pg_t pgid;
5847 string pgidstr;
5848 cmd_getval(cmdmap, "pgid", pgidstr);
5849 if (!pgid.parse(pgidstr.c_str())) {
5850 ss << "invalid pgid '" << pgidstr << "'";
5851 r = -EINVAL;
5852 goto reply;
5853 }
5854 vector<int> up, acting;
5855 if (!osdmap.have_pg_pool(pgid.pool())) {
5856 ss << "pg '" << pgidstr << "' does not exist";
5857 r = -ENOENT;
5858 goto reply;
5859 }
5860 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5861 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5862 if (f) {
5863 f->open_object_section("pg_map");
5864 f->dump_unsigned("epoch", osdmap.get_epoch());
5865 f->dump_stream("raw_pgid") << pgid;
5866 f->dump_stream("pgid") << mpgid;
5867 f->open_array_section("up");
5868 for (auto osd : up) {
5869 f->dump_int("up_osd", osd);
5870 }
5871 f->close_section();
5872 f->open_array_section("acting");
5873 for (auto osd : acting) {
5874 f->dump_int("acting_osd", osd);
5875 }
5876 f->close_section();
5877 f->close_section();
5878 f->flush(rdata);
5879 } else {
5880 ds << "osdmap e" << osdmap.get_epoch()
5881 << " pg " << pgid << " (" << mpgid << ")"
5882 << " -> up " << up << " acting " << acting;
5883 rdata.append(ds);
5884 }
5885 goto reply;
5886
5887 } else if (prefix == "osd lspools") {
5888 if (f)
5889 f->open_array_section("pools");
5890 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5891 p != osdmap.pools.end();
5892 ++p) {
5893 if (f) {
5894 f->open_object_section("pool");
5895 f->dump_int("poolnum", p->first);
5896 f->dump_string("poolname", osdmap.pool_name[p->first]);
5897 f->close_section();
5898 } else {
5899 ds << p->first << ' ' << osdmap.pool_name[p->first];
5900 if (next(p) != osdmap.pools.end()) {
5901 ds << '\n';
5902 }
5903 }
5904 }
5905 if (f) {
5906 f->close_section();
5907 f->flush(ds);
5908 }
5909 rdata.append(ds);
5910 } else if (prefix == "osd blacklist ls") {
5911 if (f)
5912 f->open_array_section("blacklist");
5913
5914 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5915 p != osdmap.blacklist.end();
5916 ++p) {
5917 if (f) {
5918 f->open_object_section("entry");
5919 f->dump_string("addr", p->first.get_legacy_str());
5920 f->dump_stream("until") << p->second;
5921 f->close_section();
5922 } else {
5923 stringstream ss;
5924 string s;
5925 ss << p->first << " " << p->second;
5926 getline(ss, s);
5927 s += "\n";
5928 rdata.append(s);
5929 }
5930 }
5931 if (f) {
5932 f->close_section();
5933 f->flush(rdata);
5934 }
5935 ss << "listed " << osdmap.blacklist.size() << " entries";
5936
5937 } else if (prefix == "osd pool ls") {
5938 string detail;
5939 cmd_getval(cmdmap, "detail", detail);
5940 if (!f && detail == "detail") {
5941 ostringstream ss;
5942 osdmap.print_pools(ss);
5943 rdata.append(ss.str());
5944 } else {
5945 if (f)
5946 f->open_array_section("pools");
5947 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5948 it != osdmap.get_pools().end();
5949 ++it) {
5950 if (f) {
5951 if (detail == "detail") {
5952 f->open_object_section("pool");
5953 f->dump_int("pool_id", it->first);
5954 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5955 it->second.dump(f.get());
5956 f->close_section();
5957 } else {
5958 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5959 }
5960 } else {
5961 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5962 }
5963 }
5964 if (f) {
5965 f->close_section();
5966 f->flush(rdata);
5967 }
5968 }
5969
5970 } else if (prefix == "osd crush get-tunable") {
5971 string tunable;
5972 cmd_getval(cmdmap, "tunable", tunable);
5973 ostringstream rss;
5974 if (f)
5975 f->open_object_section("tunable");
5976 if (tunable == "straw_calc_version") {
5977 if (f)
5978 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5979 else
5980 rss << osdmap.crush->get_straw_calc_version() << "\n";
5981 } else {
5982 r = -EINVAL;
5983 goto reply;
5984 }
5985 if (f) {
5986 f->close_section();
5987 f->flush(rdata);
5988 } else {
5989 rdata.append(rss.str());
5990 }
5991 r = 0;
5992
5993 } else if (prefix == "osd pool get") {
5994 string poolstr;
5995 cmd_getval(cmdmap, "pool", poolstr);
5996 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5997 if (pool < 0) {
5998 ss << "unrecognized pool '" << poolstr << "'";
5999 r = -ENOENT;
6000 goto reply;
6001 }
6002
6003 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6004 string var;
6005 cmd_getval(cmdmap, "var", var);
6006
6007 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6008 const choices_map_t ALL_CHOICES = {
6009 {"size", SIZE},
6010 {"min_size", MIN_SIZE},
6011 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6012 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
6013 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6014 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6015 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6016 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6017 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6018 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6019 {"use_gmt_hitset", USE_GMT_HITSET},
6020 {"target_max_objects", TARGET_MAX_OBJECTS},
6021 {"target_max_bytes", TARGET_MAX_BYTES},
6022 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6023 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6024 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6025 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6026 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6027 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6028 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6029 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6030 {"fast_read", FAST_READ},
6031 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6032 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6033 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6034 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6035 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6036 {"recovery_priority", RECOVERY_PRIORITY},
6037 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6038 {"scrub_priority", SCRUB_PRIORITY},
6039 {"compression_mode", COMPRESSION_MODE},
6040 {"compression_algorithm", COMPRESSION_ALGORITHM},
6041 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6042 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6043 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6044 {"csum_type", CSUM_TYPE},
6045 {"csum_max_block", CSUM_MAX_BLOCK},
6046 {"csum_min_block", CSUM_MIN_BLOCK},
6047 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6048 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6049 {"pg_num_min", PG_NUM_MIN},
6050 {"target_size_bytes", TARGET_SIZE_BYTES},
6051 {"target_size_ratio", TARGET_SIZE_RATIO},
6052 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6053 };
6054
6055 typedef std::set<osd_pool_get_choices> choices_set_t;
6056
6057 const choices_set_t ONLY_TIER_CHOICES = {
6058 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6059 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6060 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6061 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6062 MIN_READ_RECENCY_FOR_PROMOTE,
6063 MIN_WRITE_RECENCY_FOR_PROMOTE,
6064 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6065 };
6066 const choices_set_t ONLY_ERASURE_CHOICES = {
6067 EC_OVERWRITES, ERASURE_CODE_PROFILE
6068 };
6069
6070 choices_set_t selected_choices;
6071 if (var == "all") {
6072 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6073 it != ALL_CHOICES.end(); ++it) {
6074 selected_choices.insert(it->second);
6075 }
6076
6077 if(!p->is_tier()) {
6078 selected_choices = subtract_second_from_first(selected_choices,
6079 ONLY_TIER_CHOICES);
6080 }
6081
6082 if(!p->is_erasure()) {
6083 selected_choices = subtract_second_from_first(selected_choices,
6084 ONLY_ERASURE_CHOICES);
6085 }
6086 } else /* var != "all" */ {
6087 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6088 osd_pool_get_choices selected = found->second;
6089
6090 if (!p->is_tier() &&
6091 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6092 ss << "pool '" << poolstr
6093 << "' is not a tier pool: variable not applicable";
6094 r = -EACCES;
6095 goto reply;
6096 }
6097
6098 if (!p->is_erasure() &&
6099 ONLY_ERASURE_CHOICES.find(selected)
6100 != ONLY_ERASURE_CHOICES.end()) {
6101 ss << "pool '" << poolstr
6102 << "' is not a erasure pool: variable not applicable";
6103 r = -EACCES;
6104 goto reply;
6105 }
6106
6107 if (pool_opts_t::is_opt_name(var) &&
6108 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6109 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6110 r = -ENOENT;
6111 goto reply;
6112 }
6113
6114 selected_choices.insert(selected);
6115 }
6116
6117 if (f) {
6118 f->open_object_section("pool");
6119 f->dump_string("pool", poolstr);
6120 f->dump_int("pool_id", pool);
6121 for(choices_set_t::const_iterator it = selected_choices.begin();
6122 it != selected_choices.end(); ++it) {
6123 choices_map_t::const_iterator i;
6124 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6125 if (i->second == *it) {
6126 break;
6127 }
6128 }
6129 ceph_assert(i != ALL_CHOICES.end());
6130 switch(*it) {
6131 case PG_NUM:
6132 f->dump_int("pg_num", p->get_pg_num());
6133 break;
6134 case PGP_NUM:
6135 f->dump_int("pgp_num", p->get_pgp_num());
6136 break;
6137 case SIZE:
6138 f->dump_int("size", p->get_size());
6139 break;
6140 case MIN_SIZE:
6141 f->dump_int("min_size", p->get_min_size());
6142 break;
6143 case CRUSH_RULE:
6144 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6145 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6146 p->get_crush_rule()));
6147 } else {
6148 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6149 }
6150 break;
6151 case EC_OVERWRITES:
6152 f->dump_bool("allow_ec_overwrites",
6153 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6154 break;
6155 case PG_AUTOSCALE_MODE:
6156 f->dump_string("pg_autoscale_mode",
6157 pg_pool_t::get_pg_autoscale_mode_name(
6158 p->pg_autoscale_mode));
6159 break;
6160 case HASHPSPOOL:
6161 case NODELETE:
6162 case NOPGCHANGE:
6163 case NOSIZECHANGE:
6164 case WRITE_FADVISE_DONTNEED:
6165 case NOSCRUB:
6166 case NODEEP_SCRUB:
6167 f->dump_bool(i->first.c_str(),
6168 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6169 break;
6170 case HIT_SET_PERIOD:
6171 f->dump_int("hit_set_period", p->hit_set_period);
6172 break;
6173 case HIT_SET_COUNT:
6174 f->dump_int("hit_set_count", p->hit_set_count);
6175 break;
6176 case HIT_SET_TYPE:
6177 f->dump_string("hit_set_type",
6178 HitSet::get_type_name(p->hit_set_params.get_type()));
6179 break;
6180 case HIT_SET_FPP:
6181 {
6182 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6183 BloomHitSet::Params *bloomp =
6184 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6185 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6186 } else if(var != "all") {
6187 f->close_section();
6188 ss << "hit set is not of type Bloom; " <<
6189 "invalid to get a false positive rate!";
6190 r = -EINVAL;
6191 goto reply;
6192 }
6193 }
6194 break;
6195 case USE_GMT_HITSET:
6196 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6197 break;
6198 case TARGET_MAX_OBJECTS:
6199 f->dump_unsigned("target_max_objects", p->target_max_objects);
6200 break;
6201 case TARGET_MAX_BYTES:
6202 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6203 break;
6204 case CACHE_TARGET_DIRTY_RATIO:
6205 f->dump_unsigned("cache_target_dirty_ratio_micro",
6206 p->cache_target_dirty_ratio_micro);
6207 f->dump_float("cache_target_dirty_ratio",
6208 ((float)p->cache_target_dirty_ratio_micro/1000000));
6209 break;
6210 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6211 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6212 p->cache_target_dirty_high_ratio_micro);
6213 f->dump_float("cache_target_dirty_high_ratio",
6214 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6215 break;
6216 case CACHE_TARGET_FULL_RATIO:
6217 f->dump_unsigned("cache_target_full_ratio_micro",
6218 p->cache_target_full_ratio_micro);
6219 f->dump_float("cache_target_full_ratio",
6220 ((float)p->cache_target_full_ratio_micro/1000000));
6221 break;
6222 case CACHE_MIN_FLUSH_AGE:
6223 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6224 break;
6225 case CACHE_MIN_EVICT_AGE:
6226 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6227 break;
6228 case ERASURE_CODE_PROFILE:
6229 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6230 break;
6231 case MIN_READ_RECENCY_FOR_PROMOTE:
6232 f->dump_int("min_read_recency_for_promote",
6233 p->min_read_recency_for_promote);
6234 break;
6235 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6236 f->dump_int("min_write_recency_for_promote",
6237 p->min_write_recency_for_promote);
6238 break;
6239 case FAST_READ:
6240 f->dump_int("fast_read", p->fast_read);
6241 break;
6242 case HIT_SET_GRADE_DECAY_RATE:
6243 f->dump_int("hit_set_grade_decay_rate",
6244 p->hit_set_grade_decay_rate);
6245 break;
6246 case HIT_SET_SEARCH_LAST_N:
6247 f->dump_int("hit_set_search_last_n",
6248 p->hit_set_search_last_n);
6249 break;
6250 case SCRUB_MIN_INTERVAL:
6251 case SCRUB_MAX_INTERVAL:
6252 case DEEP_SCRUB_INTERVAL:
6253 case RECOVERY_PRIORITY:
6254 case RECOVERY_OP_PRIORITY:
6255 case SCRUB_PRIORITY:
6256 case COMPRESSION_MODE:
6257 case COMPRESSION_ALGORITHM:
6258 case COMPRESSION_REQUIRED_RATIO:
6259 case COMPRESSION_MAX_BLOB_SIZE:
6260 case COMPRESSION_MIN_BLOB_SIZE:
6261 case CSUM_TYPE:
6262 case CSUM_MAX_BLOCK:
6263 case CSUM_MIN_BLOCK:
6264 case FINGERPRINT_ALGORITHM:
6265 case PG_NUM_MIN:
6266 case TARGET_SIZE_BYTES:
6267 case TARGET_SIZE_RATIO:
6268 case PG_AUTOSCALE_BIAS:
6269 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6270 if (p->opts.is_set(key)) {
6271 if(*it == CSUM_TYPE) {
6272 int64_t val;
6273 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6274 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6275 } else {
6276 p->opts.dump(i->first, f.get());
6277 }
6278 }
6279 break;
6280 }
6281 }
6282 f->close_section();
6283 f->flush(rdata);
6284 } else /* !f */ {
6285 for(choices_set_t::const_iterator it = selected_choices.begin();
6286 it != selected_choices.end(); ++it) {
6287 choices_map_t::const_iterator i;
6288 switch(*it) {
6289 case PG_NUM:
6290 ss << "pg_num: " << p->get_pg_num() << "\n";
6291 break;
6292 case PGP_NUM:
6293 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6294 break;
6295 case SIZE:
6296 ss << "size: " << p->get_size() << "\n";
6297 break;
6298 case MIN_SIZE:
6299 ss << "min_size: " << p->get_min_size() << "\n";
6300 break;
6301 case CRUSH_RULE:
6302 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6303 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6304 p->get_crush_rule()) << "\n";
6305 } else {
6306 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6307 }
6308 break;
6309 case PG_AUTOSCALE_MODE:
6310 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6311 p->pg_autoscale_mode) <<"\n";
6312 break;
6313 case HIT_SET_PERIOD:
6314 ss << "hit_set_period: " << p->hit_set_period << "\n";
6315 break;
6316 case HIT_SET_COUNT:
6317 ss << "hit_set_count: " << p->hit_set_count << "\n";
6318 break;
6319 case HIT_SET_TYPE:
6320 ss << "hit_set_type: " <<
6321 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6322 break;
6323 case HIT_SET_FPP:
6324 {
6325 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6326 BloomHitSet::Params *bloomp =
6327 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6328 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6329 } else if(var != "all") {
6330 ss << "hit set is not of type Bloom; " <<
6331 "invalid to get a false positive rate!";
6332 r = -EINVAL;
6333 goto reply;
6334 }
6335 }
6336 break;
6337 case USE_GMT_HITSET:
6338 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6339 break;
6340 case TARGET_MAX_OBJECTS:
6341 ss << "target_max_objects: " << p->target_max_objects << "\n";
6342 break;
6343 case TARGET_MAX_BYTES:
6344 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6345 break;
6346 case CACHE_TARGET_DIRTY_RATIO:
6347 ss << "cache_target_dirty_ratio: "
6348 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6349 break;
6350 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6351 ss << "cache_target_dirty_high_ratio: "
6352 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6353 break;
6354 case CACHE_TARGET_FULL_RATIO:
6355 ss << "cache_target_full_ratio: "
6356 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6357 break;
6358 case CACHE_MIN_FLUSH_AGE:
6359 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6360 break;
6361 case CACHE_MIN_EVICT_AGE:
6362 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6363 break;
6364 case ERASURE_CODE_PROFILE:
6365 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6366 break;
6367 case MIN_READ_RECENCY_FOR_PROMOTE:
6368 ss << "min_read_recency_for_promote: " <<
6369 p->min_read_recency_for_promote << "\n";
6370 break;
6371 case HIT_SET_GRADE_DECAY_RATE:
6372 ss << "hit_set_grade_decay_rate: " <<
6373 p->hit_set_grade_decay_rate << "\n";
6374 break;
6375 case HIT_SET_SEARCH_LAST_N:
6376 ss << "hit_set_search_last_n: " <<
6377 p->hit_set_search_last_n << "\n";
6378 break;
6379 case EC_OVERWRITES:
6380 ss << "allow_ec_overwrites: " <<
6381 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6382 "\n";
6383 break;
6384 case HASHPSPOOL:
6385 case NODELETE:
6386 case NOPGCHANGE:
6387 case NOSIZECHANGE:
6388 case WRITE_FADVISE_DONTNEED:
6389 case NOSCRUB:
6390 case NODEEP_SCRUB:
6391 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6392 if (i->second == *it)
6393 break;
6394 }
6395 ceph_assert(i != ALL_CHOICES.end());
6396 ss << i->first << ": " <<
6397 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6398 "true" : "false") << "\n";
6399 break;
6400 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6401 ss << "min_write_recency_for_promote: " <<
6402 p->min_write_recency_for_promote << "\n";
6403 break;
6404 case FAST_READ:
6405 ss << "fast_read: " << p->fast_read << "\n";
6406 break;
6407 case SCRUB_MIN_INTERVAL:
6408 case SCRUB_MAX_INTERVAL:
6409 case DEEP_SCRUB_INTERVAL:
6410 case RECOVERY_PRIORITY:
6411 case RECOVERY_OP_PRIORITY:
6412 case SCRUB_PRIORITY:
6413 case COMPRESSION_MODE:
6414 case COMPRESSION_ALGORITHM:
6415 case COMPRESSION_REQUIRED_RATIO:
6416 case COMPRESSION_MAX_BLOB_SIZE:
6417 case COMPRESSION_MIN_BLOB_SIZE:
6418 case CSUM_TYPE:
6419 case CSUM_MAX_BLOCK:
6420 case CSUM_MIN_BLOCK:
6421 case FINGERPRINT_ALGORITHM:
6422 case PG_NUM_MIN:
6423 case TARGET_SIZE_BYTES:
6424 case TARGET_SIZE_RATIO:
6425 case PG_AUTOSCALE_BIAS:
6426 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6427 if (i->second == *it)
6428 break;
6429 }
6430 ceph_assert(i != ALL_CHOICES.end());
6431 {
6432 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6433 if (p->opts.is_set(key)) {
6434 if(key == pool_opts_t::CSUM_TYPE) {
6435 int64_t val;
6436 p->opts.get(key, &val);
6437 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6438 } else {
6439 ss << i->first << ": " << p->opts.get(key) << "\n";
6440 }
6441 }
6442 }
6443 break;
6444 }
6445 rdata.append(ss.str());
6446 ss.str("");
6447 }
6448 }
6449 r = 0;
6450 } else if (prefix == "osd pool get-quota") {
6451 string pool_name;
6452 cmd_getval(cmdmap, "pool", pool_name);
6453
6454 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6455 if (poolid < 0) {
6456 ceph_assert(poolid == -ENOENT);
6457 ss << "unrecognized pool '" << pool_name << "'";
6458 r = -ENOENT;
6459 goto reply;
6460 }
6461 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6462 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6463 const object_stat_sum_t& sum = pstat->stats.sum;
6464 if (f) {
6465 f->open_object_section("pool_quotas");
6466 f->dump_string("pool_name", pool_name);
6467 f->dump_unsigned("pool_id", poolid);
6468 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6469 f->dump_int("current_num_objects", sum.num_objects);
6470 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6471 f->dump_int("current_num_bytes", sum.num_bytes);
6472 f->close_section();
6473 f->flush(rdata);
6474 } else {
6475 stringstream rs;
6476 rs << "quotas for pool '" << pool_name << "':\n"
6477 << " max objects: ";
6478 if (p->quota_max_objects == 0)
6479 rs << "N/A";
6480 else {
6481 rs << si_u_t(p->quota_max_objects) << " objects";
6482 rs << " (current num objects: " << sum.num_objects << " objects)";
6483 }
6484 rs << "\n"
6485 << " max bytes : ";
6486 if (p->quota_max_bytes == 0)
6487 rs << "N/A";
6488 else {
6489 rs << byte_u_t(p->quota_max_bytes);
6490 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6491 }
6492 rdata.append(rs.str());
6493 }
6494 rdata.append("\n");
6495 r = 0;
6496 } else if (prefix == "osd crush rule list" ||
6497 prefix == "osd crush rule ls") {
6498 if (f) {
6499 f->open_array_section("rules");
6500 osdmap.crush->list_rules(f.get());
6501 f->close_section();
6502 f->flush(rdata);
6503 } else {
6504 ostringstream ss;
6505 osdmap.crush->list_rules(&ss);
6506 rdata.append(ss.str());
6507 }
6508 } else if (prefix == "osd crush rule ls-by-class") {
6509 string class_name;
6510 cmd_getval(cmdmap, "class", class_name);
6511 if (class_name.empty()) {
6512 ss << "no class specified";
6513 r = -EINVAL;
6514 goto reply;
6515 }
6516 set<int> rules;
6517 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6518 if (r < 0) {
6519 ss << "failed to get rules by class '" << class_name << "'";
6520 goto reply;
6521 }
6522 if (f) {
6523 f->open_array_section("rules");
6524 for (auto &rule: rules) {
6525 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6526 }
6527 f->close_section();
6528 f->flush(rdata);
6529 } else {
6530 ostringstream rs;
6531 for (auto &rule: rules) {
6532 rs << osdmap.crush->get_rule_name(rule) << "\n";
6533 }
6534 rdata.append(rs.str());
6535 }
6536 } else if (prefix == "osd crush rule dump") {
6537 string name;
6538 cmd_getval(cmdmap, "name", name);
6539 string format;
6540 cmd_getval(cmdmap, "format", format);
6541 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6542 if (name == "") {
6543 f->open_array_section("rules");
6544 osdmap.crush->dump_rules(f.get());
6545 f->close_section();
6546 } else {
6547 int ruleno = osdmap.crush->get_rule_id(name);
6548 if (ruleno < 0) {
6549 ss << "unknown crush rule '" << name << "'";
6550 r = ruleno;
6551 goto reply;
6552 }
6553 osdmap.crush->dump_rule(ruleno, f.get());
6554 }
6555 ostringstream rs;
6556 f->flush(rs);
6557 rs << "\n";
6558 rdata.append(rs.str());
6559 } else if (prefix == "osd crush dump") {
6560 string format;
6561 cmd_getval(cmdmap, "format", format);
6562 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6563 f->open_object_section("crush_map");
6564 osdmap.crush->dump(f.get());
6565 f->close_section();
6566 ostringstream rs;
6567 f->flush(rs);
6568 rs << "\n";
6569 rdata.append(rs.str());
6570 } else if (prefix == "osd crush show-tunables") {
6571 string format;
6572 cmd_getval(cmdmap, "format", format);
6573 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6574 f->open_object_section("crush_map_tunables");
6575 osdmap.crush->dump_tunables(f.get());
6576 f->close_section();
6577 ostringstream rs;
6578 f->flush(rs);
6579 rs << "\n";
6580 rdata.append(rs.str());
6581 } else if (prefix == "osd crush tree") {
6582 string shadow;
6583 cmd_getval(cmdmap, "shadow", shadow);
6584 bool show_shadow = shadow == "--show-shadow";
6585 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6586 if (f) {
6587 f->open_object_section("crush_tree");
6588 osdmap.crush->dump_tree(nullptr,
6589 f.get(),
6590 osdmap.get_pool_names(),
6591 show_shadow);
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 ostringstream ss;
6596 osdmap.crush->dump_tree(&ss,
6597 nullptr,
6598 osdmap.get_pool_names(),
6599 show_shadow);
6600 rdata.append(ss.str());
6601 }
6602 } else if (prefix == "osd crush ls") {
6603 string name;
6604 if (!cmd_getval(cmdmap, "node", name)) {
6605 ss << "no node specified";
6606 r = -EINVAL;
6607 goto reply;
6608 }
6609 if (!osdmap.crush->name_exists(name)) {
6610 ss << "node '" << name << "' does not exist";
6611 r = -ENOENT;
6612 goto reply;
6613 }
6614 int id = osdmap.crush->get_item_id(name);
6615 list<int> result;
6616 if (id >= 0) {
6617 result.push_back(id);
6618 } else {
6619 int num = osdmap.crush->get_bucket_size(id);
6620 for (int i = 0; i < num; ++i) {
6621 result.push_back(osdmap.crush->get_bucket_item(id, i));
6622 }
6623 }
6624 if (f) {
6625 f->open_array_section("items");
6626 for (auto i : result) {
6627 f->dump_string("item", osdmap.crush->get_item_name(i));
6628 }
6629 f->close_section();
6630 f->flush(rdata);
6631 } else {
6632 ostringstream ss;
6633 for (auto i : result) {
6634 ss << osdmap.crush->get_item_name(i) << "\n";
6635 }
6636 rdata.append(ss.str());
6637 }
6638 r = 0;
6639 } else if (prefix == "osd crush class ls") {
6640 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6641 f->open_array_section("crush_classes");
6642 for (auto i : osdmap.crush->class_name)
6643 f->dump_string("class", i.second);
6644 f->close_section();
6645 f->flush(rdata);
6646 } else if (prefix == "osd crush class ls-osd") {
6647 string name;
6648 cmd_getval(cmdmap, "class", name);
6649 set<int> osds;
6650 osdmap.crush->get_devices_by_class(name, &osds);
6651 if (f) {
6652 f->open_array_section("osds");
6653 for (auto &osd: osds)
6654 f->dump_int("osd", osd);
6655 f->close_section();
6656 f->flush(rdata);
6657 } else {
6658 bool first = true;
6659 for (auto &osd : osds) {
6660 if (!first)
6661 ds << "\n";
6662 first = false;
6663 ds << osd;
6664 }
6665 rdata.append(ds);
6666 }
6667 } else if (prefix == "osd crush get-device-class") {
6668 vector<string> idvec;
6669 cmd_getval(cmdmap, "ids", idvec);
6670 map<int, string> class_by_osd;
6671 for (auto& id : idvec) {
6672 ostringstream ts;
6673 long osd = parse_osd_id(id.c_str(), &ts);
6674 if (osd < 0) {
6675 ss << "unable to parse osd id:'" << id << "'";
6676 r = -EINVAL;
6677 goto reply;
6678 }
6679 auto device_class = osdmap.crush->get_item_class(osd);
6680 if (device_class)
6681 class_by_osd[osd] = device_class;
6682 else
6683 class_by_osd[osd] = ""; // no class
6684 }
6685 if (f) {
6686 f->open_array_section("osd_device_classes");
6687 for (auto& i : class_by_osd) {
6688 f->open_object_section("osd_device_class");
6689 f->dump_int("osd", i.first);
6690 f->dump_string("device_class", i.second);
6691 f->close_section();
6692 }
6693 f->close_section();
6694 f->flush(rdata);
6695 } else {
6696 if (class_by_osd.size() == 1) {
6697 // for single input, make a clean output
6698 ds << class_by_osd.begin()->second;
6699 } else {
6700 // note that we do not group osds by class here
6701 for (auto it = class_by_osd.begin();
6702 it != class_by_osd.end();
6703 it++) {
6704 ds << "osd." << it->first << ' ' << it->second;
6705 if (next(it) != class_by_osd.end())
6706 ds << '\n';
6707 }
6708 }
6709 rdata.append(ds);
6710 }
6711 } else if (prefix == "osd erasure-code-profile ls") {
6712 const auto &profiles = osdmap.get_erasure_code_profiles();
6713 if (f)
6714 f->open_array_section("erasure-code-profiles");
6715 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6716 if (f)
6717 f->dump_string("profile", i->first.c_str());
6718 else
6719 rdata.append(i->first + "\n");
6720 }
6721 if (f) {
6722 f->close_section();
6723 ostringstream rs;
6724 f->flush(rs);
6725 rs << "\n";
6726 rdata.append(rs.str());
6727 }
6728 } else if (prefix == "osd crush weight-set ls") {
6729 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6730 if (f) {
6731 f->open_array_section("weight_sets");
6732 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6733 f->dump_string("pool", "(compat)");
6734 }
6735 for (auto& i : osdmap.crush->choose_args) {
6736 if (i.first >= 0) {
6737 f->dump_string("pool", osdmap.get_pool_name(i.first));
6738 }
6739 }
6740 f->close_section();
6741 f->flush(rdata);
6742 } else {
6743 ostringstream rs;
6744 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6745 rs << "(compat)\n";
6746 }
6747 for (auto& i : osdmap.crush->choose_args) {
6748 if (i.first >= 0) {
6749 rs << osdmap.get_pool_name(i.first) << "\n";
6750 }
6751 }
6752 rdata.append(rs.str());
6753 }
6754 } else if (prefix == "osd crush weight-set dump") {
6755 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6756 "json-pretty"));
6757 osdmap.crush->dump_choose_args(f.get());
6758 f->flush(rdata);
6759 } else if (prefix == "osd erasure-code-profile get") {
6760 string name;
6761 cmd_getval(cmdmap, "name", name);
6762 if (!osdmap.has_erasure_code_profile(name)) {
6763 ss << "unknown erasure code profile '" << name << "'";
6764 r = -ENOENT;
6765 goto reply;
6766 }
6767 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6768 if (f)
6769 f->open_object_section("profile");
6770 for (map<string,string>::const_iterator i = profile.begin();
6771 i != profile.end();
6772 ++i) {
6773 if (f)
6774 f->dump_string(i->first.c_str(), i->second.c_str());
6775 else
6776 rdata.append(i->first + "=" + i->second + "\n");
6777 }
6778 if (f) {
6779 f->close_section();
6780 ostringstream rs;
6781 f->flush(rs);
6782 rs << "\n";
6783 rdata.append(rs.str());
6784 }
6785 } else if (prefix == "osd pool application get") {
6786 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6787 "json-pretty"));
6788 string pool_name;
6789 cmd_getval(cmdmap, "pool", pool_name);
6790 string app;
6791 cmd_getval(cmdmap, "app", app);
6792 string key;
6793 cmd_getval(cmdmap, "key", key);
6794
6795 if (pool_name.empty()) {
6796 // all
6797 f->open_object_section("pools");
6798 for (const auto &pool : osdmap.pools) {
6799 std::string name("<unknown>");
6800 const auto &pni = osdmap.pool_name.find(pool.first);
6801 if (pni != osdmap.pool_name.end())
6802 name = pni->second;
6803 f->open_object_section(name.c_str());
6804 for (auto &app_pair : pool.second.application_metadata) {
6805 f->open_object_section(app_pair.first.c_str());
6806 for (auto &kv_pair : app_pair.second) {
6807 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6808 }
6809 f->close_section();
6810 }
6811 f->close_section(); // name
6812 }
6813 f->close_section(); // pools
6814 f->flush(rdata);
6815 } else {
6816 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6817 if (pool < 0) {
6818 ss << "unrecognized pool '" << pool_name << "'";
6819 r = -ENOENT;
6820 goto reply;
6821 }
6822 auto p = osdmap.get_pg_pool(pool);
6823 // filter by pool
6824 if (app.empty()) {
6825 f->open_object_section(pool_name.c_str());
6826 for (auto &app_pair : p->application_metadata) {
6827 f->open_object_section(app_pair.first.c_str());
6828 for (auto &kv_pair : app_pair.second) {
6829 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6830 }
6831 f->close_section(); // application
6832 }
6833 f->close_section(); // pool_name
6834 f->flush(rdata);
6835 goto reply;
6836 }
6837
6838 auto app_it = p->application_metadata.find(app);
6839 if (app_it == p->application_metadata.end()) {
6840 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6841 r = -ENOENT;
6842 goto reply;
6843 }
6844 // filter by pool + app
6845 if (key.empty()) {
6846 f->open_object_section(app_it->first.c_str());
6847 for (auto &kv_pair : app_it->second) {
6848 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6849 }
6850 f->close_section(); // application
6851 f->flush(rdata);
6852 goto reply;
6853 }
6854 // filter by pool + app + key
6855 auto key_it = app_it->second.find(key);
6856 if (key_it == app_it->second.end()) {
6857 ss << "application '" << app << "' on pool '" << pool_name
6858 << "' does not have key '" << key << "'";
6859 r = -ENOENT;
6860 goto reply;
6861 }
6862 ss << key_it->second << "\n";
6863 rdata.append(ss.str());
6864 ss.str("");
6865 }
6866 } else if (prefix == "osd get-require-min-compat-client") {
6867 ss << osdmap.require_min_compat_client << std::endl;
6868 rdata.append(ss.str());
6869 ss.str("");
6870 goto reply;
6871 } else if (prefix == "osd pool application enable" ||
6872 prefix == "osd pool application disable" ||
6873 prefix == "osd pool application set" ||
6874 prefix == "osd pool application rm") {
6875 bool changed = false;
6876 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6877 if (r != 0) {
6878 // Error, reply.
6879 goto reply;
6880 } else if (changed) {
6881 // Valid mutation, proceed to prepare phase
6882 return false;
6883 } else {
6884 // Idempotent case, reply
6885 goto reply;
6886 }
6887 } else {
6888 // try prepare update
6889 return false;
6890 }
6891
6892 reply:
6893 string rs;
6894 getline(ss, rs);
6895 mon->reply_command(op, r, rs, rdata, get_last_committed());
6896 return true;
6897 }
6898
6899 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6900 {
6901 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6902 osdmap.get_pg_pool(pool_id));
6903 ceph_assert(pool);
6904 pool->set_flag(flags);
6905 }
6906
6907 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6908 {
6909 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6910 osdmap.get_pg_pool(pool_id));
6911 ceph_assert(pool);
6912 pool->unset_flag(flags);
6913 }
6914
6915 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6916 {
6917 char k[80];
6918 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6919 return k;
6920 }
6921
6922 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6923 {
6924 char k[80];
6925 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6926 (unsigned long long)pool, (unsigned long long)snap);
6927 return k;
6928 }
6929
// Encode the value blob for a purged-snap record — the interval
// [snap, snap+num) plus the epoch it was purged in — and return the key
// the record should be stored under.
//
// The key embeds the *last* snap of the interval (snap + num - 1), so a
// forward lower_bound() scan starting from any snap id lands on the first
// record whose interval could still contain it; see lookup_purged_snap().
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // value payload: interval begin, interval end (exclusive), purge epoch
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
6941
6942
// Look up the purged-snap interval containing `snap` for `pool`.
//
// Records are keyed by the *last* snap of their interval (see
// make_purged_snap_key_value()), so lower_bound() on the key for `snap`
// lands on the first record whose interval could contain it.
//
// On success returns 0 and sets *begin/*end to the stored interval
// [begin, end).  Returns -ENOENT when no stored interval contains `snap`.
// Note that on the "no overlap" failure path *begin/*end have already
// been written with the non-matching record's values.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // no key at or after ours; nothing stored for this snap
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on some other kind of record under OSD_SNAP_PREFIX
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we actually landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // scanned past this pool's records into another pool's
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // decode the stored [begin, end) interval and check it covers snap
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
6992
// Record that snaps [start, end) of `pool` have been purged, coalescing
// the new interval with any adjacent stored intervals so the purged-snap
// records stay maximal and non-overlapping.
//
// `b`/`a` probe for stored intervals abutting the new one on the left
// (containing start-1) and on the right (containing end); the four
// branches below handle join-both, join-left, join-right, and standalone.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges the gap between the earlier and later intervals
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one
    // (the merged record is keyed by after_end - 1, same as the old
    // "after" record, so the put below replaces it in place).
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends the earlier interval rightwards; its key changes (keys
    // embed the last snap), so erase the old record and write anew
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends the later interval leftwards; the key (last snap) is
    // unchanged, so a plain put overwrites the existing record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no adjacent intervals; write a standalone record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7048
// Move snaps that the OSDs report as fully purged (via the mgr's stats
// digest) out of removed_snaps_queue by queuing them in
// pending_inc.new_purged_snaps, pruning at most
// mon_max_snap_prune_per_epoch snaps per epoch.
//
// Returns true if anything was queued for pruning this epoch.
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon->mgrstatmon()->is_readable()) {
    // need the mgr digest to know what the OSDs have purged
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	// per-epoch budget reached for this pool's candidate set
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      // global per-epoch budget reached; stop iterating pools
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7127
// Compare each pool's usage (from the mgr stats digest) against its
// quota_max_bytes / quota_max_objects and toggle the FLAG_FULL_QUOTA /
// FLAG_FULL pool flags in the pending map accordingly.
//
// Returns true if any pool's flags changed (i.e. a new map should be
// proposed).
bool OSDMonitor::update_pools_status()
{
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a quota of 0 means "no quota"; only a configured quota can fill
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked full-by-quota: clear the flags if usage dropped
      if (pool_is_full)
	continue;

      mon->clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently marked: set the flags if a quota is now exceeded
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7188
7189 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7190 {
7191 op->mark_osdmon_event(__func__);
7192 auto m = op->get_req<MPoolOp>();
7193 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7194 MonSession *session = op->get_session();
7195 if (!session)
7196 return -EPERM;
7197 string erasure_code_profile;
7198 stringstream ss;
7199 string rule_name;
7200 int ret = 0;
7201 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7202 0, 0, 0, 0, 0, 0.0,
7203 erasure_code_profile,
7204 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7205 &ss);
7206
7207 if (ret < 0) {
7208 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7209 }
7210 return ret;
7211 }
7212
7213 int OSDMonitor::crush_rename_bucket(const string& srcname,
7214 const string& dstname,
7215 ostream *ss)
7216 {
7217 int ret;
7218 //
7219 // Avoid creating a pending crush if it does not already exists and
7220 // the rename would fail.
7221 //
7222 if (!_have_pending_crush()) {
7223 ret = _get_stable_crush().can_rename_bucket(srcname,
7224 dstname,
7225 ss);
7226 if (ret)
7227 return ret;
7228 }
7229
7230 CrushWrapper newcrush;
7231 _get_pending_crush(newcrush);
7232
7233 ret = newcrush.rename_bucket(srcname,
7234 dstname,
7235 ss);
7236 if (ret)
7237 return ret;
7238
7239 pending_inc.crush.clear();
7240 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7241 *ss << "renamed bucket " << srcname << " into " << dstname;
7242 return 0;
7243 }
7244
7245 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7246 {
7247 string replacement = "";
7248
7249 if (plugin == "jerasure_generic" ||
7250 plugin == "jerasure_sse3" ||
7251 plugin == "jerasure_sse4" ||
7252 plugin == "jerasure_neon") {
7253 replacement = "jerasure";
7254 } else if (plugin == "shec_generic" ||
7255 plugin == "shec_sse3" ||
7256 plugin == "shec_sse4" ||
7257 plugin == "shec_neon") {
7258 replacement = "shec";
7259 }
7260
7261 if (replacement != "") {
7262 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7263 << plugin << " that has been deprecated. Please use "
7264 << replacement << " instead." << dendl;
7265 }
7266 }
7267
7268 int OSDMonitor::normalize_profile(const string& profilename,
7269 ErasureCodeProfile &profile,
7270 bool force,
7271 ostream *ss)
7272 {
7273 ErasureCodeInterfaceRef erasure_code;
7274 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7275 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7276 check_legacy_ec_plugin(plugin->second, profilename);
7277 int err = instance.factory(plugin->second,
7278 g_conf().get_val<std::string>("erasure_code_dir"),
7279 profile, &erasure_code, ss);
7280 if (err) {
7281 return err;
7282 }
7283
7284 err = erasure_code->init(profile, ss);
7285 if (err) {
7286 return err;
7287 }
7288
7289 auto it = profile.find("stripe_unit");
7290 if (it != profile.end()) {
7291 string err_str;
7292 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7293 if (!err_str.empty()) {
7294 *ss << "could not parse stripe_unit '" << it->second
7295 << "': " << err_str << std::endl;
7296 return -EINVAL;
7297 }
7298 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7299 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7300 if (chunk_size != stripe_unit) {
7301 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7302 << "alignment. Would be padded to " << chunk_size
7303 << std::endl;
7304 return -EINVAL;
7305 }
7306 if ((stripe_unit % 4096) != 0 && !force) {
7307 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7308 << "use --force to override this check" << std::endl;
7309 return -EINVAL;
7310 }
7311 }
7312 return 0;
7313 }
7314
7315 int OSDMonitor::crush_rule_create_erasure(const string &name,
7316 const string &profile,
7317 int *rule,
7318 ostream *ss)
7319 {
7320 int ruleid = osdmap.crush->get_rule_id(name);
7321 if (ruleid != -ENOENT) {
7322 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7323 return -EEXIST;
7324 }
7325
7326 CrushWrapper newcrush;
7327 _get_pending_crush(newcrush);
7328
7329 ruleid = newcrush.get_rule_id(name);
7330 if (ruleid != -ENOENT) {
7331 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7332 return -EALREADY;
7333 } else {
7334 ErasureCodeInterfaceRef erasure_code;
7335 int err = get_erasure_code(profile, &erasure_code, ss);
7336 if (err) {
7337 *ss << "failed to load plugin using profile " << profile << std::endl;
7338 return err;
7339 }
7340
7341 err = erasure_code->create_rule(name, newcrush, ss);
7342 erasure_code.reset();
7343 if (err < 0)
7344 return err;
7345 *rule = err;
7346 pending_inc.crush.clear();
7347 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7348 return 0;
7349 }
7350 }
7351
7352 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7353 ErasureCodeInterfaceRef *erasure_code,
7354 ostream *ss) const
7355 {
7356 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7357 return -EAGAIN;
7358 ErasureCodeProfile profile =
7359 osdmap.get_erasure_code_profile(erasure_code_profile);
7360 ErasureCodeProfile::const_iterator plugin =
7361 profile.find("plugin");
7362 if (plugin == profile.end()) {
7363 *ss << "cannot determine the erasure code plugin"
7364 << " because there is no 'plugin' entry in the erasure_code_profile "
7365 << profile << std::endl;
7366 return -EINVAL;
7367 }
7368 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7369 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7370 return instance.factory(plugin->second,
7371 g_conf().get_val<std::string>("erasure_code_dir"),
7372 profile, erasure_code, ss);
7373 }
7374
7375 int OSDMonitor::check_cluster_features(uint64_t features,
7376 stringstream &ss)
7377 {
7378 stringstream unsupported_ss;
7379 int unsupported_count = 0;
7380 if ((mon->get_quorum_con_features() & features) != features) {
7381 unsupported_ss << "the monitor cluster";
7382 ++unsupported_count;
7383 }
7384
7385 set<int32_t> up_osds;
7386 osdmap.get_up_osds(up_osds);
7387 for (set<int32_t>::iterator it = up_osds.begin();
7388 it != up_osds.end(); ++it) {
7389 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7390 if ((xi.features & features) != features) {
7391 if (unsupported_count > 0)
7392 unsupported_ss << ", ";
7393 unsupported_ss << "osd." << *it;
7394 unsupported_count ++;
7395 }
7396 }
7397
7398 if (unsupported_count > 0) {
7399 ss << "features " << features << " unsupported by: "
7400 << unsupported_ss.str();
7401 return -ENOTSUP;
7402 }
7403
7404 // check pending osd state, too!
7405 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7406 pending_inc.new_xinfo.begin();
7407 p != pending_inc.new_xinfo.end(); ++p) {
7408 const osd_xinfo_t &xi = p->second;
7409 if ((xi.features & features) != features) {
7410 dout(10) << __func__ << " pending osd." << p->first
7411 << " features are insufficient; retry" << dendl;
7412 return -EAGAIN;
7413 }
7414 }
7415
7416 return 0;
7417 }
7418
7419 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7420 stringstream& ss)
7421 {
7422 OSDMap::Incremental new_pending = pending_inc;
7423 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7424 OSDMap newmap;
7425 newmap.deepish_copy_from(osdmap);
7426 newmap.apply_incremental(new_pending);
7427
7428 // client compat
7429 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7430 auto mv = newmap.get_min_compat_client();
7431 if (mv > newmap.require_min_compat_client) {
7432 ss << "new crush map requires client version " << mv
7433 << " but require_min_compat_client is "
7434 << newmap.require_min_compat_client;
7435 return false;
7436 }
7437 }
7438
7439 // osd compat
7440 uint64_t features =
7441 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7442 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7443 stringstream features_ss;
7444 int r = check_cluster_features(features, features_ss);
7445 if (r) {
7446 ss << "Could not change CRUSH: " << features_ss.str();
7447 return false;
7448 }
7449
7450 return true;
7451 }
7452
7453 bool OSDMonitor::erasure_code_profile_in_use(
7454 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7455 const string &profile,
7456 ostream *ss)
7457 {
7458 bool found = false;
7459 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7460 p != pools.end();
7461 ++p) {
7462 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7463 *ss << osdmap.pool_name[p->first] << " ";
7464 found = true;
7465 }
7466 }
7467 if (found) {
7468 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7469 }
7470 return found;
7471 }
7472
7473 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7474 map<string,string> *erasure_code_profile_map,
7475 ostream *ss)
7476 {
7477 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7478 get_json_str_map,
7479 *ss,
7480 erasure_code_profile_map,
7481 true);
7482 if (r)
7483 return r;
7484 ceph_assert((*erasure_code_profile_map).count("plugin"));
7485 string default_plugin = (*erasure_code_profile_map)["plugin"];
7486 map<string,string> user_map;
7487 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7488 i != erasure_code_profile.end();
7489 ++i) {
7490 size_t equal = i->find('=');
7491 if (equal == string::npos) {
7492 user_map[*i] = string();
7493 (*erasure_code_profile_map)[*i] = string();
7494 } else {
7495 const string key = i->substr(0, equal);
7496 equal++;
7497 const string value = i->substr(equal);
7498 if (key.find("ruleset-") == 0) {
7499 *ss << "property '" << key << "' is no longer supported; try "
7500 << "'crush-" << key.substr(8) << "' instead";
7501 return -EINVAL;
7502 }
7503 user_map[key] = value;
7504 (*erasure_code_profile_map)[key] = value;
7505 }
7506 }
7507
7508 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7509 (*erasure_code_profile_map) = user_map;
7510
7511 return 0;
7512 }
7513
7514 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7515 const string &erasure_code_profile,
7516 uint8_t repl_size,
7517 unsigned *size, unsigned *min_size,
7518 ostream *ss)
7519 {
7520 int err = 0;
7521 switch (pool_type) {
7522 case pg_pool_t::TYPE_REPLICATED:
7523 if (repl_size == 0) {
7524 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7525 }
7526 *size = repl_size;
7527 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7528 break;
7529 case pg_pool_t::TYPE_ERASURE:
7530 {
7531 ErasureCodeInterfaceRef erasure_code;
7532 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7533 if (err == 0) {
7534 *size = erasure_code->get_chunk_count();
7535 *min_size =
7536 erasure_code->get_data_chunk_count() +
7537 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7538 assert(*min_size <= *size);
7539 assert(*min_size >= erasure_code->get_data_chunk_count());
7540 }
7541 }
7542 break;
7543 default:
7544 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7545 err = -EINVAL;
7546 break;
7547 }
7548 return err;
7549 }
7550
7551 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7552 const string &erasure_code_profile,
7553 uint32_t *stripe_width,
7554 ostream *ss)
7555 {
7556 int err = 0;
7557 switch (pool_type) {
7558 case pg_pool_t::TYPE_REPLICATED:
7559 // ignored
7560 break;
7561 case pg_pool_t::TYPE_ERASURE:
7562 {
7563 ErasureCodeProfile profile =
7564 osdmap.get_erasure_code_profile(erasure_code_profile);
7565 ErasureCodeInterfaceRef erasure_code;
7566 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7567 if (err)
7568 break;
7569 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7570 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7571 auto it = profile.find("stripe_unit");
7572 if (it != profile.end()) {
7573 string err_str;
7574 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7575 ceph_assert(err_str.empty());
7576 }
7577 *stripe_width = data_chunks *
7578 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7579 }
7580 break;
7581 default:
7582 *ss << "prepare_pool_stripe_width: "
7583 << pool_type << " is not a known pool type";
7584 err = -EINVAL;
7585 break;
7586 }
7587 return err;
7588 }
7589
// Resolve the crush rule for a new pool.
//
// If *crush_rule >= 0 the caller specified an explicit rule id; it only
// needs to exist.  Otherwise the rule is chosen by pool type:
//  - replicated: the configured default rule, or the named rule;
//  - erasure: find-or-create a rule from the EC profile, returning
//    -EAGAIN until the newly created rule has been proposed/committed.
//
// @return 0 on success; -EAGAIN when the caller must retry after the
//         next proposal; other negative errno on error.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  // staged in the pending map but not committed yet
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // already committed; usable immediately
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule id; just validate that it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7651
7652 int OSDMonitor::get_crush_rule(const string &rule_name,
7653 int *crush_rule,
7654 ostream *ss)
7655 {
7656 int ret;
7657 ret = osdmap.crush->get_rule_id(rule_name);
7658 if (ret != -ENOENT) {
7659 // found it, use it
7660 *crush_rule = ret;
7661 } else {
7662 CrushWrapper newcrush;
7663 _get_pending_crush(newcrush);
7664
7665 ret = newcrush.get_rule_id(rule_name);
7666 if (ret != -ENOENT) {
7667 // found it, wait for it to be proposed
7668 dout(20) << __func__ << ": rule " << rule_name
7669 << " try again" << dendl;
7670 return -EAGAIN;
7671 } else {
7672 // Cannot find it , return error
7673 *ss << "specified rule " << rule_name << " doesn't exist";
7674 return ret;
7675 }
7676 }
7677 return 0;
7678 }
7679
7680 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7681 {
7682 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7683 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7684 auto max_pgs = max_pgs_per_osd * num_osds;
7685 uint64_t projected = 0;
7686 if (pool < 0) {
7687 projected += pg_num * size;
7688 }
7689 for (const auto& i : osdmap.get_pools()) {
7690 if (i.first == pool) {
7691 projected += pg_num * size;
7692 } else {
7693 projected += i.second.get_pg_num_target() * i.second.get_size();
7694 }
7695 }
7696 if (projected > max_pgs) {
7697 if (pool >= 0) {
7698 *ss << "pool id " << pool;
7699 }
7700 *ss << " pg_num " << pg_num << " size " << size
7701 << " would mean " << projected
7702 << " total pgs, which exceeds max " << max_pgs
7703 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7704 << " * num_in_osds " << num_osds << ")";
7705 return -ERANGE;
7706 }
7707 return 0;
7708 }
7709
7710 /**
7711 * @param name The name of the new pool
7712 * @param crush_rule The crush rule to use. If <0, will use the system default
7713 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7714 * @param pg_num The pg_num to use. If set to 0, will use the system default
7715 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7716 * @param repl_size Replication factor, or 0 for default
7717 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7718 * @param pool_type TYPE_ERASURE, or TYPE_REP
7719 * @param expected_num_objects expected number of objects on the pool
7720 * @param fast_read fast read type.
7721 * @param ss human readable error message, if any.
7722 *
7723 * @return 0 on success, negative errno on failure.
7724 */
7725 int OSDMonitor::prepare_new_pool(string& name,
7726 int crush_rule,
7727 const string &crush_rule_name,
7728 unsigned pg_num, unsigned pgp_num,
7729 unsigned pg_num_min,
7730 const uint64_t repl_size,
7731 const uint64_t target_size_bytes,
7732 const float target_size_ratio,
7733 const string &erasure_code_profile,
7734 const unsigned pool_type,
7735 const uint64_t expected_num_objects,
7736 FastReadType fast_read,
7737 const string& pg_autoscale_mode,
7738 ostream *ss)
7739 {
7740 if (name.length() == 0)
7741 return -EINVAL;
7742 if (pg_num == 0)
7743 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
7744 if (pgp_num == 0)
7745 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
7746 if (!pgp_num)
7747 pgp_num = pg_num;
7748 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
7749 *ss << "'pg_num' must be greater than 0 and less than or equal to "
7750 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
7751 << " (you may adjust 'mon max pool pg num' for higher values)";
7752 return -ERANGE;
7753 }
7754 if (pgp_num > pg_num) {
7755 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7756 << ", which in this case is " << pg_num;
7757 return -ERANGE;
7758 }
7759 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
7760 *ss << "'fast_read' can only apply to erasure coding pool";
7761 return -EINVAL;
7762 }
7763 int r;
7764 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
7765 crush_rule_name, &crush_rule, ss);
7766 if (r) {
7767 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7768 return r;
7769 }
7770 if (g_conf()->mon_osd_crush_smoke_test) {
7771 CrushWrapper newcrush;
7772 _get_pending_crush(newcrush);
7773 ostringstream err;
7774 CrushTester tester(newcrush, err);
7775 tester.set_min_x(0);
7776 tester.set_max_x(50);
7777 tester.set_rule(crush_rule);
7778 auto start = ceph::coarse_mono_clock::now();
7779 r = tester.test_with_fork(g_conf()->mon_lease);
7780 auto duration = ceph::coarse_mono_clock::now() - start;
7781 if (r < 0) {
7782 dout(10) << "tester.test_with_fork returns " << r
7783 << ": " << err.str() << dendl;
7784 *ss << "crush test failed with " << r << ": " << err.str();
7785 return r;
7786 }
7787 dout(10) << __func__ << " crush smoke test duration: "
7788 << duration << dendl;
7789 }
7790 unsigned size, min_size;
7791 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
7792 &size, &min_size, ss);
7793 if (r) {
7794 dout(10) << "prepare_pool_size returns " << r << dendl;
7795 return r;
7796 }
7797 r = check_pg_num(-1, pg_num, size, ss);
7798 if (r) {
7799 dout(10) << "check_pg_num returns " << r << dendl;
7800 return r;
7801 }
7802
7803 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7804 return -EINVAL;
7805 }
7806
7807 uint32_t stripe_width = 0;
7808 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
7809 if (r) {
7810 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7811 return r;
7812 }
7813
7814 bool fread = false;
7815 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7816 switch (fast_read) {
7817 case FAST_READ_OFF:
7818 fread = false;
7819 break;
7820 case FAST_READ_ON:
7821 fread = true;
7822 break;
7823 case FAST_READ_DEFAULT:
7824 fread = g_conf()->osd_pool_default_ec_fast_read;
7825 break;
7826 default:
7827 *ss << "invalid fast_read setting: " << fast_read;
7828 return -EINVAL;
7829 }
7830 }
7831
7832 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
7833 p != pending_inc.new_pool_names.end();
7834 ++p) {
7835 if (p->second == name)
7836 return 0;
7837 }
7838
7839 if (-1 == pending_inc.new_pool_max)
7840 pending_inc.new_pool_max = osdmap.pool_max;
7841 int64_t pool = ++pending_inc.new_pool_max;
7842 pg_pool_t empty;
7843 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
7844 pi->create_time = ceph_clock_now();
7845 pi->type = pool_type;
7846 pi->fast_read = fread;
7847 pi->flags = g_conf()->osd_pool_default_flags;
7848 if (g_conf()->osd_pool_default_flag_hashpspool)
7849 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
7850 if (g_conf()->osd_pool_default_flag_nodelete)
7851 pi->set_flag(pg_pool_t::FLAG_NODELETE);
7852 if (g_conf()->osd_pool_default_flag_nopgchange)
7853 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
7854 if (g_conf()->osd_pool_default_flag_nosizechange)
7855 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
7856 pi->set_flag(pg_pool_t::FLAG_CREATING);
7857 if (g_conf()->osd_pool_use_gmt_hitset)
7858 pi->use_gmt_hitset = true;
7859 else
7860 pi->use_gmt_hitset = false;
7861
7862 pi->size = size;
7863 pi->min_size = min_size;
7864 pi->crush_rule = crush_rule;
7865 pi->expected_num_objects = expected_num_objects;
7866 pi->object_hash = CEPH_STR_HASH_RJENKINS;
7867
7868 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7869 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7870 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7871 pi->pg_autoscale_mode = m;
7872 } else {
7873 pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
7874 }
7875 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7876 pi->set_pg_num(
7877 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7878 : pg_num);
7879 pi->set_pg_num_pending(pi->get_pg_num());
7880 pi->set_pg_num_target(pg_num);
7881 pi->set_pgp_num(pi->get_pg_num());
7882 pi->set_pgp_num_target(pgp_num);
7883 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7884 pg_num_min) {
7885 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7886 }
7887 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7888 pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7889 pi->pg_autoscale_mode = m;
7890 }
7891
7892 pi->last_change = pending_inc.epoch;
7893 pi->auid = 0;
7894
7895 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7896 pi->erasure_code_profile = erasure_code_profile;
7897 } else {
7898 pi->erasure_code_profile = "";
7899 }
7900 pi->stripe_width = stripe_width;
7901
7902 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7903 target_size_bytes) {
7904 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7905 // larger than int32_t max.
7906 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7907 }
7908 if (target_size_ratio > 0.0 &&
7909 osdmap.require_osd_release >= ceph_release_t::nautilus) {
7910 // only store for nautilus+, just to be consistent and tidy.
7911 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7912 }
7913
7914 pi->cache_target_dirty_ratio_micro =
7915 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7916 pi->cache_target_dirty_high_ratio_micro =
7917 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7918 pi->cache_target_full_ratio_micro =
7919 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7920 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7921 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7922
7923 pending_inc.new_pool_names[pool] = name;
7924 return 0;
7925 }
7926
7927 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7928 {
7929 op->mark_osdmon_event(__func__);
7930 ostringstream ss;
7931 if (pending_inc.new_flags < 0)
7932 pending_inc.new_flags = osdmap.get_flags();
7933 pending_inc.new_flags |= flag;
7934 ss << OSDMap::get_flag_string(flag) << " is set";
7935 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7936 get_last_committed() + 1));
7937 return true;
7938 }
7939
7940 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7941 {
7942 op->mark_osdmon_event(__func__);
7943 ostringstream ss;
7944 if (pending_inc.new_flags < 0)
7945 pending_inc.new_flags = osdmap.get_flags();
7946 pending_inc.new_flags &= ~flag;
7947 ss << OSDMap::get_flag_string(flag) << " is unset";
7948 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7949 get_last_committed() + 1));
7950 return true;
7951 }
7952
/**
 * Handle "ceph osd pool set <pool> <var> <val>".
 *
 * Validates the requested property change against the current (plus any
 * already-pending) pool state, applies it to a working copy of the pool,
 * and stages the result in pending_inc.new_pools.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", and for
 *        some variables "yes_i_really_mean_it")
 * @param ss human-readable result / error message for the client
 * @return 0 on success, negative errno on validation failure
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // Work on a copy of the pool; fold in any change already staged in the
  // current pending increment so consecutive commands in one proposal
  // window compose instead of clobbering each other.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  cmd_getval(cmdmap, "val", val);

  // Variables that accept SI suffixes (k/M/G...) vs IEC binary suffixes
  // (Ki/Mi/Gi...); everything else is parsed as both int and float below.
  auto si_options = {
    "target_max_objects"
  };
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // Cache-tiering knobs are only meaningful on a cache tier pool.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    // Replica count; guarded by the nosizechange flag and the crush rule.
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    // Growing size multiplies PG instances; check the cluster PG budget.
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For EC pools min_size must cover at least the k data chunks, so
      // look up k from the pool's erasure code profile.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // Directly adjust the actual pg_num (normally driven by the mgr).
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      // Split (increase).
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // Merge (decrease): staged via pg_num_pending, nautilus+ only, and
      // only one PG at a time.
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    // User-facing pg_num: on nautilus+ this only moves the *target*; the
    // mgr walks pg_num_actual/pgp_num toward it gradually.
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
	g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    // Directly adjust the actual pgp_num (normally driven by the mgr).
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    // User-facing pgp_num: moves the target on nautilus+, the actual
    // value on older clusters.
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // Simple boolean pool flags.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // Toggling hashpspool remaps every PG in the pool, so require
    // explicit confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // One-way switch: GMT hitsets can be enabled but not disabled.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      // Also a one-way switch.
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    // Ratios are stored in micro units (uf = round(f * 1e6)).
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic per-pool options; validate the value here, then store/erase
    // it via the typed switch below.  "unset" removes the option.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
	ss << "error parsing unit value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "must set require_osd_release to nautilus or "
	   << "later before setting target_size_bytes";
	return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // Store (or unset) the option according to its declared type; a zero
    // numeric value is treated as "unset".
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  // Success: report, bump the pool's change epoch, and stage the new pool.
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8598
// Prepare-phase entry point for "osd pool application ..." commands:
// delegates to the common implementation with preparing=true so the
// pending osdmap is actually updated; modification tracking is unused.
int OSDMonitor::prepare_command_pool_application(const string &prefix,
                                                 const cmdmap_t& cmdmap,
                                                 stringstream& ss)
{
  return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
}
8605
// Preprocess-phase entry point for "osd pool application ..." commands:
// delegates with preparing=false, so the command is only validated and
// *modified reports whether it would change the pool.
int OSDMonitor::preprocess_command_pool_application(const string &prefix,
                                                    const cmdmap_t& cmdmap,
                                                    stringstream& ss,
                                                    bool *modified)
{
  return _command_pool_application(prefix, cmdmap, ss, modified, false);
}
8613
8614
8615 /**
8616 * Common logic for preprocess and prepare phases of pool application
8617 * tag commands. In preprocess mode we're only detecting invalid
8618 * commands, and determining whether it was a modification or a no-op.
8619 * In prepare mode we're actually updating the pending state.
8620 */
8621 int OSDMonitor::_command_pool_application(const string &prefix,
8622 const cmdmap_t& cmdmap,
8623 stringstream& ss,
8624 bool *modified,
8625 bool preparing)
8626 {
8627 string pool_name;
8628 cmd_getval(cmdmap, "pool", pool_name);
8629 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8630 if (pool < 0) {
8631 ss << "unrecognized pool '" << pool_name << "'";
8632 return -ENOENT;
8633 }
8634
8635 pg_pool_t p = *osdmap.get_pg_pool(pool);
8636 if (preparing) {
8637 if (pending_inc.new_pools.count(pool)) {
8638 p = pending_inc.new_pools[pool];
8639 }
8640 }
8641
8642 string app;
8643 cmd_getval(cmdmap, "app", app);
8644 bool app_exists = (p.application_metadata.count(app) > 0);
8645
8646 string key;
8647 cmd_getval(cmdmap, "key", key);
8648 if (key == "all") {
8649 ss << "key cannot be 'all'";
8650 return -EINVAL;
8651 }
8652
8653 string value;
8654 cmd_getval(cmdmap, "value", value);
8655 if (value == "all") {
8656 ss << "value cannot be 'all'";
8657 return -EINVAL;
8658 }
8659
8660 if (boost::algorithm::ends_with(prefix, "enable")) {
8661 if (app.empty()) {
8662 ss << "application name must be provided";
8663 return -EINVAL;
8664 }
8665
8666 if (p.is_tier()) {
8667 ss << "application must be enabled on base tier";
8668 return -EINVAL;
8669 }
8670
8671 bool force = false;
8672 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8673
8674 if (!app_exists && !p.application_metadata.empty() && !force) {
8675 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8676 << "application; pass --yes-i-really-mean-it to proceed anyway";
8677 return -EPERM;
8678 }
8679
8680 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8681 ss << "too many enabled applications on pool '" << pool_name << "'; "
8682 << "max " << MAX_POOL_APPLICATIONS;
8683 return -EINVAL;
8684 }
8685
8686 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8687 ss << "application name '" << app << "' too long; max length "
8688 << MAX_POOL_APPLICATION_LENGTH;
8689 return -EINVAL;
8690 }
8691
8692 if (!app_exists) {
8693 p.application_metadata[app] = {};
8694 }
8695 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8696
8697 } else if (boost::algorithm::ends_with(prefix, "disable")) {
8698 bool force = false;
8699 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8700
8701 if (!force) {
8702 ss << "Are you SURE? Disabling an application within a pool might result "
8703 << "in loss of application functionality; pass "
8704 << "--yes-i-really-mean-it to proceed anyway";
8705 return -EPERM;
8706 }
8707
8708 if (!app_exists) {
8709 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8710 << "'";
8711 return 0; // idempotent
8712 }
8713
8714 p.application_metadata.erase(app);
8715 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8716
8717 } else if (boost::algorithm::ends_with(prefix, "set")) {
8718 if (p.is_tier()) {
8719 ss << "application metadata must be set on base tier";
8720 return -EINVAL;
8721 }
8722
8723 if (!app_exists) {
8724 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8725 << "'";
8726 return -ENOENT;
8727 }
8728
8729 string key;
8730 cmd_getval(cmdmap, "key", key);
8731
8732 if (key.empty()) {
8733 ss << "key must be provided";
8734 return -EINVAL;
8735 }
8736
8737 auto &app_keys = p.application_metadata[app];
8738 if (app_keys.count(key) == 0 &&
8739 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8740 ss << "too many keys set for application '" << app << "' on pool '"
8741 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8742 return -EINVAL;
8743 }
8744
8745 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8746 ss << "key '" << app << "' too long; max length "
8747 << MAX_POOL_APPLICATION_LENGTH;
8748 return -EINVAL;
8749 }
8750
8751 string value;
8752 cmd_getval(cmdmap, "value", value);
8753 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
8754 ss << "value '" << value << "' too long; max length "
8755 << MAX_POOL_APPLICATION_LENGTH;
8756 return -EINVAL;
8757 }
8758
8759 p.application_metadata[app][key] = value;
8760 ss << "set application '" << app << "' key '" << key << "' to '"
8761 << value << "' on pool '" << pool_name << "'";
8762 } else if (boost::algorithm::ends_with(prefix, "rm")) {
8763 if (!app_exists) {
8764 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8765 << "'";
8766 return -ENOENT;
8767 }
8768
8769 string key;
8770 cmd_getval(cmdmap, "key", key);
8771 auto it = p.application_metadata[app].find(key);
8772 if (it == p.application_metadata[app].end()) {
8773 ss << "application '" << app << "' on pool '" << pool_name
8774 << "' does not have key '" << key << "'";
8775 return 0; // idempotent
8776 }
8777
8778 p.application_metadata[app].erase(it);
8779 ss << "removed application '" << app << "' key '" << key << "' on pool '"
8780 << pool_name << "'";
8781 } else {
8782 ceph_abort();
8783 }
8784
8785 if (preparing) {
8786 p.last_change = pending_inc.epoch;
8787 pending_inc.new_pools[pool] = p;
8788 }
8789
8790 // Because we fell through this far, we didn't hit no-op cases,
8791 // so pool was definitely modified
8792 if (modified != nullptr) {
8793 *modified = true;
8794 }
8795
8796 return 0;
8797 }
8798
8799 int OSDMonitor::_prepare_command_osd_crush_remove(
8800 CrushWrapper &newcrush,
8801 int32_t id,
8802 int32_t ancestor,
8803 bool has_ancestor,
8804 bool unlink_only)
8805 {
8806 int err = 0;
8807
8808 if (has_ancestor) {
8809 err = newcrush.remove_item_under(cct, id, ancestor,
8810 unlink_only);
8811 } else {
8812 err = newcrush.remove_item(cct, id, unlink_only);
8813 }
8814 return err;
8815 }
8816
// Stage an already-modified crush map into the pending incremental:
// discard any previously staged crush blob, then encode the new map with
// the feature bits common to the current monitor quorum.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8822
8823 int OSDMonitor::prepare_command_osd_crush_remove(
8824 CrushWrapper &newcrush,
8825 int32_t id,
8826 int32_t ancestor,
8827 bool has_ancestor,
8828 bool unlink_only)
8829 {
8830 int err = _prepare_command_osd_crush_remove(
8831 newcrush, id, ancestor,
8832 has_ancestor, unlink_only);
8833
8834 if (err < 0)
8835 return err;
8836
8837 ceph_assert(err == 0);
8838 do_osd_crush_remove(newcrush);
8839
8840 return 0;
8841 }
8842
// Stage removal of osd.<id> from the osdmap.  The osd must already be
// down.  Returns 0 on success, or -EBUSY if the osd is still up.
int OSDMonitor::prepare_command_osd_remove(int32_t id)
{
  if (osdmap.is_up(id)) {
    return -EBUSY;
  }

  // new_state bits are applied as an XOR against the current state (see
  // the CEPH_OSD_UP handling in prepare_command_osd_new), so flipping
  // every currently-set bit clears the osd's state entirely — including
  // EXISTS — which is how an incremental expresses removal.
  pending_inc.new_state[id] = osdmap.get_state(id);
  // forget the uuid binding for this id
  pending_inc.new_uuid[id] = uuid_d();
  // drop the osd's stored metadata, both committed and pending
  pending_metadata_rm.insert(id);
  pending_metadata.erase(id);

  return 0;
}
8856
8857 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8858 {
8859 ceph_assert(existing_id);
8860 *existing_id = -1;
8861
8862 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8863 if (!osdmap.exists(i) &&
8864 pending_inc.new_up_client.count(i) == 0 &&
8865 (pending_inc.new_state.count(i) == 0 ||
8866 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8867 *existing_id = i;
8868 return -1;
8869 }
8870 }
8871
8872 if (pending_inc.new_max_osd < 0) {
8873 return osdmap.get_max_osd();
8874 }
8875 return pending_inc.new_max_osd;
8876 }
8877
// Stage creation of an osd in pending_inc.  `id` may be -1 (allocate one)
// or a caller-chosen id; `uuid` may be zero (legacy `osd create`).  The
// chosen id is returned via *new_id.  If `device_class` is non-empty the
// osd is also placed in that crush device class (best effort).
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; validation guaranteed it matches `id`
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
	   << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd; _allocate_osd_id returns -1 then
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // brand-new id past the current max: raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // place the new osd in the requested crush device class
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id, both vs the committed map
  // and vs any pending raise
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8966
// Validate an `osd create`/`osd new` request against the committed and
// pending maps.  Return protocol:
//   0        - creation may proceed (*existing_id is -1 unless set below)
//   EEXIST   - (positive!) osd with this uuid already exists and matches
//              `id`; *existing_id is set — idempotent case, not an error
//   -EAGAIN  - an osd with this uuid/id is pending; caller should retry
//   -EEXIST  - uuid already bound to a *different* id
//   -EINVAL  - `id` in use and does not match `uuid`
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
	   << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0: the uuid is unknown to the committed map
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
	 << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9037
// Validate a legacy `osd create` request.  Refuses to resurrect a
// destroyed osd (that requires `osd new`); otherwise defers to
// validate_osd_create() with the exists-check enabled.  Shares that
// function's return protocol (0 / positive EEXIST / negative errno).
int OSDMonitor::prepare_command_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* existing_id,
    stringstream& ss)
{
  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
  ceph_assert(existing_id);
  // is_destroyed() is false for id < 0, so an unspecified id passes here
  if (osdmap.is_destroyed(id)) {
    ss << "ceph osd create has been deprecated. Please use ceph osd new "
       "instead.";
    return -EINVAL;
  }

  if (uuid.is_zero()) {
    dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
  }

  return validate_osd_create(id, uuid, true, existing_id, ss);
}
9058
// Handle `ceph osd new`: create a brand-new osd, or recreate a destroyed
// one, optionally registering its cephx/lockbox secrets and dm-crypt key.
// Returns 0 on success (updates staged), positive EEXIST when the request
// is fully idempotent (nothing to do), or a negative errno on failure.
// Requires paxos to be plugged: we may touch osdmon, authmon and the
// config-key service, and the caller owns the single proposal.
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // when recreating a destroyed osd, skip the "id already exists" check
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // -1 from _allocate_osd_id means "reuse existing_id"
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory; lockbox secret and dm-crypt key are
    // optional but must be supplied together
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // osdmap says the osd exists, but auth does not know the matching
      // secrets: the request is not actually idempotent after all.
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // new_state is an XOR mask against the current state, so listing
    // DESTROYED here *clears* the destroyed flag
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      // (again: XOR semantics — setting the UP bit here flips it off)
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9323
// Entry point for the prepare (write) phase of osd commands.  Parses the
// command's JSON into a cmdmap, checks the requesting session, and hands
// off to prepare_command_impl().  Returns true when the op has been fully
// handled (replied to or queued for a proposal).
bool OSDMonitor::prepare_command(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonCommand>();
  stringstream ss;
  cmdmap_t cmdmap;
  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
    // malformed command json; reply with the parser's message
    string rs = ss.str();
    mon->reply_command(op, -EINVAL, rs, get_last_committed());
    return true;
  }

  MonSession *session = op->get_session();
  if (!session) {
    derr << __func__ << " no session" << dendl;
    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
    return true;
  }

  return prepare_command_impl(op, cmdmap);
}
9345
9346 static int parse_reweights(CephContext *cct,
9347 const cmdmap_t& cmdmap,
9348 const OSDMap& osdmap,
9349 map<int32_t, uint32_t>* weights)
9350 {
9351 string weights_str;
9352 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9353 return -EINVAL;
9354 }
9355 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9356 json_spirit::mValue json_value;
9357 if (!json_spirit::read(weights_str, json_value)) {
9358 return -EINVAL;
9359 }
9360 if (json_value.type() != json_spirit::obj_type) {
9361 return -EINVAL;
9362 }
9363 const auto obj = json_value.get_obj();
9364 try {
9365 for (auto& osd_weight : obj) {
9366 auto osd_id = std::stoi(osd_weight.first);
9367 if (!osdmap.exists(osd_id)) {
9368 return -ENOENT;
9369 }
9370 if (osd_weight.second.type() != json_spirit::str_type) {
9371 return -EINVAL;
9372 }
9373 auto weight = std::stoul(osd_weight.second.get_str());
9374 weights->insert({osd_id, weight});
9375 }
9376 } catch (const std::logic_error& e) {
9377 return -EINVAL;
9378 }
9379 return 0;
9380 }
9381
// Handle `ceph osd destroy`: mark the osd DESTROYED, wipe its uuid, and
// remove its cephx/lockbox credentials and config-key entries.  Returns
// 0 on success (including the already-destroyed idempotent case) or
// -ENOENT if the osd does not exist.  Requires paxos to be plugged; the
// caller is responsible for proposing (see the note at the bottom).
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // -ENOENT from auth means the credentials are already gone — treat as
  // idempotent rather than an error
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  // same idempotency treatment for the config-key (lockbox) entries
  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the DESTROYED flag and clear the uuid binding
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9453
// Handle `ceph osd purge`: destroy the osd (credentials, config keys,
// DESTROYED flag), remove it from the osdmap, and remove it from crush.
// Returns 0 on success, -ENOENT if everything was already purged, or a
// negative errno on failure.  Requires paxos to be plugged and the osd
// to be down; the caller proposes the resulting pending changes.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: validate the crush removal, but don't stage it yet
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // already gone from crush — possibly a replay
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: destroy (auth/config-key/DESTROYED flag)
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy actually did work, so the purge is not idempotent
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    // nothing left to purge anywhere — report via -ENOENT
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: drop the osd from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: stage the crush removal validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
9522
9523 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9524 const cmdmap_t& cmdmap)
9525 {
9526 op->mark_osdmon_event(__func__);
9527 auto m = op->get_req<MMonCommand>();
9528 bool ret = false;
9529 stringstream ss;
9530 string rs;
9531 bufferlist rdata;
9532 int err = 0;
9533
9534 string format;
9535 cmd_getval(cmdmap, "format", format, string("plain"));
9536 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9537
9538 string prefix;
9539 cmd_getval(cmdmap, "prefix", prefix);
9540
9541 int64_t osdid;
9542 string osd_name;
9543 bool osdid_present = false;
9544 if (prefix != "osd pg-temp" &&
9545 prefix != "osd pg-upmap" &&
9546 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9547 osdid_present = cmd_getval(cmdmap, "id", osdid);
9548 }
9549 if (osdid_present) {
9550 ostringstream oss;
9551 oss << "osd." << osdid;
9552 osd_name = oss.str();
9553 }
9554
9555 // Even if there's a pending state with changes that could affect
9556 // a command, considering that said state isn't yet committed, we
9557 // just don't care about those changes if the command currently being
9558 // handled acts as a no-op against the current committed state.
9559 // In a nutshell, we assume this command happens *before*.
9560 //
9561 // Let me make this clearer:
9562 //
9563 // - If we have only one client, and that client issues some
9564 // operation that would conflict with this operation but is
9565 // still on the pending state, then we would be sure that said
9566 // operation wouldn't have returned yet, so the client wouldn't
9567 // issue this operation (unless the client didn't wait for the
9568 // operation to finish, and that would be the client's own fault).
9569 //
9570 // - If we have more than one client, each client will observe
9571 // whatever is the state at the moment of the commit. So, if we
9572 // have two clients, one issuing an unlink and another issuing a
9573 // link, and if the link happens while the unlink is still on the
9574 // pending state, from the link's point-of-view this is a no-op.
9575 // If different clients are issuing conflicting operations and
9576 // they care about that, then the clients should make sure they
9577 // enforce some kind of concurrency mechanism -- from our
9578 // perspective that's what Douglas Adams would call an SEP.
9579 //
9580 // This should be used as a general guideline for most commands handled
9581 // in this function. Adapt as you see fit, but please bear in mind that
9582 // this is the expected behavior.
9583
9584
9585 if (prefix == "osd setcrushmap" ||
9586 (prefix == "osd crush set" && !osdid_present)) {
9587 if (pending_inc.crush.length()) {
9588 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9589 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9590 return true;
9591 }
9592 dout(10) << "prepare_command setting new crush map" << dendl;
9593 bufferlist data(m->get_data());
9594 CrushWrapper crush;
9595 try {
9596 auto bl = data.cbegin();
9597 crush.decode(bl);
9598 }
9599 catch (const std::exception &e) {
9600 err = -EINVAL;
9601 ss << "Failed to parse crushmap: " << e.what();
9602 goto reply;
9603 }
9604
9605 int64_t prior_version = 0;
9606 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9607 if (prior_version == osdmap.get_crush_version() - 1) {
9608 // see if we are a resend of the last update. this is imperfect
9609 // (multiple racing updaters may not both get reliable success)
9610 // but we expect crush updaters (via this interface) to be rare-ish.
9611 bufferlist current, proposed;
9612 osdmap.crush->encode(current, mon->get_quorum_con_features());
9613 crush.encode(proposed, mon->get_quorum_con_features());
9614 if (current.contents_equal(proposed)) {
9615 dout(10) << __func__
9616 << " proposed matches current and version equals previous"
9617 << dendl;
9618 err = 0;
9619 ss << osdmap.get_crush_version();
9620 goto reply;
9621 }
9622 }
9623 if (prior_version != osdmap.get_crush_version()) {
9624 err = -EPERM;
9625 ss << "prior_version " << prior_version << " != crush version "
9626 << osdmap.get_crush_version();
9627 goto reply;
9628 }
9629 }
9630
9631 if (crush.has_legacy_rule_ids()) {
9632 err = -EINVAL;
9633 ss << "crush maps with ruleset != ruleid are no longer allowed";
9634 goto reply;
9635 }
9636 if (!validate_crush_against_features(&crush, ss)) {
9637 err = -EINVAL;
9638 goto reply;
9639 }
9640
9641 err = osdmap.validate_crush_rules(&crush, &ss);
9642 if (err < 0) {
9643 goto reply;
9644 }
9645
9646 if (g_conf()->mon_osd_crush_smoke_test) {
9647 // sanity check: test some inputs to make sure this map isn't
9648 // totally broken
9649 dout(10) << " testing map" << dendl;
9650 stringstream ess;
9651 CrushTester tester(crush, ess);
9652 tester.set_min_x(0);
9653 tester.set_max_x(50);
9654 auto start = ceph::coarse_mono_clock::now();
9655 int r = tester.test_with_fork(g_conf()->mon_lease);
9656 auto duration = ceph::coarse_mono_clock::now() - start;
9657 if (r < 0) {
9658 dout(10) << " tester.test_with_fork returns " << r
9659 << ": " << ess.str() << dendl;
9660 ss << "crush smoke test failed with " << r << ": " << ess.str();
9661 err = r;
9662 goto reply;
9663 }
9664 dout(10) << __func__ << " crush somke test duration: "
9665 << duration << ", result: " << ess.str() << dendl;
9666 }
9667
9668 pending_inc.crush = data;
9669 ss << osdmap.get_crush_version() + 1;
9670 goto update;
9671
9672 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9673 CrushWrapper newcrush;
9674 _get_pending_crush(newcrush);
9675 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9676 int bid = -1 - b;
9677 if (newcrush.bucket_exists(bid) &&
9678 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9679 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9680 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9681 }
9682 }
9683 if (!validate_crush_against_features(&newcrush, ss)) {
9684 err = -EINVAL;
9685 goto reply;
9686 }
9687 pending_inc.crush.clear();
9688 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9689 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9690 get_last_committed() + 1));
9691 return true;
9692 } else if (prefix == "osd crush set-device-class") {
9693 string device_class;
9694 if (!cmd_getval(cmdmap, "class", device_class)) {
9695 err = -EINVAL; // no value!
9696 goto reply;
9697 }
9698
9699 bool stop = false;
9700 vector<string> idvec;
9701 cmd_getval(cmdmap, "ids", idvec);
9702 CrushWrapper newcrush;
9703 _get_pending_crush(newcrush);
9704 set<int> updated;
9705 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9706 set<int> osds;
9707 // wildcard?
9708 if (j == 0 &&
9709 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9710 osdmap.get_all_osds(osds);
9711 stop = true;
9712 } else {
9713 // try traditional single osd way
9714 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9715 if (osd < 0) {
9716 // ss has reason for failure
9717 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9718 err = -EINVAL;
9719 continue;
9720 }
9721 osds.insert(osd);
9722 }
9723
9724 for (auto &osd : osds) {
9725 if (!osdmap.exists(osd)) {
9726 ss << "osd." << osd << " does not exist. ";
9727 continue;
9728 }
9729
9730 ostringstream oss;
9731 oss << "osd." << osd;
9732 string name = oss.str();
9733
9734 if (newcrush.get_max_devices() < osd + 1) {
9735 newcrush.set_max_devices(osd + 1);
9736 }
9737 string action;
9738 if (newcrush.item_exists(osd)) {
9739 action = "updating";
9740 } else {
9741 action = "creating";
9742 newcrush.set_item_name(osd, name);
9743 }
9744
9745 dout(5) << action << " crush item id " << osd << " name '" << name
9746 << "' device_class '" << device_class << "'"
9747 << dendl;
9748 err = newcrush.update_device_class(osd, device_class, name, &ss);
9749 if (err < 0) {
9750 goto reply;
9751 }
9752 if (err == 0 && !_have_pending_crush()) {
9753 if (!stop) {
9754 // for single osd only, wildcard makes too much noise
9755 ss << "set-device-class item id " << osd << " name '" << name
9756 << "' device_class '" << device_class << "': no change. ";
9757 }
9758 } else {
9759 updated.insert(osd);
9760 }
9761 }
9762 }
9763
9764 pending_inc.crush.clear();
9765 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9766 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9767 getline(ss, rs);
9768 wait_for_finished_proposal(
9769 op,
9770 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9771 return true;
9772 } else if (prefix == "osd crush rm-device-class") {
9773 bool stop = false;
9774 vector<string> idvec;
9775 cmd_getval(cmdmap, "ids", idvec);
9776 CrushWrapper newcrush;
9777 _get_pending_crush(newcrush);
9778 set<int> updated;
9779
9780 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9781 set<int> osds;
9782
9783 // wildcard?
9784 if (j == 0 &&
9785 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9786 osdmap.get_all_osds(osds);
9787 stop = true;
9788 } else {
9789 // try traditional single osd way
9790 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9791 if (osd < 0) {
9792 // ss has reason for failure
9793 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9794 err = -EINVAL;
9795 goto reply;
9796 }
9797 osds.insert(osd);
9798 }
9799
9800 for (auto &osd : osds) {
9801 if (!osdmap.exists(osd)) {
9802 ss << "osd." << osd << " does not exist. ";
9803 continue;
9804 }
9805
9806 auto class_name = newcrush.get_item_class(osd);
9807 if (!class_name) {
9808 ss << "osd." << osd << " belongs to no class, ";
9809 continue;
9810 }
9811 // note that we do not verify if class_is_in_use here
9812 // in case the device is misclassified and user wants
9813 // to overridely reset...
9814
9815 err = newcrush.remove_device_class(cct, osd, &ss);
9816 if (err < 0) {
9817 // ss has reason for failure
9818 goto reply;
9819 }
9820 updated.insert(osd);
9821 }
9822 }
9823
9824 pending_inc.crush.clear();
9825 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9826 ss << "done removing class of osd(s): " << updated;
9827 getline(ss, rs);
9828 wait_for_finished_proposal(
9829 op,
9830 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9831 return true;
9832 } else if (prefix == "osd crush class create") {
9833 string device_class;
9834 if (!cmd_getval(cmdmap, "class", device_class)) {
9835 err = -EINVAL; // no value!
9836 goto reply;
9837 }
9838 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9839 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9840 << "luminous' before using crush device classes";
9841 err = -EPERM;
9842 goto reply;
9843 }
9844 if (!_have_pending_crush() &&
9845 _get_stable_crush().class_exists(device_class)) {
9846 ss << "class '" << device_class << "' already exists";
9847 goto reply;
9848 }
9849 CrushWrapper newcrush;
9850 _get_pending_crush(newcrush);
9851 if (newcrush.class_exists(device_class)) {
9852 ss << "class '" << device_class << "' already exists";
9853 goto update;
9854 }
9855 int class_id = newcrush.get_or_create_class_id(device_class);
9856 pending_inc.crush.clear();
9857 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9858 ss << "created class " << device_class << " with id " << class_id
9859 << " to crush map";
9860 goto update;
9861 } else if (prefix == "osd crush class rm") {
9862 string device_class;
9863 if (!cmd_getval(cmdmap, "class", device_class)) {
9864 err = -EINVAL; // no value!
9865 goto reply;
9866 }
9867 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9868 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9869 << "luminous' before using crush device classes";
9870 err = -EPERM;
9871 goto reply;
9872 }
9873
9874 if (!osdmap.crush->class_exists(device_class)) {
9875 err = 0;
9876 goto reply;
9877 }
9878
9879 CrushWrapper newcrush;
9880 _get_pending_crush(newcrush);
9881 if (!newcrush.class_exists(device_class)) {
9882 err = 0; // make command idempotent
9883 goto wait;
9884 }
9885 int class_id = newcrush.get_class_id(device_class);
9886 stringstream ts;
9887 if (newcrush.class_is_in_use(class_id, &ts)) {
9888 err = -EBUSY;
9889 ss << "class '" << device_class << "' " << ts.str();
9890 goto reply;
9891 }
9892
9893 // check if class is used by any erasure-code-profiles
9894 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9895 osdmap.get_erasure_code_profiles();
9896 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9897 #ifdef HAVE_STDLIB_MAP_SPLICING
9898 ec_profiles.merge(old_ec_profiles);
9899 #else
9900 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9901 make_move_iterator(end(old_ec_profiles)));
9902 #endif
9903 list<string> referenced_by;
9904 for (auto &i: ec_profiles) {
9905 for (auto &j: i.second) {
9906 if ("crush-device-class" == j.first && device_class == j.second) {
9907 referenced_by.push_back(i.first);
9908 }
9909 }
9910 }
9911 if (!referenced_by.empty()) {
9912 err = -EBUSY;
9913 ss << "class '" << device_class
9914 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9915 goto reply;
9916 }
9917
9918 set<int> osds;
9919 newcrush.get_devices_by_class(device_class, &osds);
9920 for (auto& p: osds) {
9921 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9922 if (err < 0) {
9923 // ss has reason for failure
9924 goto reply;
9925 }
9926 }
9927
9928 if (osds.empty()) {
9929 // empty class, remove directly
9930 err = newcrush.remove_class_name(device_class);
9931 if (err < 0) {
9932 ss << "class '" << device_class << "' cannot be removed '"
9933 << cpp_strerror(err) << "'";
9934 goto reply;
9935 }
9936 }
9937
9938 pending_inc.crush.clear();
9939 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9940 ss << "removed class " << device_class << " with id " << class_id
9941 << " from crush map";
9942 goto update;
9943 } else if (prefix == "osd crush class rename") {
// 'osd crush class rename <srcname> <dstname>': rename a CRUSH device class.
9944 string srcname, dstname;
9945 if (!cmd_getval(cmdmap, "srcname", srcname)) {
9946 err = -EINVAL;
9947 goto reply;
9948 }
9949 if (!cmd_getval(cmdmap, "dstname", dstname)) {
9950 err = -EINVAL;
9951 goto reply;
9952 }
9953
9954 CrushWrapper newcrush;
9955 _get_pending_crush(newcrush);
// Source class gone but destination present: treat as a replayed command
// and report success so the command stays idempotent.
9956 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9957 // suppose this is a replay and return success
9958 // so command is idempotent
9959 ss << "already renamed to '" << dstname << "'";
9960 err = 0;
9961 goto reply;
9962 }
9963
9964 err = newcrush.rename_class(srcname, dstname);
9965 if (err < 0) {
9966 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9967 << cpp_strerror(err);
9968 goto reply;
9969 }
9970
// Stage the modified crush map in the pending incremental and jump to the
// shared 'update' path (proposes the change to the monitor quorum).
9971 pending_inc.crush.clear();
9972 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9973 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9974 goto update;
9975 } else if (prefix == "osd crush add-bucket") {
9976 // os crush add-bucket <name> <type>
9977 string name, typestr;
9978 vector<string> argvec;
9979 cmd_getval(cmdmap, "name", name);
9980 cmd_getval(cmdmap, "type", typestr);
9981 cmd_getval(cmdmap, "args", argvec);
9982 map<string,string> loc;
9983 if (!argvec.empty()) {
9984 CrushWrapper::parse_loc_map(argvec, &loc);
9985 dout(0) << "will create and move bucket '" << name
9986 << "' to location " << loc << dendl;
9987 }
9988
9989 if (!_have_pending_crush() &&
9990 _get_stable_crush().name_exists(name)) {
9991 ss << "bucket '" << name << "' already exists";
9992 goto reply;
9993 }
9994
9995 CrushWrapper newcrush;
9996 _get_pending_crush(newcrush);
9997
9998 if (newcrush.name_exists(name)) {
9999 ss << "bucket '" << name << "' already exists";
10000 goto update;
10001 }
10002 int type = newcrush.get_type_id(typestr);
10003 if (type < 0) {
10004 ss << "type '" << typestr << "' does not exist";
10005 err = -EINVAL;
10006 goto reply;
10007 }
10008 if (type == 0) {
10009 ss << "type '" << typestr << "' is for devices, not buckets";
10010 err = -EINVAL;
10011 goto reply;
10012 }
10013 int bucketno;
10014 err = newcrush.add_bucket(0, 0,
10015 CRUSH_HASH_DEFAULT, type, 0, NULL,
10016 NULL, &bucketno);
10017 if (err < 0) {
10018 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10019 goto reply;
10020 }
10021 err = newcrush.set_item_name(bucketno, name);
10022 if (err < 0) {
10023 ss << "error setting bucket name to '" << name << "'";
10024 goto reply;
10025 }
10026
10027 if (!loc.empty()) {
10028 if (!newcrush.check_item_loc(cct, bucketno, loc,
10029 (int *)NULL)) {
10030 err = newcrush.move_bucket(cct, bucketno, loc);
10031 if (err < 0) {
10032 ss << "error moving bucket '" << name << "' to location " << loc;
10033 goto reply;
10034 }
10035 } else {
10036 ss << "no need to move item id " << bucketno << " name '" << name
10037 << "' to location " << loc << " in crush map";
10038 }
10039 }
10040
10041 pending_inc.crush.clear();
10042 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10043 if (loc.empty()) {
10044 ss << "added bucket " << name << " type " << typestr
10045 << " to crush map";
10046 } else {
10047 ss << "added bucket " << name << " type " << typestr
10048 << " to location " << loc;
10049 }
10050 goto update;
10051 } else if (prefix == "osd crush rename-bucket") {
// 'osd crush rename-bucket <srcname> <dstname>': rename a CRUSH bucket.
// Delegates to the crush_rename_bucket helper, which writes any error
// text into ss.
10052 string srcname, dstname;
10053 cmd_getval(cmdmap, "srcname", srcname);
10054 cmd_getval(cmdmap, "dstname", dstname);
10055
10056 err = crush_rename_bucket(srcname, dstname, &ss);
10057 if (err == -EALREADY) // equivalent to success for idempotency
10058 err = 0;
10059 if (err)
10060 goto reply;
10061 else
10062 goto update;
10063 } else if (prefix == "osd crush weight-set create" ||
10064 prefix == "osd crush weight-set create-compat") {
10065 CrushWrapper newcrush;
10066 _get_pending_crush(newcrush);
10067 int64_t pool;
10068 int positions;
10069 if (newcrush.has_non_straw2_buckets()) {
10070 ss << "crush map contains one or more bucket(s) that are not straw2";
10071 err = -EPERM;
10072 goto reply;
10073 }
10074 if (prefix == "osd crush weight-set create") {
10075 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10076 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10077 ss << "require_min_compat_client "
10078 << osdmap.require_min_compat_client
10079 << " < luminous, which is required for per-pool weight-sets. "
10080 << "Try 'ceph osd set-require-min-compat-client luminous' "
10081 << "before using the new interface";
10082 err = -EPERM;
10083 goto reply;
10084 }
10085 string poolname, mode;
10086 cmd_getval(cmdmap, "pool", poolname);
10087 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10088 if (pool < 0) {
10089 ss << "pool '" << poolname << "' not found";
10090 err = -ENOENT;
10091 goto reply;
10092 }
10093 cmd_getval(cmdmap, "mode", mode);
10094 if (mode != "flat" && mode != "positional") {
10095 ss << "unrecognized weight-set mode '" << mode << "'";
10096 err = -EINVAL;
10097 goto reply;
10098 }
10099 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10100 } else {
10101 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10102 positions = 1;
10103 }
10104 if (!newcrush.create_choose_args(pool, positions)) {
10105 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10106 ss << "compat weight-set already created";
10107 } else {
10108 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10109 << "' already created";
10110 }
10111 goto reply;
10112 }
10113 pending_inc.crush.clear();
10114 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10115 goto update;
10116
10117 } else if (prefix == "osd crush weight-set rm" ||
10118 prefix == "osd crush weight-set rm-compat") {
10119 CrushWrapper newcrush;
10120 _get_pending_crush(newcrush);
10121 int64_t pool;
10122 if (prefix == "osd crush weight-set rm") {
10123 string poolname;
10124 cmd_getval(cmdmap, "pool", poolname);
10125 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10126 if (pool < 0) {
10127 ss << "pool '" << poolname << "' not found";
10128 err = -ENOENT;
10129 goto reply;
10130 }
10131 } else {
10132 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10133 }
10134 newcrush.rm_choose_args(pool);
10135 pending_inc.crush.clear();
10136 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10137 goto update;
10138
10139 } else if (prefix == "osd crush weight-set reweight" ||
10140 prefix == "osd crush weight-set reweight-compat") {
10141 string poolname, item;
10142 vector<double> weight;
10143 cmd_getval(cmdmap, "pool", poolname);
10144 cmd_getval(cmdmap, "item", item);
10145 cmd_getval(cmdmap, "weight", weight);
10146 CrushWrapper newcrush;
10147 _get_pending_crush(newcrush);
10148 int64_t pool;
10149 if (prefix == "osd crush weight-set reweight") {
10150 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10151 if (pool < 0) {
10152 ss << "pool '" << poolname << "' not found";
10153 err = -ENOENT;
10154 goto reply;
10155 }
10156 if (!newcrush.have_choose_args(pool)) {
10157 ss << "no weight-set for pool '" << poolname << "'";
10158 err = -ENOENT;
10159 goto reply;
10160 }
10161 auto arg_map = newcrush.choose_args_get(pool);
10162 int positions = newcrush.get_choose_args_positions(arg_map);
10163 if (weight.size() != (size_t)positions) {
10164 ss << "must specify exact " << positions << " weight values";
10165 err = -EINVAL;
10166 goto reply;
10167 }
10168 } else {
10169 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10170 if (!newcrush.have_choose_args(pool)) {
10171 ss << "no backward-compatible weight-set";
10172 err = -ENOENT;
10173 goto reply;
10174 }
10175 }
10176 if (!newcrush.name_exists(item)) {
10177 ss << "item '" << item << "' does not exist";
10178 err = -ENOENT;
10179 goto reply;
10180 }
10181 err = newcrush.choose_args_adjust_item_weightf(
10182 cct,
10183 newcrush.choose_args_get(pool),
10184 newcrush.get_item_id(item),
10185 weight,
10186 &ss);
10187 if (err < 0) {
10188 goto reply;
10189 }
10190 err = 0;
10191 pending_inc.crush.clear();
10192 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10193 goto update;
10194 } else if (osdid_present &&
10195 (prefix == "osd crush set" || prefix == "osd crush add")) {
10196 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10197 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10198 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10199
10200 if (!osdmap.exists(osdid)) {
10201 err = -ENOENT;
10202 ss << osd_name
10203 << " does not exist. Create it before updating the crush map";
10204 goto reply;
10205 }
10206
10207 double weight;
10208 if (!cmd_getval(cmdmap, "weight", weight)) {
10209 ss << "unable to parse weight value '"
10210 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10211 err = -EINVAL;
10212 goto reply;
10213 }
10214
10215 string args;
10216 vector<string> argvec;
10217 cmd_getval(cmdmap, "args", argvec);
10218 map<string,string> loc;
10219 CrushWrapper::parse_loc_map(argvec, &loc);
10220
10221 if (prefix == "osd crush set"
10222 && !_get_stable_crush().item_exists(osdid)) {
10223 err = -ENOENT;
10224 ss << "unable to set item id " << osdid << " name '" << osd_name
10225 << "' weight " << weight << " at location " << loc
10226 << ": does not exist";
10227 goto reply;
10228 }
10229
10230 dout(5) << "adding/updating crush item id " << osdid << " name '"
10231 << osd_name << "' weight " << weight << " at location "
10232 << loc << dendl;
10233 CrushWrapper newcrush;
10234 _get_pending_crush(newcrush);
10235
10236 string action;
10237 if (prefix == "osd crush set" ||
10238 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10239 action = "set";
10240 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10241 } else {
10242 action = "add";
10243 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10244 if (err == 0)
10245 err = 1;
10246 }
10247
10248 if (err < 0)
10249 goto reply;
10250
10251 if (err == 0 && !_have_pending_crush()) {
10252 ss << action << " item id " << osdid << " name '" << osd_name
10253 << "' weight " << weight << " at location " << loc << ": no change";
10254 goto reply;
10255 }
10256
10257 pending_inc.crush.clear();
10258 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10259 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10260 << weight << " at location " << loc << " to crush map";
10261 getline(ss, rs);
10262 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10263 get_last_committed() + 1));
10264 return true;
10265
10266 } else if (prefix == "osd crush create-or-move") {
10267 do {
10268 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10269 if (!osdmap.exists(osdid)) {
10270 err = -ENOENT;
10271 ss << osd_name
10272 << " does not exist. create it before updating the crush map";
10273 goto reply;
10274 }
10275
10276 double weight;
10277 if (!cmd_getval(cmdmap, "weight", weight)) {
10278 ss << "unable to parse weight value '"
10279 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10280 err = -EINVAL;
10281 goto reply;
10282 }
10283
10284 string args;
10285 vector<string> argvec;
10286 cmd_getval(cmdmap, "args", argvec);
10287 map<string,string> loc;
10288 CrushWrapper::parse_loc_map(argvec, &loc);
10289
10290 dout(0) << "create-or-move crush item name '" << osd_name
10291 << "' initial_weight " << weight << " at location " << loc
10292 << dendl;
10293
10294 CrushWrapper newcrush;
10295 _get_pending_crush(newcrush);
10296
10297 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10298 g_conf()->osd_crush_update_weight_set);
10299 if (err == 0) {
10300 ss << "create-or-move updated item name '" << osd_name
10301 << "' weight " << weight
10302 << " at location " << loc << " to crush map";
10303 break;
10304 }
10305 if (err > 0) {
10306 pending_inc.crush.clear();
10307 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10308 ss << "create-or-move updating item name '" << osd_name
10309 << "' weight " << weight
10310 << " at location " << loc << " to crush map";
10311 getline(ss, rs);
10312 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10313 get_last_committed() + 1));
10314 return true;
10315 }
10316 } while (false);
10317
10318 } else if (prefix == "osd crush move") {
10319 do {
10320 // osd crush move <name> <loc1> [<loc2> ...]
10321 string name;
10322 vector<string> argvec;
10323 cmd_getval(cmdmap, "name", name);
10324 cmd_getval(cmdmap, "args", argvec);
10325 map<string,string> loc;
10326 CrushWrapper::parse_loc_map(argvec, &loc);
10327
10328 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10329 CrushWrapper newcrush;
10330 _get_pending_crush(newcrush);
10331
10332 if (!newcrush.name_exists(name)) {
10333 err = -ENOENT;
10334 ss << "item " << name << " does not exist";
10335 break;
10336 }
10337 int id = newcrush.get_item_id(name);
10338
10339 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10340 if (id >= 0) {
10341 err = newcrush.create_or_move_item(
10342 cct, id, 0, name, loc,
10343 g_conf()->osd_crush_update_weight_set);
10344 } else {
10345 err = newcrush.move_bucket(cct, id, loc);
10346 }
10347 if (err >= 0) {
10348 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10349 pending_inc.crush.clear();
10350 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10351 getline(ss, rs);
10352 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10353 get_last_committed() + 1));
10354 return true;
10355 }
10356 } else {
10357 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10358 err = 0;
10359 }
10360 } while (false);
10361 } else if (prefix == "osd crush swap-bucket") {
10362 string source, dest;
10363 cmd_getval(cmdmap, "source", source);
10364 cmd_getval(cmdmap, "dest", dest);
10365
10366 bool force = false;
10367 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10368
10369 CrushWrapper newcrush;
10370 _get_pending_crush(newcrush);
10371 if (!newcrush.name_exists(source)) {
10372 ss << "source item " << source << " does not exist";
10373 err = -ENOENT;
10374 goto reply;
10375 }
10376 if (!newcrush.name_exists(dest)) {
10377 ss << "dest item " << dest << " does not exist";
10378 err = -ENOENT;
10379 goto reply;
10380 }
10381 int sid = newcrush.get_item_id(source);
10382 int did = newcrush.get_item_id(dest);
10383 int sparent;
10384 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10385 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10386 err = -EPERM;
10387 goto reply;
10388 }
10389 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10390 !force) {
10391 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10392 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10393 << "; pass --yes-i-really-mean-it to proceed anyway";
10394 err = -EPERM;
10395 goto reply;
10396 }
10397 int r = newcrush.swap_bucket(cct, sid, did);
10398 if (r < 0) {
10399 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10400 err = r;
10401 goto reply;
10402 }
10403 ss << "swapped bucket of " << source << " to " << dest;
10404 pending_inc.crush.clear();
10405 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10406 wait_for_finished_proposal(op,
10407 new Monitor::C_Command(mon, op, err, ss.str(),
10408 get_last_committed() + 1));
10409 return true;
10410 } else if (prefix == "osd crush link") {
10411 // osd crush link <name> <loc1> [<loc2> ...]
10412 string name;
10413 cmd_getval(cmdmap, "name", name);
10414 vector<string> argvec;
10415 cmd_getval(cmdmap, "args", argvec);
10416 map<string,string> loc;
10417 CrushWrapper::parse_loc_map(argvec, &loc);
10418
10419 // Need an explicit check for name_exists because get_item_id returns
10420 // 0 on unfound.
10421 int id = osdmap.crush->get_item_id(name);
10422 if (!osdmap.crush->name_exists(name)) {
10423 err = -ENOENT;
10424 ss << "item " << name << " does not exist";
10425 goto reply;
10426 } else {
10427 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10428 }
10429 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10430 ss << "no need to move item id " << id << " name '" << name
10431 << "' to location " << loc << " in crush map";
10432 err = 0;
10433 goto reply;
10434 }
10435
10436 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10437 CrushWrapper newcrush;
10438 _get_pending_crush(newcrush);
10439
10440 if (!newcrush.name_exists(name)) {
10441 err = -ENOENT;
10442 ss << "item " << name << " does not exist";
10443 goto reply;
10444 } else {
10445 int id = newcrush.get_item_id(name);
10446 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10447 err = newcrush.link_bucket(cct, id, loc);
10448 if (err >= 0) {
10449 ss << "linked item id " << id << " name '" << name
10450 << "' to location " << loc << " in crush map";
10451 pending_inc.crush.clear();
10452 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10453 } else {
10454 ss << "cannot link item id " << id << " name '" << name
10455 << "' to location " << loc;
10456 goto reply;
10457 }
10458 } else {
10459 ss << "no need to move item id " << id << " name '" << name
10460 << "' to location " << loc << " in crush map";
10461 err = 0;
10462 }
10463 }
10464 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10465 get_last_committed() + 1));
10466 return true;
10467 } else if (prefix == "osd crush rm" ||
10468 prefix == "osd crush remove" ||
10469 prefix == "osd crush unlink") {
10470 do {
10471 // osd crush rm <id> [ancestor]
10472 CrushWrapper newcrush;
10473 _get_pending_crush(newcrush);
10474
10475 string name;
10476 cmd_getval(cmdmap, "name", name);
10477
10478 if (!osdmap.crush->name_exists(name)) {
10479 err = 0;
10480 ss << "device '" << name << "' does not appear in the crush map";
10481 break;
10482 }
10483 if (!newcrush.name_exists(name)) {
10484 err = 0;
10485 ss << "device '" << name << "' does not appear in the crush map";
10486 getline(ss, rs);
10487 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10488 get_last_committed() + 1));
10489 return true;
10490 }
10491 int id = newcrush.get_item_id(name);
10492 int ancestor = 0;
10493
10494 bool unlink_only = prefix == "osd crush unlink";
10495 string ancestor_str;
10496 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10497 if (!newcrush.name_exists(ancestor_str)) {
10498 err = -ENOENT;
10499 ss << "ancestor item '" << ancestor_str
10500 << "' does not appear in the crush map";
10501 break;
10502 }
10503 ancestor = newcrush.get_item_id(ancestor_str);
10504 }
10505
10506 err = prepare_command_osd_crush_remove(
10507 newcrush,
10508 id, ancestor,
10509 (ancestor < 0), unlink_only);
10510
10511 if (err == -ENOENT) {
10512 ss << "item " << id << " does not appear in that position";
10513 err = 0;
10514 break;
10515 }
10516 if (err == 0) {
10517 if (!unlink_only)
10518 pending_inc.new_crush_node_flags[id] = 0;
10519 ss << "removed item id " << id << " name '" << name << "' from crush map";
10520 getline(ss, rs);
10521 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10522 get_last_committed() + 1));
10523 return true;
10524 }
10525 } while (false);
10526
10527 } else if (prefix == "osd crush reweight-all") {
// 'osd crush reweight-all': rebuild bucket weights across the whole
// hierarchy (CrushWrapper::reweight) and propose the updated map.
10528 CrushWrapper newcrush;
10529 _get_pending_crush(newcrush);
10530
10531 newcrush.reweight(cct);
10532 pending_inc.crush.clear();
10533 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10534 ss << "reweighted crush hierarchy";
10535 getline(ss, rs);
// Reply is deferred until the map change has been proposed and committed.
10536 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10537 get_last_committed() + 1));
10538 return true;
10539 } else if (prefix == "osd crush reweight") {
10540 // osd crush reweight <name> <weight>
// Adjust the crush weight of a single leaf item (an OSD device).
10541 CrushWrapper newcrush;
10542 _get_pending_crush(newcrush);
10543
10544 string name;
10545 cmd_getval(cmdmap, "name", name);
10546 if (!newcrush.name_exists(name)) {
10547 err = -ENOENT;
10548 ss << "device '" << name << "' does not appear in the crush map";
10549 goto reply;
10550 }
10551
// Negative crush ids are buckets; only leaf devices (id >= 0) may be
// reweighted here — use 'osd crush reweight-subtree' for buckets.
10552 int id = newcrush.get_item_id(name);
10553 if (id < 0) {
10554 ss << "device '" << name << "' is not a leaf in the crush map";
10555 err = -EINVAL;
10556 goto reply;
10557 }
10558 double w;
10559 if (!cmd_getval(cmdmap, "weight", w)) {
10560 ss << "unable to parse weight value '"
10561 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10562 err = -EINVAL;
10563 goto reply;
10564 }
10565
10566 err = newcrush.adjust_item_weightf(cct, id, w,
10567 g_conf()->osd_crush_update_weight_set)
10568 if (err < 0)
10569 goto reply;
10570 pending_inc.crush.clear();
10571 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10572 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10573 << " in crush map";
10574 getline(ss, rs);
10575 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10576 get_last_committed() + 1));
10577 return true;
10578 } else if (prefix == "osd crush reweight-subtree") {
// osd crush reweight-subtree <name> <weight>
// (comment previously copy-pasted from the 'reweight' branch above)
// Adjust the weights of an entire bucket subtree.
10580 CrushWrapper newcrush;
10581 _get_pending_crush(newcrush);
10582
10583 string name;
10584 cmd_getval(cmdmap, "name", name);
10585 if (!newcrush.name_exists(name)) {
10586 err = -ENOENT;
10587 ss << "device '" << name << "' does not appear in the crush map";
10588 goto reply;
10589 }
10590
// Non-negative ids are leaf devices; a subtree must be a bucket (id < 0).
// This is the mirror of the id < 0 check in the 'reweight' branch.
10591 int id = newcrush.get_item_id(name);
10592 if (id >= 0) {
10593 ss << "device '" << name << "' is not a subtree in the crush map";
10594 err = -EINVAL;
10595 goto reply;
10596 }
10597 double w;
10598 if (!cmd_getval(cmdmap, "weight", w)) {
10599 ss << "unable to parse weight value '"
10600 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10601 err = -EINVAL;
10602 goto reply;
10603 }
10604
10605 err = newcrush.adjust_subtree_weightf(cct, id, w,
10606 g_conf()->osd_crush_update_weight_set);
10607 if (err < 0)
10608 goto reply;
10609 pending_inc.crush.clear();
10610 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10611 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10612 << " in crush map";
10613 getline(ss, rs);
10614 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10615 get_last_committed() + 1));
10616 return true;
10617 } else if (prefix == "osd crush tunables") {
// 'osd crush tunables <profile>': apply a named tunables preset
// (release-named presets plus 'optimal'/'default'/'legacy') to the map.
10618 CrushWrapper newcrush;
10619 _get_pending_crush(newcrush);
10620
10621 err = 0;
10622 string profile;
10623 cmd_getval(cmdmap, "profile", profile);
10624 if (profile == "legacy" || profile == "argonaut") {
10625 newcrush.set_tunables_legacy();
10626 } else if (profile == "bobtail") {
10627 newcrush.set_tunables_bobtail();
10628 } else if (profile == "firefly") {
10629 newcrush.set_tunables_firefly();
10630 } else if (profile == "hammer") {
10631 newcrush.set_tunables_hammer();
10632 } else if (profile == "jewel") {
10633 newcrush.set_tunables_jewel();
10634 } else if (profile == "optimal") {
10635 newcrush.set_tunables_optimal();
10636 } else if (profile == "default") {
10637 newcrush.set_tunables_default();
10638 } else {
10639 ss << "unrecognized profile '" << profile << "'";
10640 err = -EINVAL;
10641 goto reply;
10642 }
10643
// Reject the new map if it would require features the cluster does not
// support; validate_crush_against_features fills ss with the reason.
10644 if (!validate_crush_against_features(&newcrush, ss)) {
10645 err = -EINVAL;
10646 goto reply;
10647 }
10648
10649 pending_inc.crush.clear();
10650 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10651 ss << "adjusted tunables profile to " << profile;
10652 getline(ss, rs);
10653 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10654 get_last_committed() + 1));
10655 return true;
10656 } else if (prefix == "osd crush set-tunable") {
// 'osd crush set-tunable <tunable> <value>': set one tunable by name.
// Only 'straw_calc_version' is accepted here; other tunables are managed
// via the profile presets above.
10657 CrushWrapper newcrush;
10658 _get_pending_crush(newcrush);
10659
10660 err = 0;
10661 string tunable;
10662 cmd_getval(cmdmap, "tunable", tunable);
10663
10664 int64_t value = -1;
10665 if (!cmd_getval(cmdmap, "value", value)) {
10666 err = -EINVAL;
10667 ss << "failed to parse integer value "
10668 << cmd_vartype_stringify(cmdmap.at("value"));
10669 goto reply;
10670 }
10671
10672 if (tunable == "straw_calc_version") {
// straw_calc_version is a boolean-like switch: only 0 or 1 are valid.
10673 if (value != 0 && value != 1) {
10674 ss << "value must be 0 or 1; got " << value;
10675 err = -EINVAL;
10676 goto reply;
10677 }
10678 newcrush.set_straw_calc_version(value);
10679 } else {
10680 ss << "unrecognized tunable '" << tunable << "'";
10681 err = -EINVAL;
10682 goto reply;
10683 }
10684
// Same feature gate as the 'osd crush tunables' branch.
10685 if (!validate_crush_against_features(&newcrush, ss)) {
10686 err = -EINVAL;
10687 goto reply;
10688 }
10689
10690 pending_inc.crush.clear();
10691 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10692 ss << "adjusted tunable " << tunable << " to " << value;
10693 getline(ss, rs);
10694 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10695 get_last_committed() + 1));
10696 return true;
10697
10698 } else if (prefix == "osd crush rule create-simple") {
// 'osd crush rule create-simple <name> <root> <type> [<mode>]':
// create a replicated crush rule; mode defaults to 'firstn'.
10699 string name, root, type, mode;
10700 cmd_getval(cmdmap, "name", name);
10701 cmd_getval(cmdmap, "root", root);
10702 cmd_getval(cmdmap, "type", type);
10703 cmd_getval(cmdmap, "mode", mode);
10704 if (mode == "")
10705 mode = "firstn";
10706
// Check the committed map first: an existing name is a no-op success.
10707 if (osdmap.crush->rule_exists(name)) {
10708 // The name is uniquely associated with a rule id and the rule it
10709 // contains.  From the user's point of view, the name is the more
10710 // meaningful handle.
10711 ss << "rule " << name << " already exists";
10712 err = 0;
10713 goto reply;
10714 }
10715
10716 CrushWrapper newcrush;
10717 _get_pending_crush(newcrush);
10718
// Re-check against the pending map too, in case the rule was created by
// an earlier command in this same (not yet committed) proposal round.
10719 if (newcrush.rule_exists(name)) {
10720 // The name is uniquely associated with a rule id and the rule it
10721 // contains.  From the user's point of view, the name is the more
10722 // meaningful handle.
10723 ss << "rule " << name << " already exists";
10724 err = 0;
10725 } else {
10726 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
10727 pg_pool_t::TYPE_REPLICATED, &ss);
10728 if (ruleno < 0) {
10729 err = ruleno;
10730 goto reply;
10731 }
10732
10733 pending_inc.crush.clear();
10734 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10735 }
10736 getline(ss, rs);
10737 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10738 get_last_committed() + 1));
10739 return true;
10740
10739 } else if (prefix == "osd crush rule create-replicated") {
10740 string name, root, type, device_class;
10741 cmd_getval(cmdmap, "name", name);
10742 cmd_getval(cmdmap, "root", root);
10743 cmd_getval(cmdmap, "type", type);
10744 cmd_getval(cmdmap, "class", device_class);
10745
10746 if (osdmap.crush->rule_exists(name)) {
10747 // The name is uniquely associated to a ruleid and the rule it contains
10748 // From the user point of view, the rule is more meaningfull.
10749 ss << "rule " << name << " already exists";
10750 err = 0;
10751 goto reply;
10752 }
10753
10754 CrushWrapper newcrush;
10755 _get_pending_crush(newcrush);
10756
10757 if (newcrush.rule_exists(name)) {
10758 // The name is uniquely associated to a ruleid and the rule it contains
10759 // From the user point of view, the rule is more meaningfull.
10760 ss << "rule " << name << " already exists";
10761 err = 0;
10762 } else {
10763 int ruleno = newcrush.add_simple_rule(
10764 name, root, type, device_class,
10765 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10766 if (ruleno < 0) {
10767 err = ruleno;
10768 goto reply;
10769 }
10770
10771 pending_inc.crush.clear();
10772 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10773 }
10774 getline(ss, rs);
10775 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10776 get_last_committed() + 1));
10777 return true;
10778
10779 } else if (prefix == "osd erasure-code-profile rm") {
// 'osd erasure-code-profile rm <name>': delete an EC profile if no pool
// (committed or pending) still references it.
10780 string name;
10781 cmd_getval(cmdmap, "name", name);
10782
// A pending (not yet committed) pool references the profile: wait for the
// in-flight proposal to commit, then re-evaluate this command.
10783 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10784 goto wait;
10785
// A committed pool references it: refuse outright.
10786 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10787 err = -EBUSY;
10788 goto reply;
10789 }
10790
10791 if (osdmap.has_erasure_code_profile(name) ||
10792 pending_inc.new_erasure_code_profiles.count(name)) {
10793 if (osdmap.has_erasure_code_profile(name)) {
// Profile exists in the committed map: schedule its removal.
10794 pending_inc.old_erasure_code_profiles.push_back(name);
10795 } else {
// Profile only exists in the pending incremental: cancel its creation.
10796 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10797 pending_inc.new_erasure_code_profiles.erase(name);
10798 }
10799
10800 getline(ss, rs);
10801 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10802 get_last_committed() + 1));
10803 return true;
10804 } else {
// Nothing to remove: success for idempotency.
10805 ss << "erasure-code-profile " << name << " does not exist";
10806 err = 0;
10807 goto reply;
10808 }
10809
10810 } else if (prefix == "osd erasure-code-profile set") {
10811 string name;
10812 cmd_getval(cmdmap, "name", name);
10813 vector<string> profile;
10814 cmd_getval(cmdmap, "profile", profile);
10815
10816 bool force = false;
10817 cmd_getval(cmdmap, "force", force);
10818
10819 map<string,string> profile_map;
10820 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10821 if (err)
10822 goto reply;
10823 if (auto found = profile_map.find("crush-failure-domain");
10824 found != profile_map.end()) {
10825 const auto& failure_domain = found->second;
10826 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
10827 if (failure_domain_type < 0) {
10828 ss << "erasure-code-profile " << profile_map
10829 << " contains an invalid failure-domain " << std::quoted(failure_domain);
10830 err = -EINVAL;
10831 goto reply;
10832 }
10833 }
10834
10835 if (profile_map.find("plugin") == profile_map.end()) {
10836 ss << "erasure-code-profile " << profile_map
10837 << " must contain a plugin entry" << std::endl;
10838 err = -EINVAL;
10839 goto reply;
10840 }
10841 string plugin = profile_map["plugin"];
10842
10843 if (pending_inc.has_erasure_code_profile(name)) {
10844 dout(20) << "erasure code profile " << name << " try again" << dendl;
10845 goto wait;
10846 } else {
10847 err = normalize_profile(name, profile_map, force, &ss);
10848 if (err)
10849 goto reply;
10850
10851 if (osdmap.has_erasure_code_profile(name)) {
10852 ErasureCodeProfile existing_profile_map =
10853 osdmap.get_erasure_code_profile(name);
10854 err = normalize_profile(name, existing_profile_map, force, &ss);
10855 if (err)
10856 goto reply;
10857
10858 if (existing_profile_map == profile_map) {
10859 err = 0;
10860 goto reply;
10861 }
10862 if (!force) {
10863 err = -EPERM;
10864 ss << "will not override erasure code profile " << name
10865 << " because the existing profile "
10866 << existing_profile_map
10867 << " is different from the proposed profile "
10868 << profile_map;
10869 goto reply;
10870 }
10871 }
10872
10873 dout(20) << "erasure code profile set " << name << "="
10874 << profile_map << dendl;
10875 pending_inc.set_erasure_code_profile(name, profile_map);
10876 }
10877
10878 getline(ss, rs);
10879 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10880 get_last_committed() + 1));
10881 return true;
10882
10883 } else if (prefix == "osd crush rule create-erasure") {
10884 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
10885 if (err == -EAGAIN)
10886 goto wait;
10887 if (err)
10888 goto reply;
10889 string name, poolstr;
10890 cmd_getval(cmdmap, "name", name);
10891 string profile;
10892 cmd_getval(cmdmap, "profile", profile);
10893 if (profile == "")
10894 profile = "default";
10895 if (profile == "default") {
10896 if (!osdmap.has_erasure_code_profile(profile)) {
10897 if (pending_inc.has_erasure_code_profile(profile)) {
10898 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
10899 goto wait;
10900 }
10901
10902 map<string,string> profile_map;
10903 err = osdmap.get_erasure_code_profile_default(cct,
10904 profile_map,
10905 &ss);
10906 if (err)
10907 goto reply;
10908 err = normalize_profile(name, profile_map, true, &ss);
10909 if (err)
10910 goto reply;
10911 dout(20) << "erasure code profile set " << profile << "="
10912 << profile_map << dendl;
10913 pending_inc.set_erasure_code_profile(profile, profile_map);
10914 goto wait;
10915 }
10916 }
10917
10918 int rule;
10919 err = crush_rule_create_erasure(name, profile, &rule, &ss);
10920 if (err < 0) {
10921 switch(err) {
10922 case -EEXIST: // return immediately
10923 ss << "rule " << name << " already exists";
10924 err = 0;
10925 goto reply;
10926 break;
10927 case -EALREADY: // wait for pending to be proposed
10928 ss << "rule " << name << " already exists";
10929 err = 0;
10930 break;
10931 default: // non recoverable error
10932 goto reply;
10933 break;
10934 }
10935 } else {
10936 ss << "created rule " << name << " at " << rule;
10937 }
10938
10939 getline(ss, rs);
10940 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10941 get_last_committed() + 1));
10942 return true;
10943
10944 } else if (prefix == "osd crush rule rm") {
10945 string name;
10946 cmd_getval(cmdmap, "name", name);
10947
10948 if (!osdmap.crush->rule_exists(name)) {
10949 ss << "rule " << name << " does not exist";
10950 err = 0;
10951 goto reply;
10952 }
10953
10954 CrushWrapper newcrush;
10955 _get_pending_crush(newcrush);
10956
10957 if (!newcrush.rule_exists(name)) {
10958 ss << "rule " << name << " does not exist";
10959 err = 0;
10960 } else {
10961 int ruleno = newcrush.get_rule_id(name);
10962 ceph_assert(ruleno >= 0);
10963
10964 // make sure it is not in use.
10965 // FIXME: this is ok in some situations, but let's not bother with that
10966 // complexity now.
10967 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
10968 if (osdmap.crush_rule_in_use(ruleset)) {
10969 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10970 err = -EBUSY;
10971 goto reply;
10972 }
10973
10974 err = newcrush.remove_rule(ruleno);
10975 if (err < 0) {
10976 goto reply;
10977 }
10978
10979 pending_inc.crush.clear();
10980 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10981 }
10982 getline(ss, rs);
10983 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10984 get_last_committed() + 1));
10985 return true;
10986
10987 } else if (prefix == "osd crush rule rename") {
10988 string srcname;
10989 string dstname;
10990 cmd_getval(cmdmap, "srcname", srcname);
10991 cmd_getval(cmdmap, "dstname", dstname);
10992 if (srcname.empty() || dstname.empty()) {
10993 ss << "must specify both source rule name and destination rule name";
10994 err = -EINVAL;
10995 goto reply;
10996 }
10997 if (srcname == dstname) {
10998 ss << "destination rule name is equal to source rule name";
10999 err = 0;
11000 goto reply;
11001 }
11002
11003 CrushWrapper newcrush;
11004 _get_pending_crush(newcrush);
11005 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11006 // srcname does not exist and dstname already exists
11007 // suppose this is a replay and return success
11008 // (so this command is idempotent)
11009 ss << "already renamed to '" << dstname << "'";
11010 err = 0;
11011 goto reply;
11012 }
11013
11014 err = newcrush.rename_rule(srcname, dstname, &ss);
11015 if (err < 0) {
11016 // ss has reason for failure
11017 goto reply;
11018 }
11019 pending_inc.crush.clear();
11020 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11021 getline(ss, rs);
11022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11023 get_last_committed() + 1));
11024 return true;
11025
11026 } else if (prefix == "osd setmaxosd") {
11027 int64_t newmax;
11028 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11029 ss << "unable to parse 'newmax' value '"
11030 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11031 err = -EINVAL;
11032 goto reply;
11033 }
11034
11035 if (newmax > g_conf()->mon_max_osd) {
11036 err = -ERANGE;
11037 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11038 << g_conf()->mon_max_osd << ")";
11039 goto reply;
11040 }
11041
11042 // Don't allow shrinking OSD number as this will cause data loss
11043 // and may cause kernel crashes.
11044 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11045 if (newmax < osdmap.get_max_osd()) {
11046 // Check if the OSDs exist between current max and new value.
11047 // If any OSDs exist in that range, don't allow shrinking the number
11048 // of OSDs.
11049 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11050 if (osdmap.exists(i)) {
11051 err = -EBUSY;
11052 ss << "cannot shrink max_osd to " << newmax
11053 << " because osd." << i << " (and possibly others) still in use";
11054 goto reply;
11055 }
11056 }
11057 }
11058
11059 pending_inc.new_max_osd = newmax;
11060 ss << "set new max_osd = " << pending_inc.new_max_osd;
11061 getline(ss, rs);
11062 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11063 get_last_committed() + 1));
11064 return true;
11065
11066 } else if (prefix == "osd set-full-ratio" ||
11067 prefix == "osd set-backfillfull-ratio" ||
11068 prefix == "osd set-nearfull-ratio") {
11069 double n;
11070 if (!cmd_getval(cmdmap, "ratio", n)) {
11071 ss << "unable to parse 'ratio' value '"
11072 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11073 err = -EINVAL;
11074 goto reply;
11075 }
11076 if (prefix == "osd set-full-ratio")
11077 pending_inc.new_full_ratio = n;
11078 else if (prefix == "osd set-backfillfull-ratio")
11079 pending_inc.new_backfillfull_ratio = n;
11080 else if (prefix == "osd set-nearfull-ratio")
11081 pending_inc.new_nearfull_ratio = n;
11082 ss << prefix << " " << n;
11083 getline(ss, rs);
11084 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11085 get_last_committed() + 1));
11086 return true;
11087 } else if (prefix == "osd set-require-min-compat-client") {
11088 string v;
11089 cmd_getval(cmdmap, "version", v);
11090 ceph_release_t vno = ceph_release_from_name(v);
11091 if (!vno) {
11092 ss << "version " << v << " is not recognized";
11093 err = -EINVAL;
11094 goto reply;
11095 }
11096 OSDMap newmap;
11097 newmap.deepish_copy_from(osdmap);
11098 newmap.apply_incremental(pending_inc);
11099 newmap.require_min_compat_client = vno;
11100 auto mvno = newmap.get_min_compat_client();
11101 if (vno < mvno) {
11102 ss << "osdmap current utilizes features that require " << mvno
11103 << "; cannot set require_min_compat_client below that to " << vno;
11104 err = -EPERM;
11105 goto reply;
11106 }
11107 bool sure = false;
11108 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11109 if (!sure) {
11110 FeatureMap m;
11111 mon->get_combined_feature_map(&m);
11112 uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
11113 bool first = true;
11114 bool ok = true;
11115 for (int type : {
11116 CEPH_ENTITY_TYPE_CLIENT,
11117 CEPH_ENTITY_TYPE_MDS,
11118 CEPH_ENTITY_TYPE_MGR }) {
11119 auto p = m.m.find(type);
11120 if (p == m.m.end()) {
11121 continue;
11122 }
11123 for (auto& q : p->second) {
11124 uint64_t missing = ~q.first & features;
11125 if (missing) {
11126 if (first) {
11127 ss << "cannot set require_min_compat_client to " << v << ": ";
11128 } else {
11129 ss << "; ";
11130 }
11131 first = false;
11132 ss << q.second << " connected " << ceph_entity_type_name(type)
11133 << "(s) look like " << ceph_release_name(
11134 ceph_release_from_features(q.first))
11135 << " (missing 0x" << std::hex << missing << std::dec << ")";
11136 ok = false;
11137 }
11138 }
11139 }
11140 if (!ok) {
11141 ss << "; add --yes-i-really-mean-it to do it anyway";
11142 err = -EPERM;
11143 goto reply;
11144 }
11145 }
11146 ss << "set require_min_compat_client to " << vno;
11147 pending_inc.new_require_min_compat_client = vno;
11148 getline(ss, rs);
11149 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11150 get_last_committed() + 1));
11151 return true;
11152 } else if (prefix == "osd pause") {
11153 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11154
11155 } else if (prefix == "osd unpause") {
11156 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11157
11158 } else if (prefix == "osd set") {
11159 bool sure = false;
11160 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11161
11162 string key;
11163 cmd_getval(cmdmap, "key", key);
11164 if (key == "pause")
11165 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11166 else if (key == "noup")
11167 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11168 else if (key == "nodown")
11169 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11170 else if (key == "noout")
11171 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11172 else if (key == "noin")
11173 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11174 else if (key == "nobackfill")
11175 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11176 else if (key == "norebalance")
11177 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11178 else if (key == "norecover")
11179 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11180 else if (key == "noscrub")
11181 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11182 else if (key == "nodeep-scrub")
11183 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11184 else if (key == "notieragent")
11185 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11186 else if (key == "nosnaptrim")
11187 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11188 else if (key == "pglog_hardlimit") {
11189 if (!osdmap.get_num_up_osds() && !sure) {
11190 ss << "Not advisable to continue since no OSDs are up. Pass "
11191 << "--yes-i-really-mean-it if you really wish to continue.";
11192 err = -EPERM;
11193 goto reply;
11194 }
11195 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11196 // we are reusing a jewel feature bit that was retired in luminous.
11197 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11198 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11199 || sure)) {
11200 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11201 } else {
11202 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11203 err = -EPERM;
11204 goto reply;
11205 }
11206 } else {
11207 ss << "unrecognized flag '" << key << "'";
11208 err = -EINVAL;
11209 }
11210
11211 } else if (prefix == "osd unset") {
11212 string key;
11213 cmd_getval(cmdmap, "key", key);
11214 if (key == "pause")
11215 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11216 else if (key == "noup")
11217 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11218 else if (key == "nodown")
11219 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11220 else if (key == "noout")
11221 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11222 else if (key == "noin")
11223 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11224 else if (key == "nobackfill")
11225 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11226 else if (key == "norebalance")
11227 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11228 else if (key == "norecover")
11229 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11230 else if (key == "noscrub")
11231 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11232 else if (key == "nodeep-scrub")
11233 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11234 else if (key == "notieragent")
11235 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11236 else if (key == "nosnaptrim")
11237 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11238 else {
11239 ss << "unrecognized flag '" << key << "'";
11240 err = -EINVAL;
11241 }
11242
11243 } else if (prefix == "osd require-osd-release") {
11244 string release;
11245 cmd_getval(cmdmap, "release", release);
11246 bool sure = false;
11247 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11248 ceph_release_t rel = ceph_release_from_name(release.c_str());
11249 if (!rel) {
11250 ss << "unrecognized release " << release;
11251 err = -EINVAL;
11252 goto reply;
11253 }
11254 if (rel == osdmap.require_osd_release) {
11255 // idempotent
11256 err = 0;
11257 goto reply;
11258 }
11259 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
11260 if (!osdmap.get_num_up_osds() && !sure) {
11261 ss << "Not advisable to continue since no OSDs are up. Pass "
11262 << "--yes-i-really-mean-it if you really wish to continue.";
11263 err = -EPERM;
11264 goto reply;
11265 }
11266 if (rel == ceph_release_t::mimic) {
11267 if (!mon->monmap->get_required_features().contains_all(
11268 ceph::features::mon::FEATURE_MIMIC)) {
11269 ss << "not all mons are mimic";
11270 err = -EPERM;
11271 goto reply;
11272 }
11273 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
11274 && !sure) {
11275 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11276 err = -EPERM;
11277 goto reply;
11278 }
11279 } else if (rel == ceph_release_t::nautilus) {
11280 if (!mon->monmap->get_required_features().contains_all(
11281 ceph::features::mon::FEATURE_NAUTILUS)) {
11282 ss << "not all mons are nautilus";
11283 err = -EPERM;
11284 goto reply;
11285 }
11286 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
11287 && !sure) {
11288 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11289 err = -EPERM;
11290 goto reply;
11291 }
11292 } else if (rel == ceph_release_t::octopus) {
11293 if (!mon->monmap->get_required_features().contains_all(
11294 ceph::features::mon::FEATURE_OCTOPUS)) {
11295 ss << "not all mons are octopus";
11296 err = -EPERM;
11297 goto reply;
11298 }
11299 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11300 && !sure) {
11301 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11302 err = -EPERM;
11303 goto reply;
11304 }
11305 } else {
11306 ss << "not supported for this release yet";
11307 err = -EPERM;
11308 goto reply;
11309 }
11310 if (rel < osdmap.require_osd_release) {
11311 ss << "require_osd_release cannot be lowered once it has been set";
11312 err = -EPERM;
11313 goto reply;
11314 }
11315 pending_inc.new_require_osd_release = rel;
11316 goto update;
11317 } else if (prefix == "osd down" ||
11318 prefix == "osd out" ||
11319 prefix == "osd in" ||
11320 prefix == "osd rm" ||
11321 prefix == "osd stop") {
11322
11323 bool any = false;
11324 bool stop = false;
11325 bool verbose = true;
11326 bool definitely_dead = false;
11327
11328 vector<string> idvec;
11329 cmd_getval(cmdmap, "ids", idvec);
11330 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11331 derr << "definitely_dead " << (int)definitely_dead << dendl;
11332 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11333 set<int> osds;
11334
11335 // wildcard?
11336 if (j == 0 &&
11337 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11338 if (prefix == "osd in") {
11339 // touch out osds only
11340 osdmap.get_out_existing_osds(osds);
11341 } else {
11342 osdmap.get_all_osds(osds);
11343 }
11344 stop = true;
11345 verbose = false; // so the output is less noisy.
11346 } else {
11347 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11348 if (osd < 0) {
11349 ss << "invalid osd id" << osd;
11350 err = -EINVAL;
11351 continue;
11352 } else if (!osdmap.exists(osd)) {
11353 ss << "osd." << osd << " does not exist. ";
11354 continue;
11355 }
11356
11357 osds.insert(osd);
11358 }
11359
11360 for (auto &osd : osds) {
11361 if (prefix == "osd down") {
11362 if (osdmap.is_down(osd)) {
11363 if (verbose)
11364 ss << "osd." << osd << " is already down. ";
11365 } else {
11366 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11367 ss << "marked down osd." << osd << ". ";
11368 any = true;
11369 }
11370 if (definitely_dead) {
11371 if (!pending_inc.new_xinfo.count(osd)) {
11372 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11373 }
11374 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11375 any = true;
11376 }
11377 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11378 }
11379 } else if (prefix == "osd out") {
11380 if (osdmap.is_out(osd)) {
11381 if (verbose)
11382 ss << "osd." << osd << " is already out. ";
11383 } else {
11384 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11385 if (osdmap.osd_weight[osd]) {
11386 if (pending_inc.new_xinfo.count(osd) == 0) {
11387 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11388 }
11389 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11390 }
11391 ss << "marked out osd." << osd << ". ";
11392 std::ostringstream msg;
11393 msg << "Client " << op->get_session()->entity_name
11394 << " marked osd." << osd << " out";
11395 if (osdmap.is_up(osd)) {
11396 msg << ", while it was still marked up";
11397 } else {
11398 auto period = ceph_clock_now() - down_pending_out[osd];
11399 msg << ", after it was down for " << int(period.sec())
11400 << " seconds";
11401 }
11402
11403 mon->clog->info() << msg.str();
11404 any = true;
11405 }
11406 } else if (prefix == "osd in") {
11407 if (osdmap.is_in(osd)) {
11408 if (verbose)
11409 ss << "osd." << osd << " is already in. ";
11410 } else {
11411 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11412 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11413 if (pending_inc.new_xinfo.count(osd) == 0) {
11414 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11415 }
11416 pending_inc.new_xinfo[osd].old_weight = 0;
11417 } else {
11418 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11419 }
11420 ss << "marked in osd." << osd << ". ";
11421 any = true;
11422 }
11423 } else if (prefix == "osd rm") {
11424 err = prepare_command_osd_remove(osd);
11425
11426 if (err == -EBUSY) {
11427 if (any)
11428 ss << ", ";
11429 ss << "osd." << osd << " is still up; must be down before removal. ";
11430 } else {
11431 ceph_assert(err == 0);
11432 if (any) {
11433 ss << ", osd." << osd;
11434 } else {
11435 ss << "removed osd." << osd;
11436 }
11437 any = true;
11438 }
11439 } else if (prefix == "osd stop") {
11440 if (osdmap.is_stop(osd)) {
11441 if (verbose)
11442 ss << "osd." << osd << " is already stopped. ";
11443 } else if (osdmap.is_down(osd)) {
11444 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11445 ss << "stop down osd." << osd << ". ";
11446 any = true;
11447 } else {
11448 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11449 ss << "stop osd." << osd << ". ";
11450 any = true;
11451 }
11452 }
11453 }
11454 }
11455 if (any) {
11456 getline(ss, rs);
11457 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11458 get_last_committed() + 1));
11459 return true;
11460 }
11461 } else if (prefix == "osd set-group" ||
11462 prefix == "osd unset-group" ||
11463 prefix == "osd add-noup" ||
11464 prefix == "osd add-nodown" ||
11465 prefix == "osd add-noin" ||
11466 prefix == "osd add-noout" ||
11467 prefix == "osd rm-noup" ||
11468 prefix == "osd rm-nodown" ||
11469 prefix == "osd rm-noin" ||
11470 prefix == "osd rm-noout") {
11471 bool do_set = prefix == "osd set-group" ||
11472 prefix.find("add") != string::npos;
11473 string flag_str;
11474 unsigned flags = 0;
11475 vector<string> who;
11476 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11477 cmd_getval(cmdmap, "flags", flag_str);
11478 cmd_getval(cmdmap, "who", who);
11479 vector<string> raw_flags;
11480 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11481 for (auto& f : raw_flags) {
11482 if (f == "noup")
11483 flags |= CEPH_OSD_NOUP;
11484 else if (f == "nodown")
11485 flags |= CEPH_OSD_NODOWN;
11486 else if (f == "noin")
11487 flags |= CEPH_OSD_NOIN;
11488 else if (f == "noout")
11489 flags |= CEPH_OSD_NOOUT;
11490 else {
11491 ss << "unrecognized flag '" << f << "', must be one of "
11492 << "{noup,nodown,noin,noout}";
11493 err = -EINVAL;
11494 goto reply;
11495 }
11496 }
11497 } else {
11498 cmd_getval(cmdmap, "ids", who);
11499 if (prefix.find("noup") != string::npos)
11500 flags = CEPH_OSD_NOUP;
11501 else if (prefix.find("nodown") != string::npos)
11502 flags = CEPH_OSD_NODOWN;
11503 else if (prefix.find("noin") != string::npos)
11504 flags = CEPH_OSD_NOIN;
11505 else if (prefix.find("noout") != string::npos)
11506 flags = CEPH_OSD_NOOUT;
11507 else
11508 ceph_assert(0 == "Unreachable!");
11509 }
11510 if (flags == 0) {
11511 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11512 err = -EINVAL;
11513 goto reply;
11514 }
11515 if (who.empty()) {
11516 ss << "must specify at least one or more targets to set/unset";
11517 err = -EINVAL;
11518 goto reply;
11519 }
11520 set<int> osds;
11521 set<int> crush_nodes;
11522 set<int> device_classes;
11523 for (auto& w : who) {
11524 if (w == "any" || w == "all" || w == "*") {
11525 osdmap.get_all_osds(osds);
11526 break;
11527 }
11528 std::stringstream ts;
11529 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11530 osds.insert(osd);
11531 } else if (osdmap.crush->name_exists(w)) {
11532 crush_nodes.insert(osdmap.crush->get_item_id(w));
11533 } else if (osdmap.crush->class_exists(w)) {
11534 device_classes.insert(osdmap.crush->get_class_id(w));
11535 } else {
11536 ss << "unable to parse osd id or crush node or device class: "
11537 << "\"" << w << "\". ";
11538 }
11539 }
11540 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11541 // ss has reason for failure
11542 err = -EINVAL;
11543 goto reply;
11544 }
11545 bool any = false;
11546 for (auto osd : osds) {
11547 if (!osdmap.exists(osd)) {
11548 ss << "osd." << osd << " does not exist. ";
11549 continue;
11550 }
11551 if (do_set) {
11552 if (flags & CEPH_OSD_NOUP) {
11553 any |= osdmap.is_noup_by_osd(osd) ?
11554 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11555 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11556 }
11557 if (flags & CEPH_OSD_NODOWN) {
11558 any |= osdmap.is_nodown_by_osd(osd) ?
11559 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11560 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11561 }
11562 if (flags & CEPH_OSD_NOIN) {
11563 any |= osdmap.is_noin_by_osd(osd) ?
11564 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11565 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11566 }
11567 if (flags & CEPH_OSD_NOOUT) {
11568 any |= osdmap.is_noout_by_osd(osd) ?
11569 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11570 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11571 }
11572 } else {
11573 if (flags & CEPH_OSD_NOUP) {
11574 any |= osdmap.is_noup_by_osd(osd) ?
11575 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11576 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11577 }
11578 if (flags & CEPH_OSD_NODOWN) {
11579 any |= osdmap.is_nodown_by_osd(osd) ?
11580 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11581 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11582 }
11583 if (flags & CEPH_OSD_NOIN) {
11584 any |= osdmap.is_noin_by_osd(osd) ?
11585 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11586 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11587 }
11588 if (flags & CEPH_OSD_NOOUT) {
11589 any |= osdmap.is_noout_by_osd(osd) ?
11590 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11591 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11592 }
11593 }
11594 }
11595 for (auto& id : crush_nodes) {
11596 auto old_flags = osdmap.get_crush_node_flags(id);
11597 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11598 pending_flags |= old_flags; // adopt existing flags first!
11599 if (do_set) {
11600 pending_flags |= flags;
11601 } else {
11602 pending_flags &= ~flags;
11603 }
11604 any = true;
11605 }
11606 for (auto& id : device_classes) {
11607 auto old_flags = osdmap.get_device_class_flags(id);
11608 auto& pending_flags = pending_inc.new_device_class_flags[id];
11609 pending_flags |= old_flags;
11610 if (do_set) {
11611 pending_flags |= flags;
11612 } else {
11613 pending_flags &= ~flags;
11614 }
11615 any = true;
11616 }
11617 if (any) {
11618 getline(ss, rs);
11619 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11620 get_last_committed() + 1));
11621 return true;
11622 }
11623 } else if (prefix == "osd pg-temp") {
11624 string pgidstr;
11625 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11626 ss << "unable to parse 'pgid' value '"
11627 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11628 err = -EINVAL;
11629 goto reply;
11630 }
11631 pg_t pgid;
11632 if (!pgid.parse(pgidstr.c_str())) {
11633 ss << "invalid pgid '" << pgidstr << "'";
11634 err = -EINVAL;
11635 goto reply;
11636 }
11637 if (!osdmap.pg_exists(pgid)) {
11638 ss << "pg " << pgid << " does not exist";
11639 err = -ENOENT;
11640 goto reply;
11641 }
11642 if (pending_inc.new_pg_temp.count(pgid)) {
11643 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11644 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11645 return true;
11646 }
11647
11648 vector<int64_t> id_vec;
11649 vector<int32_t> new_pg_temp;
11650 cmd_getval(cmdmap, "id", id_vec);
11651 if (id_vec.empty()) {
11652 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11653 ss << "done cleaning up pg_temp of " << pgid;
11654 goto update;
11655 }
11656 for (auto osd : id_vec) {
11657 if (!osdmap.exists(osd)) {
11658 ss << "osd." << osd << " does not exist";
11659 err = -ENOENT;
11660 goto reply;
11661 }
11662 new_pg_temp.push_back(osd);
11663 }
11664
11665 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11666 if ((int)new_pg_temp.size() < pool_min_size) {
11667 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11668 << pool_min_size << ")";
11669 err = -EINVAL;
11670 goto reply;
11671 }
11672
11673 int pool_size = osdmap.get_pg_pool_size(pgid);
11674 if ((int)new_pg_temp.size() > pool_size) {
11675 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11676 << pool_size << ")";
11677 err = -EINVAL;
11678 goto reply;
11679 }
11680
11681 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11682 new_pg_temp.begin(), new_pg_temp.end());
11683 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11684 goto update;
11685 } else if (prefix == "osd primary-temp") {
11686 string pgidstr;
11687 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11688 ss << "unable to parse 'pgid' value '"
11689 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11690 err = -EINVAL;
11691 goto reply;
11692 }
11693 pg_t pgid;
11694 if (!pgid.parse(pgidstr.c_str())) {
11695 ss << "invalid pgid '" << pgidstr << "'";
11696 err = -EINVAL;
11697 goto reply;
11698 }
11699 if (!osdmap.pg_exists(pgid)) {
11700 ss << "pg " << pgid << " does not exist";
11701 err = -ENOENT;
11702 goto reply;
11703 }
11704
11705 int64_t osd;
11706 if (!cmd_getval(cmdmap, "id", osd)) {
11707 ss << "unable to parse 'id' value '"
11708 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11709 err = -EINVAL;
11710 goto reply;
11711 }
11712 if (osd != -1 && !osdmap.exists(osd)) {
11713 ss << "osd." << osd << " does not exist";
11714 err = -ENOENT;
11715 goto reply;
11716 }
11717
11718 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11719 osdmap.require_min_compat_client < ceph_release_t::firefly) {
11720 ss << "require_min_compat_client "
11721 << osdmap.require_min_compat_client
11722 << " < firefly, which is required for primary-temp";
11723 err = -EPERM;
11724 goto reply;
11725 }
11726
11727 pending_inc.new_primary_temp[pgid] = osd;
11728 ss << "set " << pgid << " primary_temp mapping to " << osd;
11729 goto update;
11730 } else if (prefix == "pg repeer") {
11731 pg_t pgid;
11732 string pgidstr;
11733 cmd_getval(cmdmap, "pgid", pgidstr);
11734 if (!pgid.parse(pgidstr.c_str())) {
11735 ss << "invalid pgid '" << pgidstr << "'";
11736 err = -EINVAL;
11737 goto reply;
11738 }
11739 if (!osdmap.pg_exists(pgid)) {
11740 ss << "pg '" << pgidstr << "' does not exist";
11741 err = -ENOENT;
11742 goto reply;
11743 }
11744 vector<int> acting;
11745 int primary;
11746 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
11747 if (primary < 0) {
11748 err = -EAGAIN;
11749 ss << "pg currently has no primary";
11750 goto reply;
11751 }
11752 if (acting.size() > 1) {
11753 // map to just primary; it will map back to what it wants
11754 pending_inc.new_pg_temp[pgid] = { primary };
11755 } else {
11756 // hmm, pick another arbitrary osd to induce a change. Note
11757 // that this won't work if there is only one suitable OSD in the cluster.
11758 int i;
11759 bool done = false;
11760 for (i = 0; i < osdmap.get_max_osd(); ++i) {
11761 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
11762 continue;
11763 }
11764 pending_inc.new_pg_temp[pgid] = { primary, i };
11765 done = true;
11766 break;
11767 }
11768 if (!done) {
11769 err = -EAGAIN;
11770 ss << "not enough up OSDs in the cluster to force repeer";
11771 goto reply;
11772 }
11773 }
11774 goto update;
11775 } else if (prefix == "osd pg-upmap" ||
11776 prefix == "osd rm-pg-upmap" ||
11777 prefix == "osd pg-upmap-items" ||
11778 prefix == "osd rm-pg-upmap-items") {
11779 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
11780 ss << "min_compat_client "
11781 << osdmap.require_min_compat_client
11782 << " < luminous, which is required for pg-upmap. "
11783 << "Try 'ceph osd set-require-min-compat-client luminous' "
11784 << "before using the new interface";
11785 err = -EPERM;
11786 goto reply;
11787 }
11788 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11789 if (err == -EAGAIN)
11790 goto wait;
11791 if (err < 0)
11792 goto reply;
11793 string pgidstr;
11794 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11795 ss << "unable to parse 'pgid' value '"
11796 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11797 err = -EINVAL;
11798 goto reply;
11799 }
11800 pg_t pgid;
11801 if (!pgid.parse(pgidstr.c_str())) {
11802 ss << "invalid pgid '" << pgidstr << "'";
11803 err = -EINVAL;
11804 goto reply;
11805 }
11806 if (!osdmap.pg_exists(pgid)) {
11807 ss << "pg " << pgid << " does not exist";
11808 err = -ENOENT;
11809 goto reply;
11810 }
11811 if (pending_inc.old_pools.count(pgid.pool())) {
11812 ss << "pool of " << pgid << " is pending removal";
11813 err = -ENOENT;
11814 getline(ss, rs);
11815 wait_for_finished_proposal(op,
11816 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11817 return true;
11818 }
11819
11820 enum {
11821 OP_PG_UPMAP,
11822 OP_RM_PG_UPMAP,
11823 OP_PG_UPMAP_ITEMS,
11824 OP_RM_PG_UPMAP_ITEMS,
11825 } option;
11826
11827 if (prefix == "osd pg-upmap") {
11828 option = OP_PG_UPMAP;
11829 } else if (prefix == "osd rm-pg-upmap") {
11830 option = OP_RM_PG_UPMAP;
11831 } else if (prefix == "osd pg-upmap-items") {
11832 option = OP_PG_UPMAP_ITEMS;
11833 } else {
11834 option = OP_RM_PG_UPMAP_ITEMS;
11835 }
11836
11837 // check pending upmap changes
11838 switch (option) {
11839 case OP_PG_UPMAP: // fall through
11840 case OP_RM_PG_UPMAP:
11841 if (pending_inc.new_pg_upmap.count(pgid) ||
11842 pending_inc.old_pg_upmap.count(pgid)) {
11843 dout(10) << __func__ << " waiting for pending update on "
11844 << pgid << dendl;
11845 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11846 return true;
11847 }
11848 break;
11849
11850 case OP_PG_UPMAP_ITEMS: // fall through
11851 case OP_RM_PG_UPMAP_ITEMS:
11852 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11853 pending_inc.old_pg_upmap_items.count(pgid)) {
11854 dout(10) << __func__ << " waiting for pending update on "
11855 << pgid << dendl;
11856 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11857 return true;
11858 }
11859 break;
11860
11861 default:
11862 ceph_abort_msg("invalid option");
11863 }
11864
11865 switch (option) {
11866 case OP_PG_UPMAP:
11867 {
11868 vector<int64_t> id_vec;
11869 if (!cmd_getval(cmdmap, "id", id_vec)) {
11870 ss << "unable to parse 'id' value(s) '"
11871 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11872 err = -EINVAL;
11873 goto reply;
11874 }
11875
11876 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11877 if ((int)id_vec.size() < pool_min_size) {
11878 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11879 << pool_min_size << ")";
11880 err = -EINVAL;
11881 goto reply;
11882 }
11883
11884 int pool_size = osdmap.get_pg_pool_size(pgid);
11885 if ((int)id_vec.size() > pool_size) {
11886 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11887 << pool_size << ")";
11888 err = -EINVAL;
11889 goto reply;
11890 }
11891
11892 vector<int32_t> new_pg_upmap;
11893 for (auto osd : id_vec) {
11894 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11895 ss << "osd." << osd << " does not exist";
11896 err = -ENOENT;
11897 goto reply;
11898 }
11899 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11900 if (it != new_pg_upmap.end()) {
11901 ss << "osd." << osd << " already exists, ";
11902 continue;
11903 }
11904 new_pg_upmap.push_back(osd);
11905 }
11906
11907 if (new_pg_upmap.empty()) {
11908 ss << "no valid upmap items(pairs) is specified";
11909 err = -EINVAL;
11910 goto reply;
11911 }
11912
11913 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11914 new_pg_upmap.begin(), new_pg_upmap.end());
11915 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
11916 }
11917 break;
11918
11919 case OP_RM_PG_UPMAP:
11920 {
11921 pending_inc.old_pg_upmap.insert(pgid);
11922 ss << "clear " << pgid << " pg_upmap mapping";
11923 }
11924 break;
11925
11926 case OP_PG_UPMAP_ITEMS:
11927 {
11928 vector<int64_t> id_vec;
11929 if (!cmd_getval(cmdmap, "id", id_vec)) {
11930 ss << "unable to parse 'id' value(s) '"
11931 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11932 err = -EINVAL;
11933 goto reply;
11934 }
11935
11936 if (id_vec.size() % 2) {
11937 ss << "you must specify pairs of osd ids to be remapped";
11938 err = -EINVAL;
11939 goto reply;
11940 }
11941
11942 int pool_size = osdmap.get_pg_pool_size(pgid);
11943 if ((int)(id_vec.size() / 2) > pool_size) {
11944 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11945 << pool_size << ")";
11946 err = -EINVAL;
11947 goto reply;
11948 }
11949
11950 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11951 ostringstream items;
11952 items << "[";
11953 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11954 int from = *p++;
11955 int to = *p;
11956 if (from == to) {
11957 ss << "from osd." << from << " == to osd." << to << ", ";
11958 continue;
11959 }
11960 if (!osdmap.exists(from)) {
11961 ss << "osd." << from << " does not exist";
11962 err = -ENOENT;
11963 goto reply;
11964 }
11965 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11966 ss << "osd." << to << " does not exist";
11967 err = -ENOENT;
11968 goto reply;
11969 }
11970 pair<int32_t,int32_t> entry = make_pair(from, to);
11971 auto it = std::find(new_pg_upmap_items.begin(),
11972 new_pg_upmap_items.end(), entry);
11973 if (it != new_pg_upmap_items.end()) {
11974 ss << "osd." << from << " -> osd." << to << " already exists, ";
11975 continue;
11976 }
11977 new_pg_upmap_items.push_back(entry);
11978 items << from << "->" << to << ",";
11979 }
11980 string out(items.str());
11981 out.resize(out.size() - 1); // drop last ','
11982 out += "]";
11983
11984 if (new_pg_upmap_items.empty()) {
11985 ss << "no valid upmap items(pairs) is specified";
11986 err = -EINVAL;
11987 goto reply;
11988 }
11989
11990 pending_inc.new_pg_upmap_items[pgid] =
11991 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11992 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11993 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11994 }
11995 break;
11996
11997 case OP_RM_PG_UPMAP_ITEMS:
11998 {
11999 pending_inc.old_pg_upmap_items.insert(pgid);
12000 ss << "clear " << pgid << " pg_upmap_items mapping";
12001 }
12002 break;
12003
12004 default:
12005 ceph_abort_msg("invalid option");
12006 }
12007
12008 goto update;
12009 } else if (prefix == "osd primary-affinity") {
12010 int64_t id;
12011 if (!cmd_getval(cmdmap, "id", id)) {
12012 ss << "invalid osd id value '"
12013 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12014 err = -EINVAL;
12015 goto reply;
12016 }
12017 double w;
12018 if (!cmd_getval(cmdmap, "weight", w)) {
12019 ss << "unable to parse 'weight' value '"
12020 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12021 err = -EINVAL;
12022 goto reply;
12023 }
12024 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12025 if (ww < 0L) {
12026 ss << "weight must be >= 0";
12027 err = -EINVAL;
12028 goto reply;
12029 }
12030 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12031 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12032 ss << "require_min_compat_client "
12033 << osdmap.require_min_compat_client
12034 << " < firefly, which is required for primary-affinity";
12035 err = -EPERM;
12036 goto reply;
12037 }
12038 if (osdmap.exists(id)) {
12039 pending_inc.new_primary_affinity[id] = ww;
12040 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
12041 getline(ss, rs);
12042 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12043 get_last_committed() + 1));
12044 return true;
12045 } else {
12046 ss << "osd." << id << " does not exist";
12047 err = -ENOENT;
12048 goto reply;
12049 }
12050 } else if (prefix == "osd reweight") {
12051 int64_t id;
12052 if (!cmd_getval(cmdmap, "id", id)) {
12053 ss << "unable to parse osd id value '"
12054 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12055 err = -EINVAL;
12056 goto reply;
12057 }
12058 double w;
12059 if (!cmd_getval(cmdmap, "weight", w)) {
12060 ss << "unable to parse weight value '"
12061 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12062 err = -EINVAL;
12063 goto reply;
12064 }
12065 long ww = (int)((double)CEPH_OSD_IN*w);
12066 if (ww < 0L) {
12067 ss << "weight must be >= 0";
12068 err = -EINVAL;
12069 goto reply;
12070 }
12071 if (osdmap.exists(id)) {
12072 pending_inc.new_weight[id] = ww;
12073 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12074 getline(ss, rs);
12075 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12076 get_last_committed() + 1));
12077 return true;
12078 } else {
12079 ss << "osd." << id << " does not exist";
12080 err = -ENOENT;
12081 goto reply;
12082 }
12083 } else if (prefix == "osd reweightn") {
12084 map<int32_t, uint32_t> weights;
12085 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12086 if (err) {
12087 ss << "unable to parse 'weights' value '"
12088 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12089 goto reply;
12090 }
12091 pending_inc.new_weight.insert(weights.begin(), weights.end());
12092 wait_for_finished_proposal(
12093 op,
12094 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12095 return true;
12096 } else if (prefix == "osd lost") {
12097 int64_t id;
12098 if (!cmd_getval(cmdmap, "id", id)) {
12099 ss << "unable to parse osd id value '"
12100 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12101 err = -EINVAL;
12102 goto reply;
12103 }
12104 bool sure = false;
12105 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12106 if (!sure) {
12107 ss << "are you SURE? this might mean real, permanent data loss. pass "
12108 "--yes-i-really-mean-it if you really do.";
12109 err = -EPERM;
12110 goto reply;
12111 } else if (!osdmap.exists(id)) {
12112 ss << "osd." << id << " does not exist";
12113 err = -ENOENT;
12114 goto reply;
12115 } else if (!osdmap.is_down(id)) {
12116 ss << "osd." << id << " is not down";
12117 err = -EBUSY;
12118 goto reply;
12119 } else {
12120 epoch_t e = osdmap.get_info(id).down_at;
12121 pending_inc.new_lost[id] = e;
12122 ss << "marked osd lost in epoch " << e;
12123 getline(ss, rs);
12124 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12125 get_last_committed() + 1));
12126 return true;
12127 }
12128
12129 } else if (prefix == "osd destroy-actual" ||
12130 prefix == "osd purge-actual" ||
12131 prefix == "osd purge-new") {
12132 /* Destroying an OSD means that we don't expect to further make use of
12133 * the OSDs data (which may even become unreadable after this operation),
12134 * and that we are okay with scrubbing all its cephx keys and config-key
12135 * data (which may include lockbox keys, thus rendering the osd's data
12136 * unreadable).
12137 *
12138 * The OSD will not be removed. Instead, we will mark it as destroyed,
12139 * such that a subsequent call to `create` will not reuse the osd id.
12140 * This will play into being able to recreate the OSD, at the same
12141 * crush location, with minimal data movement.
12142 */
12143
12144 // make sure authmon is writeable.
12145 if (!mon->authmon()->is_writeable()) {
12146 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12147 << "osd destroy" << dendl;
12148 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12149 return false;
12150 }
12151
12152 int64_t id;
12153 if (!cmd_getval(cmdmap, "id", id)) {
12154 auto p = cmdmap.find("id");
12155 if (p == cmdmap.end()) {
12156 ss << "no osd id specified";
12157 } else {
12158 ss << "unable to parse osd id value '"
12159 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12160 }
12161 err = -EINVAL;
12162 goto reply;
12163 }
12164
12165 bool is_destroy = (prefix == "osd destroy-actual");
12166 if (!is_destroy) {
12167 ceph_assert("osd purge-actual" == prefix ||
12168 "osd purge-new" == prefix);
12169 }
12170
12171 bool sure = false;
12172 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12173 if (!sure) {
12174 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12175 << "This will mean real, permanent data loss, as well "
12176 << "as deletion of cephx and lockbox keys. "
12177 << "Pass --yes-i-really-mean-it if you really do.";
12178 err = -EPERM;
12179 goto reply;
12180 } else if (!osdmap.exists(id)) {
12181 ss << "osd." << id << " does not exist";
12182 err = 0; // idempotent
12183 goto reply;
12184 } else if (osdmap.is_up(id)) {
12185 ss << "osd." << id << " is not `down`.";
12186 err = -EBUSY;
12187 goto reply;
12188 } else if (is_destroy && osdmap.is_destroyed(id)) {
12189 ss << "destroyed osd." << id;
12190 err = 0;
12191 goto reply;
12192 }
12193
12194 if (prefix == "osd purge-new" &&
12195 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12196 ss << "osd." << id << " is not new";
12197 err = -EPERM;
12198 goto reply;
12199 }
12200
12201 bool goto_reply = false;
12202
12203 paxos->plug();
12204 if (is_destroy) {
12205 err = prepare_command_osd_destroy(id, ss);
12206 // we checked above that it should exist.
12207 ceph_assert(err != -ENOENT);
12208 } else {
12209 err = prepare_command_osd_purge(id, ss);
12210 if (err == -ENOENT) {
12211 err = 0;
12212 ss << "osd." << id << " does not exist.";
12213 goto_reply = true;
12214 }
12215 }
12216 paxos->unplug();
12217
12218 if (err < 0 || goto_reply) {
12219 goto reply;
12220 }
12221
12222 if (is_destroy) {
12223 ss << "destroyed osd." << id;
12224 } else {
12225 ss << "purged osd." << id;
12226 }
12227
12228 getline(ss, rs);
12229 wait_for_finished_proposal(op,
12230 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12231 force_immediate_propose();
12232 return true;
12233
12234 } else if (prefix == "osd new") {
12235
12236 // make sure authmon is writeable.
12237 if (!mon->authmon()->is_writeable()) {
12238 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12239 << "osd new" << dendl;
12240 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12241 return false;
12242 }
12243
12244 map<string,string> param_map;
12245
12246 bufferlist bl = m->get_data();
12247 string param_json = bl.to_str();
12248 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12249
12250 err = get_json_str_map(param_json, ss, &param_map);
12251 if (err < 0)
12252 goto reply;
12253
12254 dout(20) << __func__ << " osd new params " << param_map << dendl;
12255
12256 paxos->plug();
12257 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12258 paxos->unplug();
12259
12260 if (err < 0) {
12261 goto reply;
12262 }
12263
12264 if (f) {
12265 f->flush(rdata);
12266 } else {
12267 rdata.append(ss);
12268 }
12269
12270 if (err == EEXIST) {
12271 // idempotent operation
12272 err = 0;
12273 goto reply;
12274 }
12275
12276 wait_for_finished_proposal(op,
12277 new Monitor::C_Command(mon, op, 0, rs, rdata,
12278 get_last_committed() + 1));
12279 force_immediate_propose();
12280 return true;
12281
12282 } else if (prefix == "osd create") {
12283
12284 // optional id provided?
12285 int64_t id = -1, cmd_id = -1;
12286 if (cmd_getval(cmdmap, "id", cmd_id)) {
12287 if (cmd_id < 0) {
12288 ss << "invalid osd id value '" << cmd_id << "'";
12289 err = -EINVAL;
12290 goto reply;
12291 }
12292 dout(10) << " osd create got id " << cmd_id << dendl;
12293 }
12294
12295 uuid_d uuid;
12296 string uuidstr;
12297 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12298 if (!uuid.parse(uuidstr.c_str())) {
12299 ss << "invalid uuid value '" << uuidstr << "'";
12300 err = -EINVAL;
12301 goto reply;
12302 }
12303 // we only care about the id if we also have the uuid, to
12304 // ensure the operation's idempotency.
12305 id = cmd_id;
12306 }
12307
12308 int32_t new_id = -1;
12309 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12310 if (err < 0) {
12311 if (err == -EAGAIN) {
12312 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12313 return true;
12314 }
12315 // a check has failed; reply to the user.
12316 goto reply;
12317
12318 } else if (err == EEXIST) {
12319 // this is an idempotent operation; we can go ahead and reply.
12320 if (f) {
12321 f->open_object_section("created_osd");
12322 f->dump_int("osdid", new_id);
12323 f->close_section();
12324 f->flush(rdata);
12325 } else {
12326 ss << new_id;
12327 rdata.append(ss);
12328 }
12329 err = 0;
12330 goto reply;
12331 }
12332
12333 string empty_device_class;
12334 do_osd_create(id, uuid, empty_device_class, &new_id);
12335
12336 if (f) {
12337 f->open_object_section("created_osd");
12338 f->dump_int("osdid", new_id);
12339 f->close_section();
12340 f->flush(rdata);
12341 } else {
12342 ss << new_id;
12343 rdata.append(ss);
12344 }
12345 wait_for_finished_proposal(op,
12346 new Monitor::C_Command(mon, op, 0, rs, rdata,
12347 get_last_committed() + 1));
12348 return true;
12349
12350 } else if (prefix == "osd blacklist clear") {
12351 pending_inc.new_blacklist.clear();
12352 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12353 osdmap.get_blacklist(&blacklist);
12354 for (const auto &entry : blacklist) {
12355 pending_inc.old_blacklist.push_back(entry.first);
12356 }
12357 ss << " removed all blacklist entries";
12358 getline(ss, rs);
12359 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12360 get_last_committed() + 1));
12361 return true;
12362 } else if (prefix == "osd blacklist") {
12363 string addrstr;
12364 cmd_getval(cmdmap, "addr", addrstr);
12365 entity_addr_t addr;
12366 if (!addr.parse(addrstr.c_str(), 0)) {
12367 ss << "unable to parse address " << addrstr;
12368 err = -EINVAL;
12369 goto reply;
12370 }
12371 else {
12372 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12373 // always blacklist type ANY
12374 addr.set_type(entity_addr_t::TYPE_ANY);
12375 } else {
12376 addr.set_type(entity_addr_t::TYPE_LEGACY);
12377 }
12378
12379 string blacklistop;
12380 cmd_getval(cmdmap, "blacklistop", blacklistop);
12381 if (blacklistop == "add") {
12382 utime_t expires = ceph_clock_now();
12383 double d;
12384 // default one hour
12385 cmd_getval(cmdmap, "expire", d,
12386 g_conf()->mon_osd_blacklist_default_expire);
12387 expires += d;
12388
12389 pending_inc.new_blacklist[addr] = expires;
12390
12391 {
12392 // cancel any pending un-blacklisting request too
12393 auto it = std::find(pending_inc.old_blacklist.begin(),
12394 pending_inc.old_blacklist.end(), addr);
12395 if (it != pending_inc.old_blacklist.end()) {
12396 pending_inc.old_blacklist.erase(it);
12397 }
12398 }
12399
12400 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12401 getline(ss, rs);
12402 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12403 get_last_committed() + 1));
12404 return true;
12405 } else if (blacklistop == "rm") {
12406 if (osdmap.is_blacklisted(addr) ||
12407 pending_inc.new_blacklist.count(addr)) {
12408 if (osdmap.is_blacklisted(addr))
12409 pending_inc.old_blacklist.push_back(addr);
12410 else
12411 pending_inc.new_blacklist.erase(addr);
12412 ss << "un-blacklisting " << addr;
12413 getline(ss, rs);
12414 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12415 get_last_committed() + 1));
12416 return true;
12417 }
12418 ss << addr << " isn't blacklisted";
12419 err = 0;
12420 goto reply;
12421 }
12422 }
12423 } else if (prefix == "osd pool mksnap") {
12424 string poolstr;
12425 cmd_getval(cmdmap, "pool", poolstr);
12426 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12427 if (pool < 0) {
12428 ss << "unrecognized pool '" << poolstr << "'";
12429 err = -ENOENT;
12430 goto reply;
12431 }
12432 string snapname;
12433 cmd_getval(cmdmap, "snap", snapname);
12434 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12435 if (p->is_unmanaged_snaps_mode()) {
12436 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12437 err = -EINVAL;
12438 goto reply;
12439 } else if (p->snap_exists(snapname.c_str())) {
12440 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12441 err = 0;
12442 goto reply;
12443 } else if (p->is_tier()) {
12444 ss << "pool " << poolstr << " is a cache tier";
12445 err = -EINVAL;
12446 goto reply;
12447 }
12448 pg_pool_t *pp = 0;
12449 if (pending_inc.new_pools.count(pool))
12450 pp = &pending_inc.new_pools[pool];
12451 if (!pp) {
12452 pp = &pending_inc.new_pools[pool];
12453 *pp = *p;
12454 }
12455 if (pp->snap_exists(snapname.c_str())) {
12456 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12457 } else {
12458 pp->add_snap(snapname.c_str(), ceph_clock_now());
12459 pp->set_snap_epoch(pending_inc.epoch);
12460 ss << "created pool " << poolstr << " snap " << snapname;
12461 }
12462 getline(ss, rs);
12463 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12464 get_last_committed() + 1));
12465 return true;
12466 } else if (prefix == "osd pool rmsnap") {
12467 string poolstr;
12468 cmd_getval(cmdmap, "pool", poolstr);
12469 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12470 if (pool < 0) {
12471 ss << "unrecognized pool '" << poolstr << "'";
12472 err = -ENOENT;
12473 goto reply;
12474 }
12475 string snapname;
12476 cmd_getval(cmdmap, "snap", snapname);
12477 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12478 if (p->is_unmanaged_snaps_mode()) {
12479 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12480 err = -EINVAL;
12481 goto reply;
12482 } else if (!p->snap_exists(snapname.c_str())) {
12483 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12484 err = 0;
12485 goto reply;
12486 }
12487 pg_pool_t *pp = 0;
12488 if (pending_inc.new_pools.count(pool))
12489 pp = &pending_inc.new_pools[pool];
12490 if (!pp) {
12491 pp = &pending_inc.new_pools[pool];
12492 *pp = *p;
12493 }
12494 snapid_t sn = pp->snap_exists(snapname.c_str());
12495 if (sn) {
12496 pp->remove_snap(sn);
12497 pp->set_snap_epoch(pending_inc.epoch);
12498 ss << "removed pool " << poolstr << " snap " << snapname;
12499 } else {
12500 ss << "already removed pool " << poolstr << " snap " << snapname;
12501 }
12502 getline(ss, rs);
12503 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12504 get_last_committed() + 1));
12505 return true;
12506 } else if (prefix == "osd pool create") {
12507 int64_t pg_num, pg_num_min;
12508 int64_t pgp_num;
12509 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12510 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12511 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12512
12513 string pool_type_str;
12514 cmd_getval(cmdmap, "pool_type", pool_type_str);
12515 if (pool_type_str.empty())
12516 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12517
12518 string poolstr;
12519 cmd_getval(cmdmap, "pool", poolstr);
12520 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12521 if (pool_id >= 0) {
12522 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12523 if (pool_type_str != p->get_type_name()) {
12524 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12525 err = -EINVAL;
12526 } else {
12527 ss << "pool '" << poolstr << "' already exists";
12528 err = 0;
12529 }
12530 goto reply;
12531 }
12532
12533 int pool_type;
12534 if (pool_type_str == "replicated") {
12535 pool_type = pg_pool_t::TYPE_REPLICATED;
12536 } else if (pool_type_str == "erasure") {
12537 pool_type = pg_pool_t::TYPE_ERASURE;
12538 } else {
12539 ss << "unknown pool type '" << pool_type_str << "'";
12540 err = -EINVAL;
12541 goto reply;
12542 }
12543
12544 bool implicit_rule_creation = false;
12545 int64_t expected_num_objects = 0;
12546 string rule_name;
12547 cmd_getval(cmdmap, "rule", rule_name);
12548 string erasure_code_profile;
12549 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12550
12551 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12552 if (erasure_code_profile == "")
12553 erasure_code_profile = "default";
12554 //handle the erasure code profile
12555 if (erasure_code_profile == "default") {
12556 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12557 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12558 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12559 goto wait;
12560 }
12561
12562 map<string,string> profile_map;
12563 err = osdmap.get_erasure_code_profile_default(cct,
12564 profile_map,
12565 &ss);
12566 if (err)
12567 goto reply;
12568 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12569 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12570 goto wait;
12571 }
12572 }
12573 if (rule_name == "") {
12574 implicit_rule_creation = true;
12575 if (erasure_code_profile == "default") {
12576 rule_name = "erasure-code";
12577 } else {
12578 dout(1) << "implicitly use rule named after the pool: "
12579 << poolstr << dendl;
12580 rule_name = poolstr;
12581 }
12582 }
12583 cmd_getval(cmdmap, "expected_num_objects",
12584 expected_num_objects, int64_t(0));
12585 } else {
12586 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12587 // and put expected_num_objects to rule field
12588 if (erasure_code_profile != "") { // cmd is from CLI
12589 if (rule_name != "") {
12590 string interr;
12591 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12592 if (interr.length()) {
12593 ss << "error parsing integer value '" << rule_name << "': " << interr;
12594 err = -EINVAL;
12595 goto reply;
12596 }
12597 }
12598 rule_name = erasure_code_profile;
12599 } else { // cmd is well-formed
12600 cmd_getval(cmdmap, "expected_num_objects",
12601 expected_num_objects, int64_t(0));
12602 }
12603 }
12604
12605 if (!implicit_rule_creation && rule_name != "") {
12606 int rule;
12607 err = get_crush_rule(rule_name, &rule, &ss);
12608 if (err == -EAGAIN) {
12609 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12610 return true;
12611 }
12612 if (err)
12613 goto reply;
12614 }
12615
12616 if (expected_num_objects < 0) {
12617 ss << "'expected_num_objects' must be non-negative";
12618 err = -EINVAL;
12619 goto reply;
12620 }
12621
12622 set<int32_t> osds;
12623 osdmap.get_all_osds(osds);
12624 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12625 string type;
12626 if (!get_osd_objectstore_type(osd, &type)) {
12627 return type == "filestore";
12628 } else {
12629 return false;
12630 }
12631 });
12632
12633 if (has_filestore_osd &&
12634 expected_num_objects > 0 &&
12635 cct->_conf->filestore_merge_threshold > 0) {
12636 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12637 err = -EINVAL;
12638 goto reply;
12639 }
12640
12641 if (has_filestore_osd &&
12642 expected_num_objects == 0 &&
12643 cct->_conf->filestore_merge_threshold < 0) {
12644 int osds = osdmap.get_num_osds();
12645 bool sure = false;
12646 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12647 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12648 ss << "For better initial performance on pools expected to store a "
12649 << "large number of objects, consider supplying the "
12650 << "expected_num_objects parameter when creating the pool."
12651 << " Pass --yes-i-really-mean-it to ignore it";
12652 err = -EPERM;
12653 goto reply;
12654 }
12655 }
12656
12657 int64_t fast_read_param;
12658 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12659 FastReadType fast_read = FAST_READ_DEFAULT;
12660 if (fast_read_param == 0)
12661 fast_read = FAST_READ_OFF;
12662 else if (fast_read_param > 0)
12663 fast_read = FAST_READ_ON;
12664
12665 int64_t repl_size = 0;
12666 cmd_getval(cmdmap, "size", repl_size);
12667 int64_t target_size_bytes = 0;
12668 double target_size_ratio = 0.0;
12669 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12670 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12671
12672 string pg_autoscale_mode;
12673 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12674
12675 err = prepare_new_pool(poolstr,
12676 -1, // default crush rule
12677 rule_name,
12678 pg_num, pgp_num, pg_num_min,
12679 repl_size, target_size_bytes, target_size_ratio,
12680 erasure_code_profile, pool_type,
12681 (uint64_t)expected_num_objects,
12682 fast_read,
12683 pg_autoscale_mode,
12684 &ss);
12685 if (err < 0) {
12686 switch(err) {
12687 case -EEXIST:
12688 ss << "pool '" << poolstr << "' already exists";
12689 break;
12690 case -EAGAIN:
12691 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12692 return true;
12693 case -ERANGE:
12694 goto reply;
12695 default:
12696 goto reply;
12697 break;
12698 }
12699 } else {
12700 ss << "pool '" << poolstr << "' created";
12701 }
12702 getline(ss, rs);
12703 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12704 get_last_committed() + 1));
12705 return true;
12706
12707 } else if (prefix == "osd pool delete" ||
12708 prefix == "osd pool rm") {
12709 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12710 string poolstr, poolstr2, sure;
12711 cmd_getval(cmdmap, "pool", poolstr);
12712 cmd_getval(cmdmap, "pool2", poolstr2);
12713 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12714 if (pool < 0) {
12715 ss << "pool '" << poolstr << "' does not exist";
12716 err = 0;
12717 goto reply;
12718 }
12719
12720 bool force_no_fake = false;
12721 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12722 bool force = false;
12723 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12724 if (poolstr2 != poolstr ||
12725 (!force && !force_no_fake)) {
12726 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12727 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12728 << "followed by --yes-i-really-really-mean-it.";
12729 err = -EPERM;
12730 goto reply;
12731 }
12732 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12733 if (err == -EAGAIN) {
12734 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12735 return true;
12736 }
12737 if (err < 0)
12738 goto reply;
12739 goto update;
12740 } else if (prefix == "osd pool rename") {
12741 string srcpoolstr, destpoolstr;
12742 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12743 cmd_getval(cmdmap, "destpool", destpoolstr);
12744 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12745 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12746
12747 if (pool_src < 0) {
12748 if (pool_dst >= 0) {
12749 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12750 // of operations, assume this rename succeeded, as it is not changing
12751 // the current state. Make sure we output something understandable
12752 // for whoever is issuing the command, if they are paying attention,
12753 // in case it was not intentional; or to avoid a "wtf?" and a bug
12754 // report in case it was intentional, while expecting a failure.
12755 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12756 << destpoolstr << "' does -- assuming successful rename";
12757 err = 0;
12758 } else {
12759 ss << "unrecognized pool '" << srcpoolstr << "'";
12760 err = -ENOENT;
12761 }
12762 goto reply;
12763 } else if (pool_dst >= 0) {
12764 // source pool exists and so does the destination pool
12765 ss << "pool '" << destpoolstr << "' already exists";
12766 err = -EEXIST;
12767 goto reply;
12768 }
12769
12770 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12771 if (ret == 0) {
12772 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12773 } else {
12774 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12775 << cpp_strerror(ret);
12776 }
12777 getline(ss, rs);
12778 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12779 get_last_committed() + 1));
12780 return true;
12781
12782 } else if (prefix == "osd pool set") {
12783 err = prepare_command_pool_set(cmdmap, ss);
12784 if (err == -EAGAIN)
12785 goto wait;
12786 if (err < 0)
12787 goto reply;
12788
12789 getline(ss, rs);
12790 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12791 get_last_committed() + 1));
12792 return true;
12793 } else if (prefix == "osd tier add") {
12794 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12795 if (err == -EAGAIN)
12796 goto wait;
12797 if (err)
12798 goto reply;
12799 string poolstr;
12800 cmd_getval(cmdmap, "pool", poolstr);
12801 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12802 if (pool_id < 0) {
12803 ss << "unrecognized pool '" << poolstr << "'";
12804 err = -ENOENT;
12805 goto reply;
12806 }
12807 string tierpoolstr;
12808 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12809 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12810 if (tierpool_id < 0) {
12811 ss << "unrecognized pool '" << tierpoolstr << "'";
12812 err = -ENOENT;
12813 goto reply;
12814 }
12815 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12816 ceph_assert(p);
12817 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12818 ceph_assert(tp);
12819
12820 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12821 goto reply;
12822 }
12823
12824 // make sure new tier is empty
12825 string force_nonempty;
12826 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12827 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12828 if (pstats && pstats->stats.sum.num_objects != 0 &&
12829 force_nonempty != "--force-nonempty") {
12830 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12831 err = -ENOTEMPTY;
12832 goto reply;
12833 }
12834 if (tp->is_erasure()) {
12835 ss << "tier pool '" << tierpoolstr
12836 << "' is an ec pool, which cannot be a tier";
12837 err = -ENOTSUP;
12838 goto reply;
12839 }
12840 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12841 ((force_nonempty != "--force-nonempty") ||
12842 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12843 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12844 err = -ENOTEMPTY;
12845 goto reply;
12846 }
12847 // go
12848 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12849 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12850 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12851 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12852 return true;
12853 }
12854 np->tiers.insert(tierpool_id);
12855 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12856 ntp->tier_of = pool_id;
12857 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12858 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12859 get_last_committed() + 1));
12860 return true;
12861 } else if (prefix == "osd tier remove" ||
12862 prefix == "osd tier rm") {
12863 string poolstr;
12864 cmd_getval(cmdmap, "pool", poolstr);
12865 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12866 if (pool_id < 0) {
12867 ss << "unrecognized pool '" << poolstr << "'";
12868 err = -ENOENT;
12869 goto reply;
12870 }
12871 string tierpoolstr;
12872 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12873 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12874 if (tierpool_id < 0) {
12875 ss << "unrecognized pool '" << tierpoolstr << "'";
12876 err = -ENOENT;
12877 goto reply;
12878 }
12879 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12880 ceph_assert(p);
12881 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12882 ceph_assert(tp);
12883
12884 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12885 goto reply;
12886 }
12887
12888 if (p->tiers.count(tierpool_id) == 0) {
12889 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12890 err = 0;
12891 goto reply;
12892 }
12893 if (tp->tier_of != pool_id) {
12894 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12895 << osdmap.get_pool_name(tp->tier_of) << "': "
12896 // be scary about it; this is an inconsistency and bells must go off
12897 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12898 err = -EINVAL;
12899 goto reply;
12900 }
12901 if (p->read_tier == tierpool_id) {
12902 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12903 err = -EBUSY;
12904 goto reply;
12905 }
12906 // go
12907 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12908 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12909 if (np->tiers.count(tierpool_id) == 0 ||
12910 ntp->tier_of != pool_id ||
12911 np->read_tier == tierpool_id) {
12912 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12913 return true;
12914 }
12915 np->tiers.erase(tierpool_id);
12916 ntp->clear_tier();
12917 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12918 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12919 get_last_committed() + 1));
12920 return true;
12921 } else if (prefix == "osd tier set-overlay") {
12922 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12923 if (err == -EAGAIN)
12924 goto wait;
12925 if (err)
12926 goto reply;
12927 string poolstr;
12928 cmd_getval(cmdmap, "pool", poolstr);
12929 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12930 if (pool_id < 0) {
12931 ss << "unrecognized pool '" << poolstr << "'";
12932 err = -ENOENT;
12933 goto reply;
12934 }
12935 string overlaypoolstr;
12936 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12937 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12938 if (overlaypool_id < 0) {
12939 ss << "unrecognized pool '" << overlaypoolstr << "'";
12940 err = -ENOENT;
12941 goto reply;
12942 }
12943 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12944 ceph_assert(p);
12945 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12946 ceph_assert(overlay_p);
12947 if (p->tiers.count(overlaypool_id) == 0) {
12948 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12949 err = -EINVAL;
12950 goto reply;
12951 }
12952 if (p->read_tier == overlaypool_id) {
12953 err = 0;
12954 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12955 goto reply;
12956 }
12957 if (p->has_read_tier()) {
12958 ss << "pool '" << poolstr << "' has overlay '"
12959 << osdmap.get_pool_name(p->read_tier)
12960 << "'; please remove-overlay first";
12961 err = -EINVAL;
12962 goto reply;
12963 }
12964
12965 // go
12966 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12967 np->read_tier = overlaypool_id;
12968 np->write_tier = overlaypool_id;
12969 np->set_last_force_op_resend(pending_inc.epoch);
12970 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12971 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12972 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12973 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12974 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12975 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12976 get_last_committed() + 1));
12977 return true;
12978 } else if (prefix == "osd tier remove-overlay" ||
12979 prefix == "osd tier rm-overlay") {
12980 string poolstr;
12981 cmd_getval(cmdmap, "pool", poolstr);
12982 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12983 if (pool_id < 0) {
12984 ss << "unrecognized pool '" << poolstr << "'";
12985 err = -ENOENT;
12986 goto reply;
12987 }
12988 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12989 ceph_assert(p);
12990 if (!p->has_read_tier()) {
12991 err = 0;
12992 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12993 goto reply;
12994 }
12995
12996 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12997 goto reply;
12998 }
12999
13000 // go
13001 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13002 if (np->has_read_tier()) {
13003 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13004 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13005 nop->set_last_force_op_resend(pending_inc.epoch);
13006 }
13007 if (np->has_write_tier()) {
13008 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13009 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13010 nop->set_last_force_op_resend(pending_inc.epoch);
13011 }
13012 np->clear_read_tier();
13013 np->clear_write_tier();
13014 np->set_last_force_op_resend(pending_inc.epoch);
13015 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13016 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13017 get_last_committed() + 1));
13018 return true;
13019 } else if (prefix == "osd tier cache-mode") {
13020 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13021 if (err == -EAGAIN)
13022 goto wait;
13023 if (err)
13024 goto reply;
13025 string poolstr;
13026 cmd_getval(cmdmap, "pool", poolstr);
13027 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13028 if (pool_id < 0) {
13029 ss << "unrecognized pool '" << poolstr << "'";
13030 err = -ENOENT;
13031 goto reply;
13032 }
13033 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13034 ceph_assert(p);
13035 if (!p->is_tier()) {
13036 ss << "pool '" << poolstr << "' is not a tier";
13037 err = -EINVAL;
13038 goto reply;
13039 }
13040 string modestr;
13041 cmd_getval(cmdmap, "mode", modestr);
13042 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13043 if (int(mode) < 0) {
13044 ss << "'" << modestr << "' is not a valid cache mode";
13045 err = -EINVAL;
13046 goto reply;
13047 }
13048
13049 bool sure = false;
13050 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13051
13052 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13053 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13054 ss << "'" << modestr << "' is no longer a supported cache mode";
13055 err = -EPERM;
13056 goto reply;
13057 }
13058 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13059 mode != pg_pool_t::CACHEMODE_NONE &&
13060 mode != pg_pool_t::CACHEMODE_PROXY &&
13061 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13062 !sure) {
13063 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13064 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13065 err = -EPERM;
13066 goto reply;
13067 }
13068
13069 // pool already has this cache-mode set and there are no pending changes
13070 if (p->cache_mode == mode &&
13071 (pending_inc.new_pools.count(pool_id) == 0 ||
13072 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13073 ss << "set cache-mode for pool '" << poolstr << "'"
13074 << " to " << pg_pool_t::get_cache_mode_name(mode);
13075 err = 0;
13076 goto reply;
13077 }
13078
13079 /* Mode description:
13080 *
13081 * none: No cache-mode defined
13082 * forward: Forward all reads and writes to base pool [removed]
13083 * writeback: Cache writes, promote reads from base pool
13084 * readonly: Forward writes to base pool
13085 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13086 * proxy: Proxy all reads and writes to base pool
13087 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13088 *
13089 * Hence, these are the allowed transitions:
13090 *
13091 * none -> any
13092 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13093 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13094 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13095 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13096 * writeback -> readproxy || proxy
13097 * readonly -> any
13098 */
13099
13100 // We check if the transition is valid against the current pool mode, as
13101 // it is the only committed state thus far. We will blantly squash
13102 // whatever mode is on the pending state.
13103
13104 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13105 (mode != pg_pool_t::CACHEMODE_PROXY &&
13106 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13107 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13108 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13109 << "' pool; only '"
13110 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13111 << "' allowed.";
13112 err = -EINVAL;
13113 goto reply;
13114 }
13115 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13116 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13117 mode != pg_pool_t::CACHEMODE_PROXY &&
13118 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13119
13120 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13121 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13122 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13123
13124 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13125 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13126 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13127
13128 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13129 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13130 mode != pg_pool_t::CACHEMODE_PROXY &&
13131 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13132
13133 const pool_stat_t* pstats =
13134 mon->mgrstatmon()->get_pool_stat(pool_id);
13135
13136 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13137 ss << "unable to set cache-mode '"
13138 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13139 << "': dirty objects found";
13140 err = -EBUSY;
13141 goto reply;
13142 }
13143 }
13144 // go
13145 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13146 np->cache_mode = mode;
13147 // set this both when moving to and from cache_mode NONE. this is to
13148 // capture legacy pools that were set up before this flag existed.
13149 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13150 ss << "set cache-mode for pool '" << poolstr
13151 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13152 if (mode == pg_pool_t::CACHEMODE_NONE) {
13153 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13154 ceph_assert(base_pool);
13155 if (base_pool->read_tier == pool_id ||
13156 base_pool->write_tier == pool_id)
13157 ss <<" (WARNING: pool is still configured as read or write tier)";
13158 }
13159 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13160 get_last_committed() + 1));
13161 return true;
13162 } else if (prefix == "osd tier add-cache") {
13163 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13164 if (err == -EAGAIN)
13165 goto wait;
13166 if (err)
13167 goto reply;
13168 string poolstr;
13169 cmd_getval(cmdmap, "pool", poolstr);
13170 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13171 if (pool_id < 0) {
13172 ss << "unrecognized pool '" << poolstr << "'";
13173 err = -ENOENT;
13174 goto reply;
13175 }
13176 string tierpoolstr;
13177 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13178 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13179 if (tierpool_id < 0) {
13180 ss << "unrecognized pool '" << tierpoolstr << "'";
13181 err = -ENOENT;
13182 goto reply;
13183 }
13184 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13185 ceph_assert(p);
13186 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13187 ceph_assert(tp);
13188
13189 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13190 goto reply;
13191 }
13192
13193 int64_t size = 0;
13194 if (!cmd_getval(cmdmap, "size", size)) {
13195 ss << "unable to parse 'size' value '"
13196 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13197 err = -EINVAL;
13198 goto reply;
13199 }
13200 // make sure new tier is empty
13201 const pool_stat_t *pstats =
13202 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13203 if (pstats && pstats->stats.sum.num_objects != 0) {
13204 ss << "tier pool '" << tierpoolstr << "' is not empty";
13205 err = -ENOTEMPTY;
13206 goto reply;
13207 }
13208 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13209 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13210 if (int(mode) < 0) {
13211 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13212 err = -EINVAL;
13213 goto reply;
13214 }
13215 HitSet::Params hsp;
13216 auto& cache_hit_set_type =
13217 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13218 if (cache_hit_set_type == "bloom") {
13219 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13220 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13221 hsp = HitSet::Params(bsp);
13222 } else if (cache_hit_set_type == "explicit_hash") {
13223 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13224 } else if (cache_hit_set_type == "explicit_object") {
13225 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13226 } else {
13227 ss << "osd tier cache default hit set type '"
13228 << cache_hit_set_type << "' is not a known type";
13229 err = -EINVAL;
13230 goto reply;
13231 }
13232 // go
13233 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13234 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13235 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13236 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13237 return true;
13238 }
13239 np->tiers.insert(tierpool_id);
13240 np->read_tier = np->write_tier = tierpool_id;
13241 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13242 np->set_last_force_op_resend(pending_inc.epoch);
13243 ntp->set_last_force_op_resend(pending_inc.epoch);
13244 ntp->tier_of = pool_id;
13245 ntp->cache_mode = mode;
13246 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13247 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13248 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13249 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13250 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13251 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13252 ntp->hit_set_params = hsp;
13253 ntp->target_max_bytes = size;
13254 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13255 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13256 get_last_committed() + 1));
13257 return true;
13258 } else if (prefix == "osd pool set-quota") {
13259 string poolstr;
13260 cmd_getval(cmdmap, "pool", poolstr);
13261 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13262 if (pool_id < 0) {
13263 ss << "unrecognized pool '" << poolstr << "'";
13264 err = -ENOENT;
13265 goto reply;
13266 }
13267
13268 string field;
13269 cmd_getval(cmdmap, "field", field);
13270 if (field != "max_objects" && field != "max_bytes") {
13271 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13272 err = -EINVAL;
13273 goto reply;
13274 }
13275
13276 // val could contain unit designations, so we treat as a string
13277 string val;
13278 cmd_getval(cmdmap, "val", val);
13279 string tss;
13280 int64_t value;
13281 if (field == "max_objects") {
13282 value = strict_sistrtoll(val.c_str(), &tss);
13283 } else if (field == "max_bytes") {
13284 value = strict_iecstrtoll(val.c_str(), &tss);
13285 } else {
13286 ceph_abort_msg("unrecognized option");
13287 }
13288 if (!tss.empty()) {
13289 ss << "error parsing value '" << val << "': " << tss;
13290 err = -EINVAL;
13291 goto reply;
13292 }
13293
13294 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13295 if (field == "max_objects") {
13296 pi->quota_max_objects = value;
13297 } else if (field == "max_bytes") {
13298 pi->quota_max_bytes = value;
13299 } else {
13300 ceph_abort_msg("unrecognized option");
13301 }
13302 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13303 rs = ss.str();
13304 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13305 get_last_committed() + 1));
13306 return true;
13307 } else if (prefix == "osd pool application enable" ||
13308 prefix == "osd pool application disable" ||
13309 prefix == "osd pool application set" ||
13310 prefix == "osd pool application rm") {
13311 err = prepare_command_pool_application(prefix, cmdmap, ss);
13312 if (err == -EAGAIN) {
13313 goto wait;
13314 } else if (err < 0) {
13315 goto reply;
13316 } else {
13317 goto update;
13318 }
13319 } else if (prefix == "osd force-create-pg") {
13320 pg_t pgid;
13321 string pgidstr;
13322 cmd_getval(cmdmap, "pgid", pgidstr);
13323 if (!pgid.parse(pgidstr.c_str())) {
13324 ss << "invalid pgid '" << pgidstr << "'";
13325 err = -EINVAL;
13326 goto reply;
13327 }
13328 if (!osdmap.pg_exists(pgid)) {
13329 ss << "pg " << pgid << " should not exist";
13330 err = -ENOENT;
13331 goto reply;
13332 }
13333 bool sure = false;
13334 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13335 if (!sure) {
13336 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13337 << "that the cluster will give up ever trying to recover the lost data. Do this "
13338 << "only if you are certain that all copies of the PG are in fact lost and you are "
13339 << "willing to accept that the data is permanently destroyed. Pass "
13340 << "--yes-i-really-mean-it to proceed.";
13341 err = -EPERM;
13342 goto reply;
13343 }
13344 bool creating_now;
13345 {
13346 std::lock_guard<std::mutex> l(creating_pgs_lock);
13347 auto emplaced = creating_pgs.pgs.emplace(
13348 pgid,
13349 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13350 ceph_clock_now()));
13351 creating_now = emplaced.second;
13352 }
13353 if (creating_now) {
13354 ss << "pg " << pgidstr << " now creating, ok";
13355 // set the pool's CREATING flag so that (1) the osd won't ignore our
13356 // create message and (2) we won't propose any future pg_num changes
13357 // until after the PG has been instantiated.
13358 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13359 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13360 }
13361 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13362 err = 0;
13363 goto update;
13364 } else {
13365 ss << "pg " << pgid << " already creating";
13366 err = 0;
13367 goto reply;
13368 }
13369 } else {
13370 err = -EINVAL;
13371 }
13372
13373 reply:
13374 getline(ss, rs);
13375 if (err < 0 && rs.length() == 0)
13376 rs = cpp_strerror(err);
13377 mon->reply_command(op, err, rs, rdata, get_last_committed());
13378 return ret;
13379
13380 update:
13381 getline(ss, rs);
13382 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13383 get_last_committed() + 1));
13384 return true;
13385
13386 wait:
13387 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13388 return true;
13389 }
13390
13391 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13392 {
13393 op->mark_osdmon_event(__func__);
13394
13395 auto m = op->get_req<MPoolOp>();
13396 MonSession *session = op->get_session();
13397 if (!session) {
13398 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13399 return true;
13400 }
13401
13402 switch (m->op) {
13403 case POOL_OP_CREATE_UNMANAGED_SNAP:
13404 case POOL_OP_DELETE_UNMANAGED_SNAP:
13405 {
13406 const std::string* pool_name = nullptr;
13407 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13408 if (pg_pool != nullptr) {
13409 pool_name = &osdmap.get_pool_name(m->pool);
13410 }
13411
13412 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13413 session->entity_name, session->caps,
13414 session->get_peer_socket_addr(),
13415 pool_name)) {
13416 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13417 << "privileges. message: " << *m << std::endl
13418 << "caps: " << session->caps << dendl;
13419 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13420 return true;
13421 }
13422 }
13423 break;
13424 default:
13425 if (!session->is_capable("osd", MON_CAP_W)) {
13426 dout(0) << "got pool op from entity with insufficient privileges. "
13427 << "message: " << *m << std::endl
13428 << "caps: " << session->caps << dendl;
13429 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13430 return true;
13431 }
13432 break;
13433 }
13434
13435 return false;
13436 }
13437
13438 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
13439 {
13440 op->mark_osdmon_event(__func__);
13441 auto m = op->get_req<MPoolOp>();
13442
13443 if (enforce_pool_op_caps(op)) {
13444 return true;
13445 }
13446
13447 if (m->fsid != mon->monmap->fsid) {
13448 dout(0) << __func__ << " drop message on fsid " << m->fsid
13449 << " != " << mon->monmap->fsid << " for " << *m << dendl;
13450 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13451 return true;
13452 }
13453
13454 if (m->op == POOL_OP_CREATE)
13455 return preprocess_pool_op_create(op);
13456
13457 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
13458 if (p == nullptr) {
13459 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
13460 if (m->op == POOL_OP_DELETE) {
13461 _pool_op_reply(op, 0, osdmap.get_epoch());
13462 } else {
13463 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13464 }
13465 return true;
13466 }
13467
13468 // check if the snap and snapname exist
13469 bool snap_exists = false;
13470 if (p->snap_exists(m->name.c_str()))
13471 snap_exists = true;
13472
13473 switch (m->op) {
13474 case POOL_OP_CREATE_SNAP:
13475 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
13476 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13477 return true;
13478 }
13479 if (snap_exists) {
13480 _pool_op_reply(op, 0, osdmap.get_epoch());
13481 return true;
13482 }
13483 return false;
13484 case POOL_OP_CREATE_UNMANAGED_SNAP:
13485 if (p->is_pool_snaps_mode()) {
13486 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13487 return true;
13488 }
13489 return false;
13490 case POOL_OP_DELETE_SNAP:
13491 if (p->is_unmanaged_snaps_mode()) {
13492 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13493 return true;
13494 }
13495 if (!snap_exists) {
13496 _pool_op_reply(op, 0, osdmap.get_epoch());
13497 return true;
13498 }
13499 return false;
13500 case POOL_OP_DELETE_UNMANAGED_SNAP:
13501 if (p->is_pool_snaps_mode()) {
13502 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13503 return true;
13504 }
13505 if (_is_removed_snap(m->pool, m->snapid)) {
13506 _pool_op_reply(op, 0, osdmap.get_epoch());
13507 return true;
13508 }
13509 return false;
13510 case POOL_OP_DELETE:
13511 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
13512 _pool_op_reply(op, 0, osdmap.get_epoch());
13513 return true;
13514 }
13515 return false;
13516 case POOL_OP_AUID_CHANGE:
13517 return false;
13518 default:
13519 ceph_abort();
13520 break;
13521 }
13522
13523 return false;
13524 }
13525
13526 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13527 {
13528 if (!osdmap.have_pg_pool(pool)) {
13529 dout(10) << __func__ << " pool " << pool << " snap " << snap
13530 << " - pool dne" << dendl;
13531 return true;
13532 }
13533 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13534 dout(10) << __func__ << " pool " << pool << " snap " << snap
13535 << " - in osdmap removed_snaps_queue" << dendl;
13536 return true;
13537 }
13538 snapid_t begin, end;
13539 int r = lookup_purged_snap(pool, snap, &begin, &end);
13540 if (r == 0) {
13541 dout(10) << __func__ << " pool " << pool << " snap " << snap
13542 << " - purged, [" << begin << "," << end << ")" << dendl;
13543 return true;
13544 }
13545 return false;
13546 }
13547
13548 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13549 {
13550 if (pending_inc.old_pools.count(pool)) {
13551 dout(10) << __func__ << " pool " << pool << " snap " << snap
13552 << " - pool pending deletion" << dendl;
13553 return true;
13554 }
13555 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13556 dout(10) << __func__ << " pool " << pool << " snap " << snap
13557 << " - in pending new_removed_snaps" << dendl;
13558 return true;
13559 }
13560 return false;
13561 }
13562
13563 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13564 {
13565 op->mark_osdmon_event(__func__);
13566 auto m = op->get_req<MPoolOp>();
13567 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13568 if (pool >= 0) {
13569 _pool_op_reply(op, 0, osdmap.get_epoch());
13570 return true;
13571 }
13572
13573 return false;
13574 }
13575
// Apply an MPoolOp to the pending osdmap increment: pool create/delete
// dispatch, plus managed ("pool") and unmanaged snapshot create/delete.
// Returns true when a proposal was queued (the client reply is sent after
// the paxos round commits), false when we replied immediately (error or
// idempotent no-op against the committed map).
13576 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
13577 {
13578 op->mark_osdmon_event(__func__);
13579 auto m = op->get_req<MPoolOp>();
13580 dout(10) << "prepare_pool_op " << *m << dendl;
// Pool create/delete have dedicated handlers; everything below operates
// on the snapshots of an existing pool.
13581 if (m->op == POOL_OP_CREATE) {
13582 return prepare_pool_op_create(op);
13583 } else if (m->op == POOL_OP_DELETE) {
13584 return prepare_pool_op_delete(op);
13585 }
13586
13587 int ret = 0;
13588 bool changed = false;
13589
13590 if (!osdmap.have_pg_pool(m->pool)) {
13591 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13592 return false;
13593 }
13594
13595 const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
13596
// Preliminary checks against the *committed* map.  Replies issued here do
// not touch pending_inc: they cover invalid-mode errors and idempotent
// requests (snap already exists / already gone).
13597 switch (m->op) {
13598 case POOL_OP_CREATE_SNAP:
// Cache-tier pools cannot take pool snapshots.
13599 if (pool->is_tier()) {
13600 ret = -EINVAL;
13601 _pool_op_reply(op, ret, osdmap.get_epoch());
13602 return false;
13603 } // else, fall through
13604 case POOL_OP_DELETE_SNAP:
13605 if (!pool->is_unmanaged_snaps_mode()) {
13606 bool snap_exists = pool->snap_exists(m->name.c_str());
// Idempotent cases reply 0 below; otherwise break out and do real work.
13607 if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
13608 || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
13609 ret = 0;
13610 } else {
13611 break;
13612 }
13613 } else {
13614 ret = -EINVAL;
13615 }
13616 _pool_op_reply(op, ret, osdmap.get_epoch());
13617 return false;
13618
13619 case POOL_OP_DELETE_UNMANAGED_SNAP:
13620 // we won't allow removal of an unmanaged snapshot from a pool
13621 // not in unmanaged snaps mode.
13622 if (!pool->is_unmanaged_snaps_mode()) {
13623 _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
13624 return false;
13625 }
13626 /* fall-thru */
13627 case POOL_OP_CREATE_UNMANAGED_SNAP:
13628 // but we will allow creating an unmanaged snapshot on any pool
13629 // as long as it is not in 'pool' snaps mode.
13630 if (pool->is_pool_snaps_mode()) {
13631 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13632 return false;
13633 }
13634 }
13635
13636 // projected pool info
// Start from the pending copy of the pool if one is already staged this
// epoch, so successive ops in the same proposal compose correctly.
13637 pg_pool_t pp;
13638 if (pending_inc.new_pools.count(m->pool))
13639 pp = pending_inc.new_pools[m->pool];
13640 else
13641 pp = *osdmap.get_pg_pool(m->pool);
13642
13643 bufferlist reply_data;
13644
13645 // pool snaps vs unmanaged snaps are mutually exclusive
// Re-check against the *projected* pool: the pending increment may have
// flipped the snap mode since the committed-map checks above.
13646 switch (m->op) {
13647 case POOL_OP_CREATE_SNAP:
13648 case POOL_OP_DELETE_SNAP:
13649 if (pp.is_unmanaged_snaps_mode()) {
13650 ret = -EINVAL;
13651 goto out;
13652 }
13653 break;
13654
13655 case POOL_OP_CREATE_UNMANAGED_SNAP:
13656 case POOL_OP_DELETE_UNMANAGED_SNAP:
13657 if (pp.is_pool_snaps_mode()) {
13658 ret = -EINVAL;
13659 goto out;
13660 }
13661 }
13662
// Apply the requested change to the projected pool.
13663 switch (m->op) {
13664 case POOL_OP_CREATE_SNAP:
13665 if (!pp.snap_exists(m->name.c_str())) {
13666 pp.add_snap(m->name.c_str(), ceph_clock_now());
13667 dout(10) << "create snap in pool " << m->pool << " " << m->name
13668 << " seq " << pp.get_snap_epoch() << dendl;
13669 changed = true;
13670 }
13671 break;
13672
13673 case POOL_OP_DELETE_SNAP:
13674 {
13675 snapid_t s = pp.snap_exists(m->name.c_str());
13676 if (s) {
13677 pp.remove_snap(s);
13678 pending_inc.new_removed_snaps[m->pool].insert(s);
13679 changed = true;
13680 }
13681 }
13682 break;
13683
13684 case POOL_OP_CREATE_UNMANAGED_SNAP:
13685 {
// Reply payload carries the newly allocated snapid back to the client.
13686 uint64_t snapid = pp.add_unmanaged_snap(
13687 osdmap.require_osd_release < ceph_release_t::octopus);
13688 encode(snapid, reply_data);
13689 changed = true;
13690 }
13691 break;
13692
13693 case POOL_OP_DELETE_UNMANAGED_SNAP:
// Skip snaps already removed (committed or pending): idempotent success.
13694 if (!_is_removed_snap(m->pool, m->snapid) &&
13695 !_is_pending_removed_snap(m->pool, m->snapid)) {
// A snapid beyond the pool's current seq was never allocated.
13696 if (m->snapid > pp.get_snap_seq()) {
13697 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13698 return false;
13699 }
13700 pp.remove_unmanaged_snap(
13701 m->snapid,
13702 osdmap.require_osd_release < ceph_release_t::octopus);
13703 pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
13704 // also record the new seq as purged: this avoids a discontinuity
13705 // after all of the snaps have been purged, since the seq assigned
13706 // during removal lives in the same namespace as the actual snaps.
13707 pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
13708 changed = true;
13709 }
13710 break;
13711
13712 case POOL_OP_AUID_CHANGE:
// auid support was removed; reject explicitly.
13713 _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
13714 return false;
13715
13716 default:
13717 ceph_abort();
13718 break;
13719 }
13720
// Stage the modified pool into the pending increment.
13721 if (changed) {
13722 pp.set_snap_epoch(pending_inc.epoch);
13723 pending_inc.new_pools[m->pool] = pp;
13724 }
13725
13726 out:
// Reply (with ret and any reply_data) once the proposal commits.
13727 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
13728 return true;
13729 }
13730
13731 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13732 {
13733 op->mark_osdmon_event(__func__);
13734 int err = prepare_new_pool(op);
13735 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13736 return true;
13737 }
13738
13739 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13740 ostream *ss)
13741 {
13742 const string& poolstr = osdmap.get_pool_name(pool_id);
13743
13744 // If the Pool is in use by CephFS, refuse to delete it
13745 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13746 if (pending_fsmap.pool_in_use(pool_id)) {
13747 *ss << "pool '" << poolstr << "' is in use by CephFS";
13748 return -EBUSY;
13749 }
13750
13751 if (pool.tier_of >= 0) {
13752 *ss << "pool '" << poolstr << "' is a tier of '"
13753 << osdmap.get_pool_name(pool.tier_of) << "'";
13754 return -EBUSY;
13755 }
13756 if (!pool.tiers.empty()) {
13757 *ss << "pool '" << poolstr << "' has tiers";
13758 for(auto tier : pool.tiers) {
13759 *ss << " " << osdmap.get_pool_name(tier);
13760 }
13761 return -EBUSY;
13762 }
13763
13764 if (!g_conf()->mon_allow_pool_delete) {
13765 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13766 return -EPERM;
13767 }
13768
13769 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13770 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13771 return -EPERM;
13772 }
13773
13774 *ss << "pool '" << poolstr << "' removed";
13775 return 0;
13776 }
13777
13778 /**
13779 * Check if it is safe to add a tier to a base pool
13780 *
13781 * @return
13782 * True if the operation should proceed, false if we should abort here
13783 * (abort doesn't necessarily mean error, could be idempotency)
13784 */
13785 bool OSDMonitor::_check_become_tier(
13786 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13787 const int64_t base_pool_id, const pg_pool_t *base_pool,
13788 int *err,
13789 ostream *ss) const
13790 {
13791 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13792 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13793
13794 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13795 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13796 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13797 *err = -EBUSY;
13798 return false;
13799 }
13800
13801 if (base_pool->tiers.count(tier_pool_id)) {
13802 ceph_assert(tier_pool->tier_of == base_pool_id);
13803 *err = 0;
13804 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13805 << base_pool_name << "'";
13806 return false;
13807 }
13808
13809 if (base_pool->is_tier()) {
13810 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13811 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13812 << "multiple tiers are not yet supported.";
13813 *err = -EINVAL;
13814 return false;
13815 }
13816
13817 if (tier_pool->has_tiers()) {
13818 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13819 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13820 it != tier_pool->tiers.end(); ++it)
13821 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13822 *ss << " multiple tiers are not yet supported.";
13823 *err = -EINVAL;
13824 return false;
13825 }
13826
13827 if (tier_pool->is_tier()) {
13828 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13829 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13830 *err = -EINVAL;
13831 return false;
13832 }
13833
13834 *err = 0;
13835 return true;
13836 }
13837
13838
13839 /**
13840 * Check if it is safe to remove a tier from this base pool
13841 *
13842 * @return
13843 * True if the operation should proceed, false if we should abort here
13844 * (abort doesn't necessarily mean error, could be idempotency)
13845 */
13846 bool OSDMonitor::_check_remove_tier(
13847 const int64_t base_pool_id, const pg_pool_t *base_pool,
13848 const pg_pool_t *tier_pool,
13849 int *err, ostream *ss) const
13850 {
13851 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13852
13853 // Apply CephFS-specific checks
13854 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13855 if (pending_fsmap.pool_in_use(base_pool_id)) {
13856 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13857 // If the underlying pool is erasure coded and does not allow EC
13858 // overwrites, we can't permit the removal of the replicated tier that
13859 // CephFS relies on to access it
13860 *ss << "pool '" << base_pool_name <<
13861 "' does not allow EC overwrites and is in use by CephFS"
13862 " via its tier";
13863 *err = -EBUSY;
13864 return false;
13865 }
13866
13867 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13868 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13869 "tier is still in use as a writeback cache. Change the cache "
13870 "mode and flush the cache before removing it";
13871 *err = -EBUSY;
13872 return false;
13873 }
13874 }
13875
13876 *err = 0;
13877 return true;
13878 }
13879
// Stage removal of `pool` in the pending increment, after validating with
// _check_remove_pool against both the committed and any pending pool info.
// When mon_fake_pool_delete is set (and !no_fake) the pool is renamed to
// "<name>.<id>.DELETED" instead of being removed.  Also scrubs all
// pg_temp / primary_temp / pg_upmap / pg_upmap_items state referencing the
// pool, plus its crush choose_args.  Returns 0, a negative errno from the
// checks, or -EAGAIN if pending info looks inconsistent and the op should
// be retried after the current proposal commits.
13880 int OSDMonitor::_prepare_remove_pool(
13881 int64_t pool, ostream *ss, bool no_fake)
13882 {
13883 dout(10) << __func__ << " " << pool << dendl;
13884 const pg_pool_t *p = osdmap.get_pg_pool(pool);
// Validate against the committed pool info first.
13885 int r = _check_remove_pool(pool, *p, ss);
13886 if (r < 0)
13887 return r;
13888
13889 auto new_pool = pending_inc.new_pools.find(pool);
13890 if (new_pool != pending_inc.new_pools.end()) {
13891 // if there is a problem with the pending info, wait and retry
13892 // this op.
13893 const auto& p = new_pool->second;
13894 int r = _check_remove_pool(pool, p, ss);
13895 if (r < 0)
13896 return -EAGAIN;
13897 }
13898
// Idempotency: already queued for removal in this epoch.
13899 if (pending_inc.old_pools.count(pool)) {
13900 dout(10) << __func__ << " " << pool << " already pending removal"
13901 << dendl;
13902 return 0;
13903 }
13904
// Fake deletion: rename the pool out of the way instead of destroying data.
13905 if (g_conf()->mon_fake_pool_delete && !no_fake) {
13906 string old_name = osdmap.get_pool_name(pool);
13907 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13908 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13909 << old_name << " -> " << new_name << dendl;
13910 pending_inc.new_pool_names[pool] = new_name;
13911 return 0;
13912 }
13913
13914 // remove
13915 pending_inc.old_pools.insert(pool);
13916
13917 // remove any pg_temp mappings for this pool
13918 for (auto p = osdmap.pg_temp->begin();
13919 p != osdmap.pg_temp->end();
13920 ++p) {
13921 if (p->first.pool() == pool) {
13922 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
13923 << p->first << dendl;
// An empty pg_temp vector in the increment clears the mapping.
13924 pending_inc.new_pg_temp[p->first].clear();
13925 }
13926 }
13927 // remove any primary_temp mappings for this pool
13928 for (auto p = osdmap.primary_temp->begin();
13929 p != osdmap.primary_temp->end();
13930 ++p) {
13931 if (p->first.pool() == pool) {
13932 dout(10) << __func__ << " " << pool
13933 << " removing obsolete primary_temp" << p->first << dendl;
// -1 in the increment clears the primary_temp mapping.
13934 pending_inc.new_primary_temp[p->first] = -1;
13935 }
13936 }
13937 // remove any pg_upmap mappings for this pool
13938 for (auto& p : osdmap.pg_upmap) {
13939 if (p.first.pool() == pool) {
13940 dout(10) << __func__ << " " << pool
13941 << " removing obsolete pg_upmap "
13942 << p.first << dendl;
13943 pending_inc.old_pg_upmap.insert(p.first);
13944 }
13945 }
13946 // remove any pending pg_upmap mappings for this pool
13947 {
13948 auto it = pending_inc.new_pg_upmap.begin();
13949 while (it != pending_inc.new_pg_upmap.end()) {
13950 if (it->first.pool() == pool) {
13951 dout(10) << __func__ << " " << pool
13952 << " removing pending pg_upmap "
13953 << it->first << dendl;
13954 it = pending_inc.new_pg_upmap.erase(it);
13955 } else {
13956 it++;
13957 }
13958 }
13959 }
13960 // remove any pg_upmap_items mappings for this pool
13961 for (auto& p : osdmap.pg_upmap_items) {
13962 if (p.first.pool() == pool) {
13963 dout(10) << __func__ << " " << pool
13964 << " removing obsolete pg_upmap_items " << p.first
13965 << dendl;
13966 pending_inc.old_pg_upmap_items.insert(p.first);
13967 }
13968 }
13969 // remove any pending pg_upmap_items mappings for this pool
13970 {
13971 auto it = pending_inc.new_pg_upmap_items.begin();
13972 while (it != pending_inc.new_pg_upmap_items.end()) {
13973 if (it->first.pool() == pool) {
13974 dout(10) << __func__ << " " << pool
13975 << " removing pending pg_upmap_items "
13976 << it->first << dendl;
13977 it = pending_inc.new_pg_upmap_items.erase(it);
13978 } else {
13979 it++;
13980 }
13981 }
13982 }
13983
13984 // remove any choose_args for this pool
13985 CrushWrapper newcrush;
13986 _get_pending_crush(newcrush);
13987 if (newcrush.have_choose_args(pool)) {
13988 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13989 newcrush.rm_choose_args(pool);
// Re-encode the whole crush map into the increment after the edit.
13990 pending_inc.crush.clear();
13991 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13992 }
13993 return 0;
13994 }
13995
13996 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13997 {
13998 dout(10) << "_prepare_rename_pool " << pool << dendl;
13999 if (pending_inc.old_pools.count(pool)) {
14000 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14001 return -ENOENT;
14002 }
14003 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14004 p != pending_inc.new_pool_names.end();
14005 ++p) {
14006 if (p->second == newname && p->first != pool) {
14007 return -EEXIST;
14008 }
14009 }
14010
14011 pending_inc.new_pool_names[pool] = newname;
14012 return 0;
14013 }
14014
14015 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14016 {
14017 op->mark_osdmon_event(__func__);
14018 auto m = op->get_req<MPoolOp>();
14019 ostringstream ss;
14020 int ret = _prepare_remove_pool(m->pool, &ss, false);
14021 if (ret == -EAGAIN) {
14022 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14023 return true;
14024 }
14025 if (ret < 0)
14026 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14027 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14028 pending_inc.epoch));
14029 return true;
14030 }
14031
14032 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14033 int ret, epoch_t epoch, bufferlist *blp)
14034 {
14035 op->mark_osdmon_event(__func__);
14036 auto m = op->get_req<MPoolOp>();
14037 dout(20) << "_pool_op_reply " << ret << dendl;
14038 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14039 ret, epoch, get_last_committed(), blp);
14040 mon->send_reply(op, reply);
14041 }
14042
14043 void OSDMonitor::convert_pool_priorities(void)
14044 {
14045 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14046 int64_t max_prio = 0;
14047 int64_t min_prio = 0;
14048 for (const auto &i : osdmap.get_pools()) {
14049 const auto &pool = i.second;
14050
14051 if (pool.opts.is_set(key)) {
14052 int64_t prio = 0;
14053 pool.opts.get(key, &prio);
14054 if (prio > max_prio)
14055 max_prio = prio;
14056 if (prio < min_prio)
14057 min_prio = prio;
14058 }
14059 }
14060 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14061 dout(20) << __func__ << " nothing to fix" << dendl;
14062 return;
14063 }
14064 // Current pool priorities exceeds new maximum
14065 for (const auto &i : osdmap.get_pools()) {
14066 const auto pool_id = i.first;
14067 pg_pool_t pool = i.second;
14068
14069 int64_t prio = 0;
14070 pool.opts.get(key, &prio);
14071 int64_t n;
14072
14073 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14074 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14075 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14076 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14077 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14078 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14079 } else {
14080 continue;
14081 }
14082 if (n == 0) {
14083 pool.opts.unset(key);
14084 } else {
14085 pool.opts.set(key, static_cast<int64_t>(n));
14086 }
14087 dout(10) << __func__ << " pool " << pool_id
14088 << " recovery_priority adjusted "
14089 << prio << " to " << n << dendl;
14090 pool.last_change = pending_inc.epoch;
14091 pending_inc.new_pools[pool_id] = pool;
14092 }
14093 }