// ceph/src/mon/OSDMonitor.cc (imported from ceph 15.2.8)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes owned by this service; advertised via
// get_store_prefixes() so sync/trim know which keyspaces are ours.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
  - note that the {removed,purged}_snap put the last snap in the key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
133 struct OSDMemCache : public PriorityCache::PriCache {
134 OSDMonitor *osdmon;
135 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
136 int64_t committed_bytes = 0;
137 double cache_ratio = 0;
138
139 OSDMemCache(OSDMonitor *m) : osdmon(m) {};
140
141 virtual uint64_t _get_used_bytes() const = 0;
142
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri, uint64_t total_cache) const {
145 int64_t assigned = get_cache_bytes(pri);
146
147 switch (pri) {
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1:
150 {
151 int64_t request = _get_used_bytes();
152 return (request > assigned) ? request - assigned : 0;
153 }
154 default:
155 break;
156 }
157 return -EOPNOTSUPP;
158 }
159
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
161 return cache_bytes[pri];
162 }
163
164 virtual int64_t get_cache_bytes() const {
165 int64_t total = 0;
166
167 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
168 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
169 total += get_cache_bytes(pri);
170 }
171 return total;
172 }
173
174 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
175 cache_bytes[pri] = bytes;
176 }
177 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
178 cache_bytes[pri] += bytes;
179 }
180 virtual int64_t commit_cache_size(uint64_t total_cache) {
181 committed_bytes = PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache);
183 return committed_bytes;
184 }
185 virtual int64_t get_committed_size() const {
186 return committed_bytes;
187 }
188 virtual double get_cache_ratio() const {
189 return cache_ratio;
190 }
191 virtual void set_cache_ratio(double ratio) {
192 cache_ratio = ratio;
193 }
194 virtual string get_cache_name() const = 0;
195 };
196
197 struct IncCache : public OSDMemCache {
198 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
199
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon->inc_osd_cache.get_bytes();
202 }
203
204 virtual string get_cache_name() const {
205 return "OSDMap Inc Cache";
206 }
207
208 uint64_t _get_num_osdmaps() const {
209 return osdmon->inc_osd_cache.get_size();
210 }
211 };
212
213 struct FullCache : public OSDMemCache {
214 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
215
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon->full_osd_cache.get_bytes();
218 }
219
220 virtual string get_cache_name() const {
221 return "OSDMap Full Cache";
222 }
223
224 uint64_t _get_num_osdmaps() const {
225 return osdmon->full_osd_cache.get_size();
226 }
227 };
228
// Shared with the priority cache manager; created in the OSDMonitor
// constructor.
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Caps on per-pool application metadata (application count, keys per
// application, and key/value string length).
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
251 bool is_unmanaged_snap_op_permitted(CephContext* cct,
252 const KeyServer& key_server,
253 const EntityName& entity_name,
254 const MonCap& mon_caps,
255 const entity_addr_t& peer_socket_addr,
256 const std::string* pool_name)
257 {
258 typedef std::map<std::string, std::string> CommandArgs;
259
260 if (mon_caps.is_capable(
261 cct, entity_name, "osd",
262 "osd pool op unmanaged-snap",
263 (pool_name == nullptr ?
264 CommandArgs{} /* pool DNE, require unrestricted cap */ :
265 CommandArgs{{"poolname", *pool_name}}),
266 false, true, false,
267 peer_socket_addr)) {
268 return true;
269 }
270
271 AuthCapsInfo caps_info;
272 if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
273 caps_info)) {
274 dout(10) << "unable to locate OSD cap data for " << entity_name
275 << " in auth db" << dendl;
276 return false;
277 }
278
279 string caps_str;
280 if (caps_info.caps.length() > 0) {
281 auto p = caps_info.caps.cbegin();
282 try {
283 decode(caps_str, p);
284 } catch (const buffer::error &err) {
285 derr << "corrupt OSD cap data for " << entity_name << " in auth db"
286 << dendl;
287 return false;
288 }
289 }
290
291 OSDCap osd_cap;
292 if (!osd_cap.parse(caps_str, nullptr)) {
293 dout(10) << "unable to parse OSD cap data for " << entity_name
294 << " in auth db" << dendl;
295 return false;
296 }
297
298 // if the entity has write permissions in one or all pools, permit
299 // usage of unmanaged-snapshots
300 if (osd_cap.allow_all()) {
301 return true;
302 }
303
304 for (auto& grant : osd_cap.grants) {
305 if (grant.profile.is_valid()) {
306 for (auto& profile_grant : grant.profile_grants) {
307 if (is_osd_writable(profile_grant, pool_name)) {
308 return true;
309 }
310 }
311 } else if (is_osd_writable(grant, pool_name)) {
312 return true;
313 }
314 }
315
316 return false;
317 }
318
319 } // anonymous namespace
320
// Record that pg 'ps' of this pool reached 'last_epoch_clean'.
// Maintains three pieces of state:
//  - epoch_by_pg: per-pg last clean epoch (0 = never reported)
//  - floor: minimum last-clean epoch over reported pgs
//  - next_missing: first pg index that has not yet reported
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow on demand; new slots start at 0 (= not yet reported)
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this pg may have been the minimum; rescan for the new floor
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported pgs
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
352 void LastEpochClean::remove_pool(uint64_t pool)
353 {
354 report_by_pool.erase(pool);
355 }
356
357 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
358 {
359 auto& lec = report_by_pool[pg.pool()];
360 return lec.report(pg.ps(), last_epoch_clean);
361 }
362
363 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
364 {
365 auto floor = latest.get_epoch();
366 for (auto& pool : latest.get_pools()) {
367 auto reported = report_by_pool.find(pool.first);
368 if (reported == report_by_pool.end()) {
369 return 0;
370 }
371 if (reported->second.next_missing < pool.second.get_pg_num()) {
372 return 0;
373 }
374 if (reported->second.floor < floor) {
375 floor = reported->second.floor;
376 }
377 }
378 return floor;
379 }
380
381 void LastEpochClean::dump(Formatter *f) const
382 {
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393 }
394
395 class C_UpdateCreatingPGs : public Context {
396 public:
397 OSDMonitor *osdmon;
398 utime_t start;
399 epoch_t epoch;
400 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
401 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
402 void finish(int r) override {
403 if (r >= 0) {
404 utime_t end = ceph_clock_now();
405 dout(10) << "osdmap epoch " << epoch << " mapping took "
406 << (end - start) << " seconds" << dendl;
407 osdmon->update_creating_pgs();
408 osdmon->check_pg_creates_subs();
409 }
410 }
411 };
412
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
		<< "(" << mon->get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD paxos service.  Registers this object as a config
// observer and sizes the inc/full osdmap LRU caches; if priority-cache
// sizing fails we fall back to the plain mon_osd_cache_size without
// autotuning.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  // file-scope cache shims handed to the priority cache manager later
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445 const char **OSDMonitor::get_tracked_conf_keys() const
446 {
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454 }
455
456 void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
457 const std::set<std::string> &changed)
458 {
459 dout(10) << __func__ << " " << changed << dendl;
460
461 if (changed.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
463 }
464 if (changed.count("mon_memory_target") ||
465 changed.count("rocksdb_cache_size")) {
466 int r = _update_mon_cache_settings();
467 if (r < 0) {
468 derr << __func__ << " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
472 << ". Unable to update cache size."
473 << dendl;
474 }
475 }
476 }
477
478 void OSDMonitor::_set_cache_autotuning()
479 {
480 if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
481 // Disable cache autotuning
482 std::lock_guard l(balancer_lock);
483 pcm = nullptr;
484 }
485
486 if (g_conf()->mon_memory_autotune && pcm == nullptr) {
487 int r = register_cache_with_pcm();
488 if (r < 0) {
489 dout(10) << __func__
490 << " Error while registering osdmon caches with pcm."
491 << " Cache auto tuning not enabled."
492 << dendl;
493 mon_memory_autotune = false;
494 } else {
495 mon_memory_autotune = true;
496 }
497 }
498 }
499
// Apply new mon_memory_target / rocksdb_cache_size values from config.
// Validates the sizes, recomputes the kv/inc/full cache ratios (rolling
// the member values back on failure), and when autotuning is active
// re-tunes the pcm with the new min/max/target.
// Returns 0 on success, -EINVAL on invalid sizes or missing caches.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if ratio setup fails
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // same sizing logic as register_cache_with_pcm(): reserve 'base' plus
  // expected fragmentation, the rest becomes the cache ceiling
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
559
// Read cache sizing knobs from config when autotuning is enabled and
// give the inc/full osdmap LRU caches their initial (minimum) sizes.
// Returns -EINVAL when mon_memory_target or mon_osd_cache_size_min are
// not positive; returns 0 (and does nothing) when autotuning is off.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // NOTE: reuses the osd_* knobs for base size and expected
    // fragmentation — there are no mon-specific equivalents
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
584
585 bool OSDMonitor::_have_pending_crush()
586 {
587 return pending_inc.crush.length() > 0;
588 }
589
// Crush map of the committed osdmap (ignores any pending changes).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
595 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
596 {
597 bufferlist bl;
598 if (pending_inc.crush.length())
599 bl = pending_inc.crush;
600 else
601 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
602
603 auto p = bl.cbegin();
604 newcrush.decode(p);
605 }
606
// Build the very first osdmap (epoch 1) for a new cluster and stash it
// in pending_inc as a full map.  Uses a pre-seeded "mkfs" osdmap from
// the store when present, otherwise builds a simple default map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
666 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
667 {
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
670 s.insert(OSD_METADATA_PREFIX);
671 s.insert(OSD_SNAP_PREFIX);
672 }
673
// Bring the in-memory osdmap up to the last paxos-committed epoch:
// locate/repair the latest stashed full map, load it, then replay all
// newer incrementals, writing the regenerated full maps back to the
// store.  Finally refresh dependent state (creating pgs, subs, msgr
// features, down->out tracking).  'need_bootstrap' is unused here.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job is now against a stale epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first full_<v> key that exists
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE(review): each dout(20)..dendl pair below appears to rely on
	// the scope opened by the dout macro staying open until dendl, so
	// the two 'jf' declarations do not collide — confirm against
	// common/dout.h before restructuring.
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction once it grows large, to bound
    // memory usage while replaying many incrementals
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
	// could be created *or* destroyed, but we can safely drop it
	osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile down_pending_out with the map: track newly-down osds,
  // forget osds that came back up or went out
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
912 int OSDMonitor::register_cache_with_pcm()
913 {
914 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
915 derr << __func__ << " Invalid memory size specified for mon caches."
916 << " Caches will not be auto-tuned."
917 << dendl;
918 return -EINVAL;
919 }
920 uint64_t base = mon_memory_base;
921 double fragmentation = mon_memory_fragmentation;
922 // For calculating total target memory, consider rocksdb cache size.
923 uint64_t target = mon_memory_target;
924 uint64_t min = mon_memory_min;
925 uint64_t max = min;
926
927 // Apply the same logic as in bluestore to set the max amount
928 // of memory to use for cache. Assume base memory for OSDMaps
929 // and then add in some overhead for fragmentation.
930 uint64_t ltarget = (1.0 - fragmentation) * target;
931 if (ltarget > base + min) {
932 max = ltarget - base;
933 }
934
935 rocksdb_binned_kv_cache = mon->store->get_priority_cache();
936 if (!rocksdb_binned_kv_cache) {
937 derr << __func__ << " not using rocksdb" << dendl;
938 return -EINVAL;
939 }
940
941 int r = _set_cache_ratios();
942 if (r < 0) {
943 derr << __func__ << " Cache ratios for pcm could not be set."
944 << " Review the kv (rocksdb) and mon_memory_target sizes."
945 << dendl;
946 return -EINVAL;
947 }
948
949 pcm = std::make_shared<PriorityCache::Manager>(
950 cct, min, max, target, true);
951 pcm->insert("kv", rocksdb_binned_kv_cache, true);
952 pcm->insert("inc", inc_cache, true);
953 pcm->insert("full", full_cache, true);
954 dout(1) << __func__ << " pcm target: " << target
955 << " pcm max: " << max
956 << " pcm min: " << min
957 << " inc_osd_cache size: " << inc_osd_cache.get_size()
958 << dendl;
959 return 0;
960 }
961
962 int OSDMonitor::_set_cache_ratios()
963 {
964 double old_cache_kv_ratio = cache_kv_ratio;
965
966 // Set the cache ratios for kv(rocksdb), inc and full caches
967 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
968 if (cache_kv_ratio >= 1.0) {
969 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
970 << ") must be in range [0,<1.0]."
971 << dendl;
972 cache_kv_ratio = old_cache_kv_ratio;
973 return -EINVAL;
974 }
975 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
976 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
977 inc_cache->set_cache_ratio(cache_inc_ratio);
978 full_cache->set_cache_ratio(cache_full_ratio);
979
980 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
981 << " inc ratio " << cache_inc_ratio
982 << " full ratio " << cache_full_ratio
983 << dendl;
984 return 0;
985 }
986
987 void OSDMonitor::start_mapping()
988 {
989 // initiate mapping job
990 if (mapping_job) {
991 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
992 << dendl;
993 mapping_job->abort();
994 }
995 if (!osdmap.get_pools().empty()) {
996 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
997 mapping_job = mapping.start_update(osdmap, mapper,
998 g_conf()->mon_osd_mapping_pgs_per_chunk);
999 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1000 << " at " << fin->start << dendl;
1001 mapping_job->set_finish_event(fin);
1002 } else {
1003 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1004 mapping_job = nullptr;
1005 }
1006 }
1007
1008 void OSDMonitor::update_msgr_features()
1009 {
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025 }
1026
1027 void OSDMonitor::on_active()
1028 {
1029 update_logger();
1030
1031 if (mon->is_leader()) {
1032 mon->clog->debug() << "osdmap " << osdmap;
1033 if (!priority_convert) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert = true;
1037 }
1038 } else {
1039 list<MonOpRequestRef> ls;
1040 take_all_failures(ls);
1041 while (!ls.empty()) {
1042 MonOpRequestRef op = ls.front();
1043 op->mark_osdmon_event(__func__);
1044 dispatch(op);
1045 ls.pop_front();
1046 }
1047 }
1048 start_mapping();
1049 }
1050
void OSDMonitor::on_restart()
{
  // Forget all recorded OSD report times; they predate the restart and are
  // no longer meaningful.
  last_osd_report.clear();
}
1055
1056 void OSDMonitor::on_shutdown()
1057 {
1058 dout(10) << __func__ << dendl;
1059 if (mapping_job) {
1060 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1061 << dendl;
1062 mapping_job->abort();
1063 }
1064
1065 // discard failure info, waiters
1066 list<MonOpRequestRef> ls;
1067 take_all_failures(ls);
1068 ls.clear();
1069 }
1070
1071 void OSDMonitor::update_logger()
1072 {
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079 }
1080
void OSDMonitor::create_pending()
{
  // Begin staging the next incremental (epoch + 1) against the committed
  // osdmap.  Any previously staged-but-uncommitted state is discarded.
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // If any full-ness ratio in the committed map is unset/invalid, seed it
    // from config.  Config values > 1.0 are treated as percentages and
    // scaled down to a [0,1] ratio.
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      // Copy the pool into the pending map first (if not already staged)
      // so we modify the staged copy, not the committed one.
      if (pending_inc.new_pools.count(pool_id) == 0) {
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // Stage the rewritten crush map in the pending incremental.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
1150
// Recompute the set of PGs that are being created, based on the staged
// incremental @c inc and the resulting map @c nextmap.  Works on (and
// returns) a private copy of creating_pgs; the caller is responsible for
// persisting/installing the result.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // Snapshot the shared creating_pgs under its lock; everything below
    // operates on the local copy.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // Scan both the committed pools and any pools newly added by this
    // incremental for PGs that still need to be created.
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // Drop creating-PG state for pools deleted by this incremental.
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    // e.g. PGs belonging to pools that no longer exist in nextmap.
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  // Move queued per-pool PG ranges into the active creating set, capped at
  // mon_osd_max_creating_pgs in-flight creations (minimum 1).
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // Take as many PGs from this pool's [start,end) range as the cap allows.
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      // Current up/acting sets for this PG in the next map.
      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        // Existing entry: detect whether the old->new map transition starts
        // a new peering interval, and if so update history/past_intervals.
        std::stringstream debug;
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          // Record a split if pg_num changed across the transition.
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
// Pre-populate pg_temp entries in pending_inc for PGs whose mapping will
// change with this incremental.  Either primes every PG ("all") or only the
// PGs touching "interesting" OSDs, depending on what changed and on a cost
// estimate; the work is bounded by mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // A new crush map or newly-up OSDs can move mappings anywhere: prime all.
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // OSDs going down (state bit set while currently up) are interesting.
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  // Weight reductions are interesting; a weight increase forces "all".
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  // Nothing changed that warrants priming.
  if (!all && osds.empty())
    return;

  if (!all) {
    // Estimate the per-OSD work (PGs on one OSD times the OSD count); fall
    // back to "all" if that would exceed the configured fraction of PGs.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // Materialize the post-incremental map to compute the new mappings.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // Prime every PG via a parallel job, bounded by the configured timeout.
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // Prime only the PGs acting on the interesting OSDs, checking the time
    // budget every `chunk` PGs and deduplicating shared PGs via did_pgs.
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1440
// Stage a pg_temp entry for @c pgid in pending_inc, preserving the current
// acting set while the mapping transitions to what @c next computes.
// A series of early returns skips PGs for which priming cannot help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  // Still-creating PGs have no established acting set worth preserving.
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // Mapping under the current (committed) map...
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // ...and under the next (pending) map.
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  // An empty acting vector here means "remove the pg_temp mapping".
  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // Serialized because this may run from parallel mapper threads.
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace does not overwrite an existing entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489 /**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494 {
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
1506 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
1520 if (g_conf()->mon_osd_prime_pg_temp) {
1521 maybe_prime_pg_temp();
1522 }
1523 }
1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
1541 ++p;
1542 }
1543 }
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
1548 if (i.first >= osdmap.max_osd) {
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
1552 break;
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
1557 break;
1558 }
1559 }
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1616 }
1617 }
1618
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
1628 &nearfull_pool_ids);
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
1669 }
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1704 }
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
        // make sure FLAG_FULL is truly set, so we are safe not
        // to set an extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1748 }
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
        // make sure FLAG_FULL is truly set, so we are safe not
        // to set an extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1793 }
1794 }
1795
1796 // min_compat_client?
1797 if (!tmp.require_min_compat_client) {
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
1800 << "required " << mv << dendl;
1801 mon->clog->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv;
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
1816 }
1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1818 }
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
1826 }
1827 }
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
1911 }
1912
1913 // tell me about it
1914 for (auto i = pending_inc.new_state.begin();
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
1918 if (s & CEPH_OSD_UP) {
1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
1920 // Reset laggy parameters if failure interval exceeds a threshold.
1921 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1922 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1923 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1924 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1925 set_default_laggy_params(i->first);
1926 }
1927 }
1928 }
1929 if (s & CEPH_OSD_EXISTS)
1930 dout(2) << " osd." << i->first << " DNE" << dendl;
1931 }
1932 for (auto i = pending_inc.new_up_client.begin();
1933 i != pending_inc.new_up_client.end();
1934 ++i) {
1935 //FIXME: insert cluster addresses too
1936 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1937 }
1938 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1939 i != pending_inc.new_weight.end();
1940 ++i) {
1941 if (i->second == CEPH_OSD_OUT) {
1942 dout(2) << " osd." << i->first << " OUT" << dendl;
1943 } else if (i->second == CEPH_OSD_IN) {
1944 dout(2) << " osd." << i->first << " IN" << dendl;
1945 } else {
1946 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1947 }
1948 }
1949
1950 // features for osdmap and its incremental
1951 uint64_t features;
1952
1953 // encode full map and determine its crc
1954 OSDMap tmp;
1955 {
1956 tmp.deepish_copy_from(osdmap);
1957 tmp.apply_incremental(pending_inc);
1958
1959 // determine appropriate features
1960 features = tmp.get_encoding_features();
1961 dout(10) << __func__ << " encoding full map with "
1962 << tmp.require_osd_release
1963 << " features " << features << dendl;
1964
1965 // the features should be a subset of the mon quorum's features!
1966 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1967
1968 bufferlist fullbl;
1969 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1970 pending_inc.full_crc = tmp.get_crc();
1971
1972 // include full map in the txn. note that old monitors will
1973 // overwrite this. new ones will now skip the local full map
1974 // encode and reload from this.
1975 put_version_full(t, pending_inc.epoch, fullbl);
1976 }
1977
1978 // encode
1979 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1980 bufferlist bl;
1981 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1982
1983 dout(20) << " full_crc " << tmp.get_crc()
1984 << " inc_crc " << pending_inc.inc_crc << dendl;
1985
1986 /* put everything in the transaction */
1987 put_version(t, pending_inc.epoch, bl);
1988 put_last_committed(t, pending_inc.epoch);
1989
1990 // metadata, too!
1991 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1992 p != pending_metadata.end();
1993 ++p)
1994 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1995 for (set<int>::iterator p = pending_metadata_rm.begin();
1996 p != pending_metadata_rm.end();
1997 ++p)
1998 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1999 pending_metadata.clear();
2000 pending_metadata_rm.clear();
2001
2002 // purged_snaps
2003 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2004 !pending_inc.new_purged_snaps.empty()) {
2005 // all snaps purged this epoch (across all pools)
2006 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2007 bufferlist v;
2008 encode(pending_inc.new_purged_snaps, v);
2009 t->put(OSD_SNAP_PREFIX, k, v);
2010 }
2011 for (auto& i : pending_inc.new_purged_snaps) {
2012 for (auto q = i.second.begin();
2013 q != i.second.end();
2014 ++q) {
2015 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2016 pending_inc.epoch,
2017 t);
2018 }
2019 }
2020 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2021 for (auto snap : snaps) {
2022 insert_purged_snap_update(pool, snap, snap + 1,
2023 pending_inc.epoch,
2024 t);
2025 }
2026 }
2027
2028 // health
2029 health_check_map_t next;
2030 tmp.check_health(cct, &next);
2031 encode_health(next, t);
2032 }
2033
2034 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2035 {
2036 bufferlist bl;
2037 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2038 if (r < 0)
2039 return r;
2040 try {
2041 auto p = bl.cbegin();
2042 decode(m, p);
2043 }
2044 catch (buffer::error& e) {
2045 if (err)
2046 *err << "osd." << osd << " metadata is corrupt";
2047 return -EIO;
2048 }
2049 return 0;
2050 }
2051
2052 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2053 {
2054 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2055 if (osdmap.is_up(osd)) {
2056 map<string,string> meta;
2057 load_metadata(osd, meta, nullptr);
2058 auto p = meta.find(field);
2059 if (p == meta.end()) {
2060 (*out)["unknown"]++;
2061 } else {
2062 (*out)[p->second]++;
2063 }
2064 }
2065 }
2066 }
2067
2068 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2069 {
2070 map<string,int> by_val;
2071 count_metadata(field, &by_val);
2072 f->open_object_section(field.c_str());
2073 for (auto& p : by_val) {
2074 f->dump_int(p.first.c_str(), p.second);
2075 }
2076 f->close_section();
2077 }
2078
2079 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2080 {
2081 map<string, string> metadata;
2082 int r = load_metadata(osd, metadata, nullptr);
2083 if (r < 0)
2084 return r;
2085
2086 auto it = metadata.find("osd_objectstore");
2087 if (it == metadata.end())
2088 return -ENOENT;
2089 *type = it->second;
2090 return 0;
2091 }
2092
2093 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2094 const pg_pool_t &pool,
2095 ostream *err)
2096 {
2097 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2098 // since filestore osds could always join the pool later
2099 set<int> checked_osds;
2100 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2101 vector<int> up, acting;
2102 pg_t pgid(ps, pool_id);
2103 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2104 for (int osd : up) {
2105 if (checked_osds.find(osd) != checked_osds.end())
2106 continue;
2107 string objectstore_type;
2108 int r = get_osd_objectstore_type(osd, &objectstore_type);
2109 // allow with missing metadata, e.g. due to an osd never booting yet
2110 if (r < 0 || objectstore_type == "bluestore") {
2111 checked_osds.insert(osd);
2112 continue;
2113 }
2114 *err << "osd." << osd << " uses " << objectstore_type;
2115 return false;
2116 }
2117 }
2118 return true;
2119 }
2120
2121 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2122 {
2123 map<string,string> m;
2124 if (int r = load_metadata(osd, m, err))
2125 return r;
2126 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2127 f->dump_string(p->first.c_str(), p->second);
2128 return 0;
2129 }
2130
2131 void OSDMonitor::print_nodes(Formatter *f)
2132 {
2133 // group OSDs by their hosts
2134 map<string, list<int> > osds; // hostname => osd
2135 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2136 map<string, string> m;
2137 if (load_metadata(osd, m, NULL)) {
2138 continue;
2139 }
2140 map<string, string>::iterator hostname = m.find("hostname");
2141 if (hostname == m.end()) {
2142 // not likely though
2143 continue;
2144 }
2145 osds[hostname->second].push_back(osd);
2146 }
2147
2148 dump_services(f, osds, "osd");
2149 }
2150
2151 void OSDMonitor::share_map_with_random_osd()
2152 {
2153 if (osdmap.get_num_up_osds() == 0) {
2154 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2155 return;
2156 }
2157
2158 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
2159 if (!s) {
2160 dout(10) << __func__ << " no up osd on our session map" << dendl;
2161 return;
2162 }
2163
2164 dout(10) << "committed, telling random " << s->name
2165 << " all about it" << dendl;
2166
2167 // get feature of the peer
2168 // use quorum_con_features, if it's an anonymous connection.
2169 uint64_t features = s->con_features ? s->con_features :
2170 mon->get_quorum_con_features();
2171 // whatev, they'll request more if they need it
2172 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2173 s->con->send_message(m);
2174 // NOTE: do *not* record osd has up to this epoch (as we do
2175 // elsewhere) as they may still need to request older values.
2176 }
2177
// Compute the highest osdmap version we may safely trim the store to.
// Returns 0 whenever trimming must be blocked entirely.
version_t OSDMonitor::get_trim_to() const
{
  // no quorum, no authority to trim
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // do not trim while pgs are still being created; the creating-pgs
    // machinery still needs the older maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  // debug escape hatch: operator explicitly disabled trimming
  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
               " ('mon_debug_block_osdmap_trim' set to 'true')"
            << dendl;
    return 0;
  }

  {
    // floor: oldest epoch any in/up osd may still need (see
    // get_min_last_epoch_clean())
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // operator override (debug/repair): force trimming up to a fixed epoch,
    // but never past the last committed version
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs maps in the store
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only worth trimming if the floor is above what we already trimmed to
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2220
2221 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2222 {
2223 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2224 // also scan osd epochs
2225 // don't trim past the oldest reported osd epoch
2226 for (auto& osd_epoch : osd_epochs) {
2227 if (osd_epoch.second < floor &&
2228 osdmap.is_in(osd_epoch.first)) {
2229 floor = osd_epoch.second;
2230 }
2231 }
2232 return floor;
2233 }
2234
// Called while trimming up to version `first`: re-write the full map for
// `first` into the trim transaction so the store always retains a full map
// at its oldest committed version, and keep the prune manifest consistent
// with the new first-committed version.
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
                                   version_t first)
{
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if this trim goes past the first pinned map, the prune manifest must
  // drop the now-trimmed pins
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2248
2249
2250 /* full osdmap prune
2251 *
2252 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2253 */
2254
// Synchronize the in-memory osdmap prune manifest with the store:
// drop it if the store no longer has one, otherwise (re)load and decode it.
// Aborts if a manifest exists in the store but cannot be read.
void OSDMonitor::load_osdmap_manifest()
{
  bool store_has_manifest =
    mon->store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;  // nothing on disk, nothing in memory — nothing to do
    }

    // manifest was removed from the store (e.g. fully trimmed); forget it
    dout(20) << __func__
             << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
           << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // the existence check above succeeded, so a read failure here means
    // the store is in a bad state; don't limp along
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
           << osdmap_manifest.get_first_pinned()
           << " .. "
           << osdmap_manifest.get_last_pinned()
           << ")"
           << dendl;
}
2291
// Decide whether a full-osdmap prune pass should run now, based on how many
// committed epochs we hold versus the configured prune thresholds and how
// far the manifest has already pinned.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // never prune into the most recent min_osdmap_epochs maps
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // an earlier prune already pinned up to (or past) the prunable range
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // less than one full prune interval is available past the last pin
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2351
2352 void OSDMonitor::_prune_update_trimmed(
2353 MonitorDBStore::TransactionRef tx,
2354 version_t first)
2355 {
2356 dout(10) << __func__
2357 << " first " << first
2358 << " last_pinned " << osdmap_manifest.get_last_pinned()
2359 << " last_pinned " << osdmap_manifest.get_last_pinned()
2360 << dendl;
2361
2362 osdmap_manifest_t manifest = osdmap_manifest;
2363
2364 if (!manifest.is_pinned(first)) {
2365 manifest.pin(first);
2366 }
2367
2368 set<version_t>::iterator p_end = manifest.pinned.find(first);
2369 set<version_t>::iterator p = manifest.pinned.begin();
2370 manifest.pinned.erase(p, p_end);
2371 ceph_assert(manifest.get_first_pinned() == first);
2372
2373 if (manifest.get_last_pinned() == first+1 ||
2374 manifest.pinned.size() == 1) {
2375 // we reached the end of the line, as pinned maps go; clean up our
2376 // manifest, and let `should_prune()` decide whether we should prune
2377 // again.
2378 tx->erase(get_service_name(), "osdmap_manifest");
2379 return;
2380 }
2381
2382 bufferlist bl;
2383 manifest.encode(bl);
2384 tx->put(get_service_name(), "osdmap_manifest", bl);
2385 }
2386
// Initialize a prune pass: validate the in-memory/on-disk manifest
// invariants and seed `manifest` with the first pin — either the first
// committed version (fresh prune) or the last previously pinned version
// (resuming an earlier prune).
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
             << " first_pinned " << osdmap_manifest.get_first_pinned()
             << " last_pinned " << osdmap_manifest.get_last_pinned()
             << dendl;

    // resume where the previous prune stopped
    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2421
2422 bool OSDMonitor::_prune_sanitize_options() const
2423 {
2424 uint64_t prune_interval =
2425 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2426 uint64_t prune_min =
2427 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2428 uint64_t txsize =
2429 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2430
2431 bool r = true;
2432
2433 if (prune_interval == 0) {
2434 derr << __func__
2435 << " prune is enabled BUT prune interval is zero; abort."
2436 << dendl;
2437 r = false;
2438 } else if (prune_interval == 1) {
2439 derr << __func__
2440 << " prune interval is equal to one, which essentially means"
2441 " no pruning; abort."
2442 << dendl;
2443 r = false;
2444 }
2445 if (prune_min == 0) {
2446 derr << __func__
2447 << " prune is enabled BUT prune min is zero; abort."
2448 << dendl;
2449 r = false;
2450 }
2451 if (prune_interval > prune_min) {
2452 derr << __func__
2453 << " impossible to ascertain proper prune interval because"
2454 << " it is greater than the minimum prune epochs"
2455 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2456 << dendl;
2457 r = false;
2458 }
2459
2460 if (txsize < prune_interval - 1) {
2461 derr << __func__
2462 << "'mon_osdmap_full_prune_txsize' (" << txsize
2463 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2464 << "); abort." << dendl;
2465 r = false;
2466 }
2467 return r;
2468 }
2469
2470 bool OSDMonitor::is_prune_enabled() const {
2471 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2472 }
2473
2474 bool OSDMonitor::is_prune_supported() const {
2475 return mon->get_required_mon_features().contains_any(
2476 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2477 }
2478
/** do_prune
 *
 * Run one pass of full-osdmap pruning: pin maps every `prune_interval`
 * epochs and erase the full maps in between, up to `txsize` removals,
 * then persist the updated manifest in `tx`.
 *
 * See doc/dev/mon-osdmap-prune.rst for the overall design.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
          << ( enabled ? "enabled" : "disabled")
          << dendl;

  // bail out unless pruning is on, options are sane, and thresholds are met
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  // work on a copy; osdmap_manifest itself is only updated via the store
  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
          << " lc (" << first << " .. " << last << ")"
          << " last_pinned " << last_pinned
          << " interval " << prune_interval
          << " last_to_pin " << last_to_pin
          << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon->store->combine_strings("full", v);
    return mon->store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
            << " setting txsize to removal interval size ("
            << removal_interval << " versions"
            << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    // not enough room past the last pin for one more full interval
    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
             << " last_pinned " << last_pinned
             << " next_pinned " << next_pinned
             << " num_pruned " << num_pruned
             << " removal interval (" << (last_pinned+1)
             << ".." << (next_pinned-1) << ")"
             << " txsize " << txsize << dendl;

    // both interval endpoints must still exist as full maps in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon->store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval was available
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2607
2608
2609 // -------------
2610
// Paxos read-path entry point: handle read-only requests (and requests
// that turn out to need no update) directly.  Returns true if the message
// was fully handled here; false to forward it to prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply -EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates: each preprocess_* may answer stale/duplicate requests
    // without going through a paxos proposal
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // dispatcher should never route other message types here
    ceph_abort();
    return true;
  }
}
2666
// Paxos write-path entry point: apply state-changing requests to
// pending_inc.  Returns true if the change should be proposed; messages
// reach here only after preprocess_query() returned false.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply -EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // dispatcher should never route other message types here
    ceph_abort();
  }

  return false;
}
2718
2719 bool OSDMonitor::should_propose(double& delay)
2720 {
2721 dout(10) << "should_propose" << dendl;
2722
2723 // if full map, propose immediately! any subsequent changes will be clobbered.
2724 if (pending_inc.fullmap.length())
2725 return true;
2726
2727 // adjust osd weights?
2728 if (!osd_weight.empty() &&
2729 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2730 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2731 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2732 delay = 0.0;
2733 osd_weight.clear();
2734 return true;
2735 }
2736
2737 return PaxosService::should_propose(delay);
2738 }
2739
2740
2741
2742 // ---------------------------
2743 // READs
2744
// Answer an MMonGetOSDMap: reply with the requested full and incremental
// maps, bounded by both a message-count and a byte budget so a single
// reply cannot grow unbounded.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode maps with the peer's features; fall back to quorum features
  // for sessions without recorded connection features
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // shared budgets across both loops: stop at `max` maps or `max_bytes`
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps for the requested range, clamped to what we hold
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // incremental maps, same clamping and budgets
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the peer our overall range so it can request what's missing
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2781
2782
2783 // ---------------------------
2784 // UPDATEs
2785
2786 // failure --
2787
2788 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2789 // check permissions
2790 MonSession *session = op->get_session();
2791 if (!session)
2792 return true;
2793 if (!session->is_capable("osd", MON_CAP_X)) {
2794 dout(0) << "got MOSDFailure from entity with insufficient caps "
2795 << session->caps << dendl;
2796 return true;
2797 }
2798 if (fsid != mon->monmap->fsid) {
2799 dout(0) << "check_source: on fsid " << fsid
2800 << " != " << mon->monmap->fsid << dendl;
2801 return true;
2802 }
2803 return false;
2804 }
2805
2806
// Vet an MOSDFailure report.  Returns true when the message is handled
// here (invalid, stale, or duplicate — possibly after sending the reporter
// newer maps); returns false when the failure is new and must go through
// prepare_failure().
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    // the reporter must exist, match the address we have on file, and be
    // up (unless it is reporting a failure recovery rather than a failure)
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      // bring the stale reporter up to date
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird? target already down — nothing to do
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // the reported address must match the map's, else the report refers to
  // an older incarnation of this osd id
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported? the osd came back up after the reporter's epoch
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // honour nodown and similar restrictions
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2878
// Completion context used for MOSDMarkMeDown requests that asked for an
// ack: once the proposal commits, echo the mark-me-down message back to
// the osd as the acknowledgement.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // committed: reply with an equivalent MOSDMarkMeDown carrying
      // request_ack=false so the ack itself is not acked in turn
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false)); // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // proposal did not go through (e.g. interrupted); retry the op
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2907
// Vet an MOSDMarkMeDown request.  Returns true when the request is handled
// here (invalid source, stale, or blocked by nodown) — acking immediately
// if an ack was requested; returns false to proceed to
// prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the osd must exist, be up, and match the address we have on file
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    // bring the stale sender up to date
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even though we're dropping the request, honour the ack so the osd
  // does not wait forever
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2946
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  // Queue the self-requested down state in the pending increment.
  // Returns true so a proposal is triggered; the ack (if requested) is
  // sent by C_AckMarkedDown once the proposal commits.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_down() guarantees these still hold
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  // new_state bits are XORed into the osd's state (see prepare_boot), so
  // CEPH_OSD_UP here flips the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
2962
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  // Read-only vetting of an osd's "I am dead" declaration.  Returns true
  // if handled (dropped) here, false to pass on to prepare_mark_me_dead().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon->no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon->no_reply(op);
    return true;
  }

  // an osd may only declare itself dead if the map already shows it down;
  // otherwise bring the sender up to date and drop the request
  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    dout(5) << __func__ << " from nonexistent or up osd." << from
	    << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    mon->no_reply(op);
    return true;
  }

  return false;
}
2992
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  // Record the osd's self-declared dead_epoch in the pending xinfo.
  // Returns true to trigger a proposal; on success no reply is sent.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() only lets through osds that are down
  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
		    << m->get_epoch();
  // copy-on-write the xinfo entry into the pending increment before editing
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon->no_reply(op); // ignore on success
	}
      }
    ));
  return true;
}
3018
3019 bool OSDMonitor::can_mark_down(int i)
3020 {
3021 if (osdmap.is_nodown(i)) {
3022 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3023 << "will not mark it down" << dendl;
3024 return false;
3025 }
3026
3027 int num_osds = osdmap.get_num_osds();
3028 if (num_osds == 0) {
3029 dout(5) << __func__ << " no osds" << dendl;
3030 return false;
3031 }
3032 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3033 float up_ratio = (float)up / (float)num_osds;
3034 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3035 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3036 << g_conf()->mon_osd_min_up_ratio
3037 << ", will not mark osd." << i << " down" << dendl;
3038 return false;
3039 }
3040 return true;
3041 }
3042
3043 bool OSDMonitor::can_mark_up(int i)
3044 {
3045 if (osdmap.is_noup(i)) {
3046 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3047 << "will not mark it up" << dendl;
3048 return false;
3049 }
3050
3051 return true;
3052 }
3053
3054 /**
3055 * @note the parameter @p i apparently only exists here so we can output the
3056 * osd's id on messages.
3057 */
3058 bool OSDMonitor::can_mark_out(int i)
3059 {
3060 if (osdmap.is_noout(i)) {
3061 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3062 << "will not mark it out" << dendl;
3063 return false;
3064 }
3065
3066 int num_osds = osdmap.get_num_osds();
3067 if (num_osds == 0) {
3068 dout(5) << __func__ << " no osds" << dendl;
3069 return false;
3070 }
3071 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3072 float in_ratio = (float)in / (float)num_osds;
3073 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3074 if (i >= 0)
3075 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3076 << g_conf()->mon_osd_min_in_ratio
3077 << ", will not mark osd." << i << " out" << dendl;
3078 else
3079 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3080 << g_conf()->mon_osd_min_in_ratio
3081 << ", will not mark osds out" << dendl;
3082 return false;
3083 }
3084
3085 return true;
3086 }
3087
3088 bool OSDMonitor::can_mark_in(int i)
3089 {
3090 if (osdmap.is_noin(i)) {
3091 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3092 << "will not mark it in" << dendl;
3093 return false;
3094 }
3095
3096 return true;
3097 }
3098
3099 bool OSDMonitor::check_failures(utime_t now)
3100 {
3101 bool found_failure = false;
3102 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3103 p != failure_info.end();
3104 ++p) {
3105 if (can_mark_down(p->first)) {
3106 found_failure |= check_failure(now, p->first, p->second);
3107 }
3108 }
3109 return found_failure;
3110 }
3111
3112 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3113 {
3114 // already pending failure?
3115 if (pending_inc.new_state.count(target_osd) &&
3116 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3117 dout(10) << " already pending failure" << dendl;
3118 return true;
3119 }
3120
3121 set<string> reporters_by_subtree;
3122 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3123 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3124 utime_t max_failed_since = fi.get_failed_since();
3125 utime_t failed_for = now - max_failed_since;
3126
3127 utime_t grace = orig_grace;
3128 double my_grace = 0, peer_grace = 0;
3129 double decay_k = 0;
3130 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3131 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3132 decay_k = ::log(.5) / halflife;
3133
3134 // scale grace period based on historical probability of 'lagginess'
3135 // (false positive failures due to slowness).
3136 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3137 double decay = exp((double)failed_for * decay_k);
3138 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3139 << " failed_for " << failed_for << " decay " << decay << dendl;
3140 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3141 grace += my_grace;
3142 }
3143
3144 // consider the peers reporting a failure a proxy for a potential
3145 // 'subcluster' over the overall cluster that is similarly
3146 // laggy. this is clearly not true in all cases, but will sometimes
3147 // help us localize the grace correction to a subset of the system
3148 // (say, a rack with a bad switch) that is unhappy.
3149 ceph_assert(fi.reporters.size());
3150 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3151 // get the parent bucket whose type matches with "reporter_subtree_level".
3152 // fall back to OSD if the level doesn't exist.
3153 if (osdmap.exists(p->first)) {
3154 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3155 if (auto iter = reporter_loc.find(reporter_subtree_level);
3156 iter == reporter_loc.end()) {
3157 reporters_by_subtree.insert("osd." + to_string(p->first));
3158 } else {
3159 reporters_by_subtree.insert(iter->second);
3160 }
3161 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3162 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3163 utime_t elapsed = now - xi.down_stamp;
3164 double decay = exp((double)elapsed * decay_k);
3165 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3166 }
3167 ++p;
3168 } else {
3169 fi.cancel_report(p->first);;
3170 p = fi.reporters.erase(p);
3171 }
3172 }
3173
3174 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3175 peer_grace /= (double)fi.reporters.size();
3176 grace += peer_grace;
3177 }
3178
3179 dout(10) << " osd." << target_osd << " has "
3180 << fi.reporters.size() << " reporters, "
3181 << grace << " grace (" << orig_grace << " + " << my_grace
3182 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3183 << dendl;
3184
3185 if (failed_for >= grace &&
3186 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3187 dout(1) << " we have enough reporters to mark osd." << target_osd
3188 << " down" << dendl;
3189 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3190
3191 mon->clog->info() << "osd." << target_osd << " failed ("
3192 << osdmap.crush->get_full_location_ordered_string(
3193 target_osd)
3194 << ") ("
3195 << (int)reporters_by_subtree.size()
3196 << " reporters from different "
3197 << reporter_subtree_level << " after "
3198 << failed_for << " >= grace " << grace << ")";
3199 return true;
3200 }
3201 return false;
3202 }
3203
void OSDMonitor::force_failure(int target_osd, int by)
{
  // Unconditionally queue target_osd as failed (used for "immediate"
  // failure reports such as connection refused), bypassing the grace and
  // reporter-count heuristics in check_failure().  'by' is the reporting
  // osd, used only for the cluster log message.
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // new_state bits are XORed into the osd state, so CEPH_OSD_UP marks down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // copy-on-write the xinfo entry so we can record the dead epoch
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon->clog->info() << "osd." << target_osd << " failed ("
		    << osdmap.crush->get_full_location_ordered_string(target_osd)
		    << ") (connection refused reported by osd." << by << ")";
  return;
}
3225
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  // Handle a failure report (or its cancellation) from a peer osd.
  // Returns true if the pending map was changed and a proposal is needed.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocessing guarantees the target is still the live instance
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // add_report returns any previous op from the same reporter, which we
    // must release so it does not leak in the op tracker
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3292
3293 void OSDMonitor::process_failures()
3294 {
3295 map<int,failure_info_t>::iterator p = failure_info.begin();
3296 while (p != failure_info.end()) {
3297 if (osdmap.is_up(p->first)) {
3298 ++p;
3299 } else {
3300 dout(10) << "process_failures osd." << p->first << dendl;
3301 list<MonOpRequestRef> ls;
3302 p->second.take_report_messages(ls);
3303 failure_info.erase(p++);
3304
3305 while (!ls.empty()) {
3306 MonOpRequestRef o = ls.front();
3307 if (o) {
3308 o->mark_event(__func__);
3309 MOSDFailure *m = o->get_req<MOSDFailure>();
3310 send_latest(o, m->get_epoch());
3311 mon->no_reply(o);
3312 }
3313 ls.pop_front();
3314 }
3315 }
3316 }
3317 }
3318
3319 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3320 {
3321 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3322
3323 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3324 p != failure_info.end();
3325 ++p) {
3326 p->second.take_report_messages(ls);
3327 }
3328 failure_info.clear();
3329 }
3330
3331 int OSDMonitor::get_grace_interval_threshold()
3332 {
3333 int halflife = g_conf()->mon_osd_laggy_halflife;
3334 // Scale the halflife period (default: 1_hr) by
3335 // a factor (48) to calculate the threshold.
3336 int grace_threshold_factor = 48;
3337 return halflife * grace_threshold_factor;
3338 }
3339
3340 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3341 {
3342 int grace_interval_threshold_secs = get_grace_interval_threshold();
3343 if (last_failed_interval > grace_interval_threshold_secs) {
3344 dout(1) << " last_failed_interval " << last_failed_interval
3345 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3346 << dendl;
3347 return true;
3348 }
3349 return false;
3350 }
3351
3352 void OSDMonitor::set_default_laggy_params(int target_osd)
3353 {
3354 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3355 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3356 }
3357 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3358 xi.down_stamp = pending_inc.modified;
3359 xi.laggy_probability = 0.0;
3360 xi.laggy_interval = 0;
3361 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3362 }
3363
3364
3365 // boot --
3366
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  // Read-only vetting of an osd boot message.  Returns true if the boot
  // was fully handled here (ignored, or answered as a duplicate); false
  // to forward it to prepare_boot().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // an existing osd slot with a different uuid cannot be reused
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up interval?
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3489
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  // Apply a vetted osd boot to the pending map: mark a stale prior
  // instance down first if needed, otherwise record the new instance's
  // addrs, uuid, weight, metadata, clean interval and laggy stats.
  // Always returns true (a proposal is needed, or we wait on one).
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once pending changes apply (new_state bits are XORed in)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot after the down state commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: decay them on a clean boot (boot_epoch == 0),
    // otherwise fold in the observed down interval
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval =  g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight it had before it was auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3644
void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
{
  // Finish a boot: optionally log it to the cluster log, then send the
  // osd the maps from its current epoch onward.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << "_booted " << m->get_orig_source_inst()
	  << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit) {
    mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
		      << " boot";
  }

  send_latest(op, m->sb.current_epoch+1);
}
3659
3660
3661 // -------------
3662 // full
3663
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  // Read-only vetting of an osd's full/backfillfull/nearfull state report.
  // Returns true if handled here (ignored or already in sync), false to
  // forward to prepare_full().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these bits may be toggled by MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // if the committed map already reflects the requested state, just reply
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3714
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  // Queue the osd's requested full/backfillfull/nearfull bits in the
  // pending map and reply once the proposal commits.  Always returns true.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // compute the effective state after pending changes (new_state is XORed)
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    // clear any pending toggles of these bits, then set exactly the XOR
    // needed to go from the committed state to the wanted state
    if (p != pending_inc.new_state.end()) {
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3752
3753 // -------------
3754 // alive
3755
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  // Read-only vetting of an osd's up_thru (alive) request.  Returns true
  // if handled here (ignored or already satisfied), false to forward it
  // to prepare_alive().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // the sender must still be the live instance of that osd
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // up_thru already at or past the wanted epoch? just reply with the map
  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3794
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  // Queue an up_thru update for the osd and reply with the map once the
  // proposal commits.  Always returns true.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  if (0) {  // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version); // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3812
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  // Answer op by sending the requester the maps from epoch e onward.
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3821
3822 // pg_created
// pg_created
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  // Read-only vetting of a "pg created" notification.  Returns true if
  // dropped here (no session / insufficient caps); false so the message
  // is forwarded to the leader's prepare_pg_created().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  mon->no_reply(op);  // never replied to, even when accepted
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3842
bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
{
  // Record that a pg finished creating.  Returns true (proposal needed)
  // after queueing the pgid; false if the sender is not a live osd.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto src = m->get_orig_source();
  auto from = src.num();
  // only accept from the currently-up instance of an osd
  if (!src.is_osd() ||
      !mon->osdmon()->osdmap.is_up(from) ||
      !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
    return false;
  }
  pending_created_pgs.push_back(m->pgid);
  return true;
}
3860
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  // Read-only vetting of a "pg ready to merge" notification.  Returns
  // true if dropped here (bad caps, unknown pool, or the pgid no longer
  // matches the pool's pg_num/pg_num_pending state); false to forward it
  // to prepare_pg_ready_to_merge().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge source must be exactly the last pg of the pool
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // and a merge for it must actually be pending
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3900
// Apply an MOSDPGReadyToMerge to the pending pool state: either commit the
// pg_num decrement (merge) or back the attempt off. Always returns true
// (a pending change or a deferred retry is queued).
3901 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
3902 {
3903 op->mark_osdmon_event(__func__);
3904 auto m = op->get_req<MOSDPGReadyToMerge>();
3905 dout(10) << __func__ << " " << *m << dendl;
3906 pg_pool_t p;
// Start from the pending pool if one is already queued this epoch, else the
// committed pool, so we re-validate against the most recent view.
3907 if (pending_inc.new_pools.count(m->pgid.pool()))
3908 p = pending_inc.new_pools[m->pgid.pool()];
3909 else
3910 p = *osdmap.get_pg_pool(m->pgid.pool());
// The preprocess checks ran against the committed map; a concurrent pending
// change may have invalidated them — retry after the current proposal lands.
3911 if (p.get_pg_num() != m->pgid.ps() + 1 ||
3912 p.get_pg_num_pending() > m->pgid.ps()) {
3913 dout(10) << __func__
3914 << " race with concurrent pg_num[_pending] update, will retry"
3915 << dendl;
3916 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
3917 return true;
3918 }
3919
3920 if (m->ready) {
3921 p.dec_pg_num(m->pgid,
3922 pending_inc.epoch,
3923 m->source_version,
3924 m->target_version,
3925 m->last_epoch_started,
3926 m->last_epoch_clean);
3927 p.last_change = pending_inc.epoch;
3928 } else {
3929 // back off the merge attempt!
3930 p.set_pg_num_pending(p.get_pg_num());
3931 }
3932
3933 // force pre-nautilus clients to resend their ops, since they
3934 // don't understand pg_num_pending changes form a new interval
3935 p.last_force_op_resend_prenautilus = pending_inc.epoch;
3936
3937 pending_inc.new_pools[m->pgid.pool()] = p;
3938
// Test hook: with configured probability, bounce pg_num back up via a
// self-directed "osd pool set pg_num_actual" command to exercise merge races.
3939 auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
3940 if (m->ready &&
3941 prob > 0 &&
3942 prob > (double)(rand() % 1000)/1000.0) {
3943 derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
3944 auto n = new MMonCommand(mon->monmap->get_fsid());
3945 n->set_connection(m->get_connection());
3946 n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3947 osdmap.get_pool_name(m->pgid.pool()) +
3948 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3949 stringify(m->pgid.ps() + 1) + "\"}" };
3950 MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
3951 nop->set_type_service();
3952 wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
3953 } else {
3954 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3955 }
3956 return true;
3957 }
3958
3959
3960 // -------------
3961 // pg_temp changes
3962
// Fast-path filter for MOSDPGTemp: drop messages from unauthorized or
// down/stale senders, and reply immediately when every requested pg_temp
// entry is either already in effect or obsolete. Returns false when at
// least one entry needs a map change (handled by prepare_pgtemp).
3963 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
3964 {
3965 auto m = op->get_req<MOSDPGTemp>();
3966 dout(10) << "preprocess_pgtemp " << *m << dendl;
3967 mempool::osdmap::vector<int> empty;
3968 int from = m->get_orig_source().num();
3969 size_t ignore_cnt = 0;
3970
3971 // check caps
3972 MonSession *session = op->get_session();
3973 if (!session)
3974 goto ignore;
3975 if (!session->is_capable("osd", MON_CAP_X)) {
3976 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3977 << session->caps << dendl;
3978 goto ignore;
3979 }
3980
// Sender must be an up OSD at the address the osdmap knows about.
3981 if (!osdmap.is_up(from) ||
3982 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3983 dout(7) << "ignoring pgtemp message from down "
3984 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3985 << dendl;
3986 goto ignore;
3987 }
3988
// Forced updates bypass the no-op filtering below and always go to prepare.
3989 if (m->forced) {
3990 return false;
3991 }
3992
3993 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
3994 dout(20) << " " << p->first
3995 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
3996 << " -> " << p->second << dendl;
3997
3998 // does the pool exist?
3999 if (!osdmap.have_pg_pool(p->first.pool())) {
4000 /*
4001 * 1. If the osdmap does not have the pool, it means the pool has been
4002 * removed in-between the osd sending this message and us handling it.
4003 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4004 * not exist in the pending either, as the osds would not send a
4005 * message about a pool they know nothing about (yet).
4006 * 3. However, if the pool does exist in the pending, then it must be a
4007 * new pool, and not relevant to this message (see 1).
4008 */
4009 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4010 << ": pool has been removed" << dendl;
4011 ignore_cnt++;
4012 continue;
4013 }
4014
4015 int acting_primary = -1;
4016 osdmap.pg_to_up_acting_osds(
4017 p->first, nullptr, nullptr, nullptr, &acting_primary);
4018 if (acting_primary != from) {
4019 /* If the source isn't the primary based on the current osdmap, we know
4020 * that the interval changed and that we can discard this message.
4021 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4022 * which of two pg temp mappings on the same pg is more recent.
4023 */
4024 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4025 << ": primary has changed" << dendl;
4026 ignore_cnt++;
4027 continue;
4028 }
4029
4030 // removal?
4031 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
4032 osdmap.primary_temp->count(p->first)))
4033 return false;
4034 // change?
4035 // NOTE: we assume that this will clear pg_primary, so consider
4036 // an existing pg_primary field to imply a change
4037 if (p->second.size() &&
4038 (osdmap.pg_temp->count(p->first) == 0 ||
4039 osdmap.pg_temp->get(p->first) != p->second ||
4040 osdmap.primary_temp->count(p->first)))
4041 return false;
4042 }
4043
4044 // should we ignore all the pgs?
4045 if (ignore_cnt == m->pg_temp.size())
4046 goto ignore;
4047
// Nothing to change; ack with the current map epoch so the OSD stops asking.
4048 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
4049 _reply_map(op, m->map_epoch);
4050 return true;
4051
4052 ignore:
4053 mon->no_reply(op);
4054 return true;
4055 }
4056
// Queue a pending up_thru bump for osd.<from> if 'up_thru' is newer than both
// the committed value and any already-pending value, so the osd doesn't have
// to request it separately.
4057 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4058 {
4059 epoch_t old_up_thru = osdmap.get_up_thru(from);
// A pending update supersedes the committed map's value.
4060 auto ut = pending_inc.new_up_thru.find(from);
4061 if (ut != pending_inc.new_up_thru.end()) {
4062 old_up_thru = ut->second;
4063 }
4064 if (up_thru > old_up_thru) {
4065 // set up_thru too, so the osd doesn't have to ask again
4066 pending_inc.new_up_thru[from] = up_thru;
4067 }
4068 }
4069
// Apply an MOSDPGTemp to the pending incremental: record the requested
// pg_temp mappings (skipping pools that are gone or going away), clear any
// primary_temp for those pgs, and bump the sender's up_thru. Replies with
// the map once the proposal commits.
4070 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4071 {
4072 op->mark_osdmon_event(__func__);
4073 auto m = op->get_req<MOSDPGTemp>();
4074 int from = m->get_orig_source().num();
4075 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4076 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4077 uint64_t pool = p->first.pool();
// Skip pools queued for deletion in this same pending increment.
4078 if (pending_inc.old_pools.count(pool)) {
4079 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4080 << ": pool pending removal" << dendl;
4081 continue;
4082 }
4083 if (!osdmap.have_pg_pool(pool)) {
4084 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4085 << ": pool has been removed" << dendl;
4086 continue;
4087 }
4088 pending_inc.new_pg_temp[p->first] =
4089 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4090
4091 // unconditionally clear pg_primary (until this message can encode
4092 // a change for that, too.. at which point we need to also fix
4093 // preprocess_pg_temp)
4094 if (osdmap.primary_temp->count(p->first) ||
4095 pending_inc.new_primary_temp.count(p->first))
4096 pending_inc.new_primary_temp[p->first] = -1;
4097 }
4098
4099 // set up_thru too, so the osd doesn't have to ask again
4100 update_up_thru(from, m->map_epoch);
4101
4102 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4103 return true;
4104 }
4105
4106
4107 // ---
4108
// Fast-path filter for MRemoveSnaps: check caps, and if every listed snap is
// already removed (or the pool is gone), handle the message here — replying
// only to octopus+ peers, which expect an MRemoveSnaps echo. Returns false
// when some snap still needs to be queued for removal (prepare path).
4109 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
4110 {
4111 op->mark_osdmon_event(__func__);
4112 auto m = op->get_req<MRemoveSnaps>();
4113 dout(7) << "preprocess_remove_snaps " << *m << dendl;
4114
4115 // check privilege, ignore if failed
4116 MonSession *session = op->get_session();
// Default to no reply; an explicit reply is sent below only for octopus+.
4117 mon->no_reply(op);
4118 if (!session)
4119 goto ignore;
4120 if (!session->caps.is_capable(
4121 cct,
4122 session->entity_name,
4123 "osd", "osd pool rmsnap", {}, true, true, false,
4124 session->get_peer_socket_addr())) {
4125 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4126 << session->caps << dendl;
4127 goto ignore;
4128 }
4129
4130 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
4131 q != m->snaps.end();
4132 ++q) {
4133 if (!osdmap.have_pg_pool(q->first)) {
4134 dout(10) << " ignoring removed_snaps " << q->second
4135 << " on non-existent pool " << q->first << dendl;
4136 continue;
4137 }
4138 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
4139 for (vector<snapid_t>::iterator p = q->second.begin();
4140 p != q->second.end();
4141 ++p) {
// A snap beyond snap_seq or not yet marked removed needs a real update.
4142 if (*p > pi->get_snap_seq() ||
4143 !_is_removed_snap(q->first, *p)) {
4144 return false;
4145 }
4146 }
4147 }
4148
// Everything already removed: octopus+ senders still expect an ack.
4149 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4150 auto reply = make_message<MRemoveSnaps>();
4151 reply->snaps = m->snaps;
4152 mon->send_reply(op, reply.detach());
4153 }
4154
4155 ignore:
4156 return true;
4157 }
4158
// Apply an MRemoveSnaps to the pending incremental: queue each not-yet-removed
// snap for deletion (pre-octopus maps also track it in the pool's
// removed_snaps set), advance snap_seq/snap_epoch, and ack octopus+ senders
// once the proposal commits.
4159 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
4160 {
4161 op->mark_osdmon_event(__func__);
4162 auto m = op->get_req<MRemoveSnaps>();
4163 dout(7) << "prepare_remove_snaps " << *m << dendl;
4164
4165 for (auto& [pool, snaps] : m->snaps) {
4166 if (!osdmap.have_pg_pool(pool)) {
4167 dout(10) << " ignoring removed_snaps " << snaps
4168 << " on non-existent pool " << pool << dendl;
4169 continue;
4170 }
4171
4172 pg_pool_t& pi = osdmap.pools[pool];
4173 for (auto s : snaps) {
// Skip snaps already removed in the committed map OR already queued in the
// pending pool/removed-snaps state, so each snap is processed exactly once.
4174 if (!_is_removed_snap(pool, s) &&
4175 (!pending_inc.new_pools.count(pool) ||
4176 !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
4177 (!pending_inc.new_removed_snaps.count(pool) ||
4178 !pending_inc.new_removed_snaps[pool].contains(s))) {
4179 pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
// Pre-octopus OSDs consume the per-pool removed_snaps interval set.
4180 if (osdmap.require_osd_release < ceph_release_t::octopus) {
4181 newpi->removed_snaps.insert(s);
4182 dout(10) << " pool " << pool << " removed_snaps added " << s
4183 << " (now " << newpi->removed_snaps << ")" << dendl;
4184 }
4185 newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
4186 if (s > newpi->get_snap_seq()) {
4187 dout(10) << " pool " << pool << " snap_seq "
4188 << newpi->get_snap_seq() << " -> " << s << dendl;
4189 newpi->set_snap_seq(s);
4190 }
4191 newpi->set_snap_epoch(pending_inc.epoch);
4192 dout(10) << " added pool " << pool << " snap " << s
4193 << " to removed_snaps queue" << dendl;
4194 pending_inc.new_removed_snaps[pool].insert(s);
4195 }
4196 }
4197 }
4198
// Octopus+ peers expect an MRemoveSnaps echo after the map change commits.
4199 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4200 auto reply = make_message<MRemoveSnaps>();
4201 reply->snaps = m->snaps;
4202 wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
4203 }
4204
4205 return true;
4206 }
4207
// Answer an MMonGetPurgedSnaps query directly from the mon store: scan the
// "purged_epoch_<hex>" keys in (m->start, m->last] and return the per-epoch
// purged-snap interval sets. Read-only; always fully handled here.
4208 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4209 {
4210 op->mark_osdmon_event(__func__);
4211 auto m = op->get_req<MMonGetPurgedSnaps>();
4212 dout(7) << __func__ << " " << *m << dendl;
4213
4214 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4215
4216 string k = make_purged_snap_epoch_key(m->start);
4217 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4218 it->upper_bound(k);
// Tracks the highest epoch actually returned; reported back in the reply.
4219 unsigned long epoch = m->last;
4220 while (it->valid()) {
// Stop once we leave the purged_epoch_* key range.
4221 if (it->key().find("purged_epoch_") != 0) {
4222 break;
4223 }
// NOTE(review): this 'k' shadows the outer 'k' above — harmless but
// confusing; the outer key is only needed for the initial upper_bound.
4224 string k = it->key();
4225 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4226 if (n != 1) {
4227 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4228 } else if (epoch > m->last) {
4229 break;
4230 } else {
4231 bufferlist bl = it->value();
4232 auto p = bl.cbegin();
4233 auto &v = r[epoch];
4234 try {
4235 ceph::decode(v, p);
4236 } catch (buffer::error& e) {
4237 derr << __func__ << " unable to parse value for key '" << it->key()
4238 << "': \n";
4239 bl.hexdump(*_dout);
4240 *_dout << dendl;
4241 }
// Reuse sscanf's return as a rough size estimate for this entry.
4242 n += 4 + v.size() * 16;
4243 }
// NOTE(review): 'n' is re-initialized each iteration, so this ~1 MiB cap
// bounds a single entry's estimated size, not the cumulative reply size —
// confirm whether a cumulative limit was intended.
4244 if (n > 1048576) {
4245 // impose a semi-arbitrary limit to message size
4246 break;
4247 }
4248 it->next();
4249 }
4250
4251 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4252 reply->purged_snaps.swap(r);
4253 mon->send_reply(op, reply.detach());
4254
4255 return true;
4256 }
4257
4258 // osd beacon
// Fast-path check for OSD beacons: drop unauthenticated/unprivileged senders,
// otherwise always forward to the leader (return false) so it can track
// beacon liveness.
4259 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4260 {
4261 op->mark_osdmon_event(__func__);
4262 // check caps
4263 auto session = op->get_session();
// Beacons never get a direct reply.
4264 mon->no_reply(op);
4265 if (!session) {
4266 dout(10) << __func__ << " no monitor session!" << dendl;
4267 return true;
4268 }
4269 if (!session->is_capable("osd", MON_CAP_X)) {
4270 derr << __func__ << " received from entity "
4271 << "with insufficient privileges " << session->caps << dendl;
4272 return true;
4273 }
4274 // Always forward the beacon to the leader, even if they are the same as
4275 // the old one. The leader will mark as down osds that haven't sent
4276 // beacon for a few minutes.
4277 return false;
4278 }
4279
// Process an OSD beacon on the leader: refresh the sender's last-report time
// and epoch, feed per-pg last-epoch-clean data, and (only if the beacon
// advances last_purged_snaps_scrub) queue an osd_xinfo update. Returns true
// only when pending state changed.
4280 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
4281 {
4282 op->mark_osdmon_event(__func__);
4283 const auto beacon = op->get_req<MOSDBeacon>();
4284 const auto src = beacon->get_orig_source();
4285 dout(10) << __func__ << " " << *beacon
4286 << " from " << src << dendl;
4287 int from = src.num();
4288
// Only accept beacons from up OSDs at their registered address.
4289 if (!src.is_osd() ||
4290 !osdmap.is_up(from) ||
4291 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
4292 if (src.is_osd() && !osdmap.is_up(from)) {
4293 // share some new maps with this guy in case it may not be
4294 // aware of its own deadness...
4295 send_latest(op, beacon->version+1);
4296 }
4297 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
4298 return false;
4299 }
4300
// Liveness bookkeeping used by the leader to mark silent OSDs down.
4301 last_osd_report[from] = ceph_clock_now();
4302 osd_epochs[from] = beacon->version;
4303
4304 for (const auto& pg : beacon->pgs) {
4305 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
4306 }
4307
4308 if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
4309 beacon->last_purged_snaps_scrub) {
4310 if (pending_inc.new_xinfo.count(from) == 0) {
4311 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
4312 }
4313 pending_inc.new_xinfo[from].last_purged_snaps_scrub =
4314 beacon->last_purged_snaps_scrub;
4315 return true;
4316 } else {
4317 return false;
4318 }
4319 }
4320
4321 // ---------------
4322 // map helpers
4323
// Send osdmaps to the requester: the full current map when start == 0,
// otherwise incrementals from 'start' up to the current epoch.
4324 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4325 {
4326 op->mark_osdmon_event(__func__);
4327 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4328 << " start " << start << dendl;
4329 if (start == 0)
4330 send_full(op);
4331 else
4332 send_incremental(op, start);
4333 }
4334
4335
// Build an MOSDMap carrying the current full osdmap, encoded for the given
// peer feature set. Caller owns the returned message.
4336 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4337 {
4338 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4339 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
// Advertise the range of epochs this monitor can still serve.
4340 r->oldest_map = get_first_committed();
4341 r->newest_map = osdmap.get_epoch();
4342 return r;
4343 }
4344
// Build an MOSDMap containing incremental maps for epochs [from, to],
// substituting a full map for any epoch whose incremental is no longer
// stored. Caller owns the returned message.
4345 MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
4346 {
4347 dout(10) << "build_incremental [" << from << ".." << to << "] with features "
4348 << std::hex << features << std::dec << dendl;
4349 MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
4350 m->oldest_map = get_first_committed();
4351 m->newest_map = osdmap.get_epoch();
4352
// Walk newest-to-oldest; 'e > 0' also guards against epoch_t underflow.
4353 for (epoch_t e = to; e >= from && e > 0; e--) {
4354 bufferlist bl;
4355 int err = get_version(e, features, bl);
4356 if (err == 0) {
4357 ceph_assert(bl.length());
4358 // if (get_version(e, bl) > 0) {
4359 dout(20) << "build_incremental inc " << e << " "
4360 << bl.length() << " bytes" << dendl;
4361 m->incremental_maps[e] = bl;
4362 } else {
// Incremental missing (trimmed): fall back to the full map for this epoch.
4363 ceph_assert(err == -ENOENT);
4364 ceph_assert(!bl.length());
4365 get_version_full(e, features, bl);
4366 if (bl.length() > 0) {
4367 //else if (get_version("full", e, bl) > 0) {
4368 dout(20) << "build_incremental full " << e << " "
4369 << bl.length() << " bytes" << dendl;
4370 m->maps[e] = bl;
4371 } else {
4372 ceph_abort(); // we should have all maps.
4373 }
4374 }
4375 }
4376 return m;
4377 }
4378
// Reply to the request with the current full osdmap, encoded for the
// requester's connection features.
4379 void OSDMonitor::send_full(MonOpRequestRef op)
4380 {
4381 op->mark_osdmon_event(__func__);
4382 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4383 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
4384 }
4385
// Send incremental maps from 'first' for a routed or direct request. If the
// session is proxied through another monitor, delegate the actual map send
// to that monitor via MRoute instead of sending from here.
4386 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4387 {
4388 op->mark_osdmon_event(__func__);
4389
4390 MonSession *s = op->get_session();
4391 ceph_assert(s);
4392
4393 if (s->proxy_con) {
4394 // oh, we can tell the other mon to do it
4395 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4396 << first << dendl;
4397 MRoute *r = new MRoute(s->proxy_tid, NULL);
4398 r->send_osdmap_first = first;
4399 s->proxy_con->send_message(r);
4400 op->mark_event("reply: send routed send_osdmap_first reply");
4401 } else {
4402 // do it ourselves
4403 send_incremental(first, s, false, op);
4404 }
4405 }
4406
// Send maps [first, current] to a session, tracking session->osd_epoch so we
// never resend epochs the peer already has. If 'first' predates the oldest
// committed epoch, a full map is sent first to re-baseline the peer. When
// 'req' is set we send exactly one reply batch and return; when 'onetime'
// is set we send at most one batch on the session connection.
4407 void OSDMonitor::send_incremental(epoch_t first,
4408 MonSession *session,
4409 bool onetime,
4410 MonOpRequestRef req)
4411 {
4412 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
4413 << " to " << session->name << dendl;
4414
4415 // get feature of the peer
4416 // use quorum_con_features, if it's an anonymous connection.
4417 uint64_t features = session->con_features ? session->con_features :
4418 mon->get_quorum_con_features();
4419
// Skip epochs the session is already known to have.
4420 if (first <= session->osd_epoch) {
4421 dout(10) << __func__ << " " << session->name << " should already have epoch "
4422 << session->osd_epoch << dendl;
4423 first = session->osd_epoch + 1;
4424 }
4425
// Requested range was trimmed: start from a full map at the oldest
// committed epoch, then continue with incrementals.
4426 if (first < get_first_committed()) {
4427 MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
4428 m->oldest_map = get_first_committed();
4429 m->newest_map = osdmap.get_epoch();
4430
4431 first = get_first_committed();
4432 bufferlist bl;
4433 int err = get_version_full(first, features, bl);
4434 ceph_assert(err == 0);
4435 ceph_assert(bl.length());
4436 dout(20) << "send_incremental starting with base full "
4437 << first << " " << bl.length() << " bytes" << dendl;
4438 m->maps[first] = bl;
4439
4440 if (req) {
// One reply per request: the peer will re-request the remainder.
4441 mon->send_reply(req, m);
4442 session->osd_epoch = first;
4443 return;
4444 } else {
4445 session->con->send_message(m);
4446 session->osd_epoch = first;
4447 }
4448 first++;
4449 }
4450
// Stream incrementals in batches of osd_map_message_max epochs.
4451 while (first <= osdmap.get_epoch()) {
4452 epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
4453 osdmap.get_epoch());
4454 MOSDMap *m = build_incremental(first, last, features);
4455
4456 if (req) {
4457 // send some maps. it may not be all of them, but it will get them
4458 // started.
4459 mon->send_reply(req, m);
4460 } else {
4461 session->con->send_message(m);
4462 first = last + 1;
4463 }
4464 session->osd_epoch = last;
// Request-driven and onetime sends stop after a single batch.
4465 if (onetime || req)
4466 break;
4467 }
4468 }
4469
// Fetch the incremental map for 'ver' encoded with the quorum's features.
4470 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4471 {
4472 return get_version(ver, mon->get_quorum_con_features(), bl);
4473 }
4474
// Re-encode an incremental map blob in place for an older peer: decode it,
// then encode with the intersection of the peer's features and the
// incremental's canonical encode_features (including any embedded full map
// and crush map).
4475 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4476 {
4477 OSDMap::Incremental inc;
4478 auto q = bl.cbegin();
4479 inc.decode(q);
4480 // always encode with subset of osdmap's canonical features
4481 uint64_t f = features & inc.encode_features;
4482 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4483 << dendl;
4484 bl.clear();
4485 if (inc.fullmap.length()) {
4486 // embedded full map?
4487 OSDMap m;
4488 m.decode(inc.fullmap);
4489 inc.fullmap.clear();
4490 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4491 }
4492 if (inc.crush.length()) {
4493 // embedded crush map
4494 CrushWrapper c;
4495 auto p = inc.crush.cbegin();
4496 c.decode(p);
4497 inc.crush.clear();
4498 c.encode(inc.crush, f);
4499 }
4500 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4501 }
4502
// Re-encode a full map blob in place using the intersection of the peer's
// features and the map's own canonical encoding features.
4503 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4504 {
4505 OSDMap m;
4506 auto q = bl.cbegin();
4507 m.decode(q);
4508 // always encode with subset of osdmap's canonical features
4509 uint64_t f = features & m.get_encoding_features();
4510 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4511 << dendl;
4512 bl.clear();
4513 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4514 }
4515
// Fetch the incremental map for 'ver', re-encoding for the requested feature
// set when it differs significantly from the quorum's, with results cached
// per (version, significant-features) key.
4516 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4517 {
4518 uint64_t significant_features = OSDMap::get_significant_features(features);
4519 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4520 return 0;
4521 }
4522 int ret = PaxosService::get_version(ver, bl);
4523 if (ret < 0) {
4524 return ret;
4525 }
4526 // NOTE: this check is imprecise; the OSDMap encoding features may
4527 // be a subset of the latest mon quorum features, but worst case we
4528 // reencode once and then cache the (identical) result under both
4529 // feature masks.
4530 if (significant_features !=
4531 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4532 reencode_incremental_map(bl, features);
4533 }
4534 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4535 return 0;
4536 }
4537
// Load and decode the incremental map for 'ver'. Asserts that the version is
// available and non-empty; always returns 0 on the success path.
4538 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4539 {
4540 bufferlist inc_bl;
4541 int err = get_version(ver, inc_bl);
4542 ceph_assert(err == 0);
4543 ceph_assert(inc_bl.length());
4544
4545 auto p = inc_bl.cbegin();
4546 inc.decode(p);
4547 dout(10) << __func__ << " "
4548 << " epoch " << inc.epoch
4549 << " inc_crc " << inc.inc_crc
4550 << " full_crc " << inc.full_crc
4551 << " encode_features " << inc.encode_features << dendl;
4552 return 0;
4553 }
4554
// Reconstruct a full osdmap for 'ver' when the stored full map has been
// trimmed: start from the closest pinned full map at or below 'ver' (or a
// newer cached full map), then replay incrementals up to 'ver' and encode
// the result into 'bl'. Returns -ENOENT if no pinned base exists.
4555 int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
4556 {
4557 dout(10) << __func__ << " ver " << ver << dendl;
4558
4559 version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
4560 if (closest_pinned == 0) {
4561 return -ENOENT;
4562 }
4563 if (closest_pinned > ver) {
4564 dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
4565 }
4566 ceph_assert(closest_pinned <= ver);
4567
4568 dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
4569
4570 // get osdmap incremental maps and apply on top of this one.
4571 bufferlist osdm_bl;
4572 bool has_cached_osdmap = false;
// Prefer the newest cached full map in (closest_pinned, ver) as the base,
// to replay as few incrementals as possible.
4573 for (version_t v = ver-1; v >= closest_pinned; --v) {
4574 if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
4575 &osdm_bl)) {
4576 dout(10) << __func__ << " found map in cache ver " << v << dendl;
4577 closest_pinned = v;
4578 has_cached_osdmap = true;
4579 break;
4580 }
4581 }
4582
4583 if (!has_cached_osdmap) {
4584 int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
4585 if (err != 0) {
4586 derr << __func__ << " closest pinned map ver " << closest_pinned
4587 << " not available! error: " << cpp_strerror(err) << dendl;
4588 }
4589 ceph_assert(err == 0);
4590 }
4591
4592 ceph_assert(osdm_bl.length());
4593
4594 OSDMap osdm;
4595 osdm.decode(osdm_bl);
4596
4597 dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
4598 << " e" << osdm.epoch
4599 << " crc " << osdm.get_crc()
4600 << " -- applying incremental maps." << dendl;
4601
// Remember the last incremental's encode features so the final full map is
// encoded the same way the epoch originally was.
4602 uint64_t encode_features = 0;
4603 for (version_t v = closest_pinned + 1; v <= ver; ++v) {
4604 dout(20) << __func__ << " applying inc epoch " << v << dendl;
4605
4606 OSDMap::Incremental inc;
4607 int err = get_inc(v, inc);
4608 ceph_assert(err == 0);
4609
4610 encode_features = inc.encode_features;
4611
4612 err = osdm.apply_incremental(inc);
4613 ceph_assert(err == 0);
4614
4615 // this block performs paranoid checks on map retrieval
4616 if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
4617 inc.full_crc != 0) {
4618
4619 uint64_t f = encode_features;
4620 if (!f) {
4621 f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
4622 }
4623
4624 // encode osdmap to force calculating crcs
4625 bufferlist tbl;
4626 osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
4627 // decode osdmap to compare crcs with what's expected by incremental
4628 OSDMap tosdm;
4629 tosdm.decode(tbl);
4630
4631 if (tosdm.get_crc() != inc.full_crc) {
4632 derr << __func__
4633 << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
4634 << ", expected " << inc.full_crc << ")" << dendl;
4635 ceph_abort_msg("osdmap crc mismatch");
4636 }
4637 }
4638
4639 // note: we cannot add the recently computed map to the cache, as is,
4640 // because we have not encoded the map into a bl.
4641 }
4642
4643 if (!encode_features) {
4644 dout(10) << __func__
4645 << " last incremental map didn't have features;"
4646 << " defaulting to quorum's or all" << dendl;
4647 encode_features =
4648 (mon->quorum_con_features ? mon->quorum_con_features : -1);
4649 }
4650 osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
4651
4652 return 0;
4653 }
4654
// Fetch the full map for 'ver' encoded with the quorum's features.
4655 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4656 {
4657 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4658 }
4659
// Fetch the full map for 'ver': try the per-(version, significant-features)
// cache, then the store, then rebuild from a pinned map plus incrementals;
// re-encode for the requested features when they differ from the quorum's.
4660 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4661 bufferlist& bl)
4662 {
4663 uint64_t significant_features = OSDMap::get_significant_features(features);
4664 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4665 return 0;
4666 }
4667 int ret = PaxosService::get_version_full(ver, bl);
4668 if (ret == -ENOENT) {
4669 // build map?
4670 ret = get_full_from_pinned_map(ver, bl);
4671 }
4672 if (ret < 0) {
4673 return ret;
4674 }
4675 // NOTE: this check is imprecise; the OSDMap encoding features may
4676 // be a subset of the latest mon quorum features, but worst case we
4677 // reencode once and then cache the (identical) result under both
4678 // feature masks.
4679 if (significant_features !=
4680 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4681 reencode_full_map(bl, features);
4682 }
4683 full_osd_cache.add_bytes({ver, significant_features}, bl);
4684 return 0;
4685 }
4686
// Queue every address in 'av' for blacklisting until 'until'. The address
// type is normalized per release (ANY on nautilus+, LEGACY before) so the
// entry matches how clients are identified. Returns the epoch in which the
// pending blacklist will take effect.
4687 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4688 {
4689 dout(10) << "blacklist " << av << " until " << until << dendl;
// 'a' is a copy by value: mutating its type does not touch the caller's vec.
4690 for (auto a : av.v) {
4691 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4692 a.set_type(entity_addr_t::TYPE_ANY);
4693 } else {
4694 a.set_type(entity_addr_t::TYPE_LEGACY);
4695 }
4696 pending_inc.new_blacklist[a] = until;
4697 }
4698 return pending_inc.epoch;
4699 }
4700
// Single-address variant: normalize the address type per release and queue
// it for blacklisting until 'until'. Returns the pending epoch.
4701 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4702 {
4703 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4704 a.set_type(entity_addr_t::TYPE_ANY);
4705 } else {
4706 a.set_type(entity_addr_t::TYPE_LEGACY);
4707 }
4708 dout(10) << "blacklist " << a << " until " << until << dendl;
4709 pending_inc.new_blacklist[a] = until;
4710 return pending_inc.epoch;
4711 }
4712
4713
// Walk all "osdmap" subscriptions and push any epochs each subscriber is
// missing. No-op before the first committed map.
4714 void OSDMonitor::check_osdmap_subs()
4715 {
4716 dout(10) << __func__ << dendl;
4717 if (!osdmap.get_epoch()) {
4718 return;
4719 }
4720 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4721 if (osdmap_subs == mon->session_map.subs.end()) {
4722 return;
4723 }
// Advance the iterator before servicing: check_osdmap_sub() may remove the
// current (onetime) subscription from the container.
4724 auto p = osdmap_subs->second->begin();
4725 while (!p.end()) {
4726 auto sub = *p;
4727 ++p;
4728 check_osdmap_sub(sub);
4729 }
4730 }
4731
// Service a single osdmap subscription: send incrementals from sub->next (or
// the latest full map when next == 0), then either remove a onetime sub or
// advance its cursor past the current epoch.
4732 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4733 {
4734 dout(10) << __func__ << " " << sub << " next " << sub->next
4735 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4736 if (sub->next <= osdmap.get_epoch()) {
4737 if (sub->next >= 1)
4738 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4739 else
4740 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4741 if (sub->onetime)
4742 mon->session_map.remove_sub(sub);
4743 else
4744 sub->next = osdmap.get_epoch() + 1;
4745 }
4746 }
4747
// Service every "osd_pg_creates" subscription while holding the session map
// lock. Skipped entirely when no OSDs are up.
4748 void OSDMonitor::check_pg_creates_subs()
4749 {
4750 if (!osdmap.get_num_up_osds()) {
4751 return;
4752 }
// All up OSDs must speak the stateful-sub protocol for this path.
4753 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4754 mon->with_session_map([this](const MonSessionMap& session_map) {
4755 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4756 if (pg_creates_subs == session_map.subs.end()) {
4757 return;
4758 }
4759 for (auto sub : *pg_creates_subs->second) {
4760 check_pg_creates_sub(sub);
4761 }
4762 });
4763 }
4764
// Service a single pg-creates subscription: send pending pg-create messages
// to the subscribing OSD (only while it is up) and advance the sub cursor to
// the epoch returned by send_pg_creates().
4765 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4766 {
4767 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4768 ceph_assert(sub->type == "osd_pg_creates");
4769 // only send these if the OSD is up. we will check_subs() when they do
4770 // come up so they will get the creates then.
4771 if (sub->session->name.is_osd() &&
4772 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4773 sub->next = send_pg_creates(sub->session->name.num(),
4774 sub->session->con.get(),
4775 sub->next);
4776 }
4777 }
4778
// Enable (or update) an application tag on a pool in the pending increment.
// With an empty app_key only the application name is registered; otherwise
// the key/value is set — 'force' overwrites an existing key, while the
// non-force path inserts only if the application entry is absent.
// Caller must hold paxos plugged and the service writeable.
4779 void OSDMonitor::do_application_enable(int64_t pool_id,
4780 const std::string &app_name,
4781 const std::string &app_key,
4782 const std::string &app_value,
4783 bool force)
4784 {
4785 ceph_assert(paxos->is_plugged() && is_writeable());
4786
4787 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4788 << dendl;
4789
4790 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
4791
4792 auto pp = osdmap.get_pg_pool(pool_id);
4793 ceph_assert(pp != nullptr);
4794
// Work on a copy, starting from any already-pending version of the pool.
4795 pg_pool_t p = *pp;
4796 if (pending_inc.new_pools.count(pool_id)) {
4797 p = pending_inc.new_pools[pool_id];
4798 }
4799
4800 if (app_key.empty()) {
4801 p.application_metadata.insert({app_name, {}});
4802 } else {
4803 if (force) {
4804 p.application_metadata[app_name][app_key] = app_value;
4805 } else {
// map::insert is a no-op if app_name already exists (non-destructive).
4806 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4807 }
4808 }
4809 p.last_change = pending_inc.epoch;
4810 pending_inc.new_pools[pool_id] = p;
4811 }
4812
// Set a pool option in the pending increment, seeding the pending pool entry
// from the committed pool only if one isn't already queued (try_emplace).
4813 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4814 pool_opts_t::key_t opt,
4815 pool_opts_t::value_t val)
4816 {
4817 auto p = pending_inc.new_pools.try_emplace(
4818 pool_id, *osdmap.get_pg_pool(pool_id));
4819 p.first->second.opts.set(opt, val);
4820 }
4821
// Scan 'pools' for pools whose pgs still need to be created and queue them in
// 'creating_pgs'. Skips pools already created, with invalid crush rules,
// unchanged since the last scan, or pending removal. Returns the number of
// pools queued.
4822 unsigned OSDMonitor::scan_for_creating_pgs(
4823 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4824 const mempool::osdmap::set<int64_t>& removed_pools,
4825 utime_t modified,
4826 creating_pgs_t* creating_pgs) const
4827 {
4828 unsigned queued = 0;
4829 for (auto& p : pools) {
4830 int64_t poolid = p.first;
4831 if (creating_pgs->created_pools.count(poolid)) {
4832 dout(10) << __func__ << " already created " << poolid << dendl;
4833 continue;
4834 }
4835 const pg_pool_t& pool = p.second;
4836 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
4837 pool.get_type(), pool.get_size());
// Pools without a usable crush rule cannot be mapped, so don't queue them.
4838 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4839 continue;
4840
4841 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4842 const auto created = pool.get_last_change();
4843 if (last_scan_epoch && created <= last_scan_epoch) {
4844 dout(10) << __func__ << " no change in pool " << poolid
4845 << " " << pool << dendl;
4846 continue;
4847 }
4848 if (removed_pools.count(poolid)) {
4849 dout(10) << __func__ << " pool is being removed: " << poolid
4850 << " " << pool << dendl;
4851 continue;
4852 }
4853 dout(10) << __func__ << " queueing pool create for " << poolid
4854 << " " << pool << dendl;
4855 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4856 created, modified);
4857 queued++;
4858 }
4859 return queued;
4860 }
4861
// Rebuild creating_pgs_by_osd_epoch from creating_pgs: for each pg still
// being created, determine its current acting primary and the epoch to tag
// the create message with — keeping the old epoch when the primary is
// unchanged, or bumping to the current mapping epoch when the target moved.
4862 void OSDMonitor::update_creating_pgs()
4863 {
4864 dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
4865 << creating_pgs.queue.size() << " pools in queue" << dendl;
4866 decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
4867 std::lock_guard<std::mutex> l(creating_pgs_lock);
4868 for (const auto& pg : creating_pgs.pgs) {
4869 int acting_primary = -1;
4870 auto pgid = pg.first;
4871 if (!osdmap.pg_exists(pgid)) {
4872 dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
4873 << dendl;
4874 continue;
4875 }
4876 auto mapped = pg.second.create_epoch;
4877 dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
4878 spg_t spgid(pgid);
4879 mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
4880 // check the previous creating_pgs, look for the target to whom the pg was
4881 // previously mapped
4882 for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
4883 const auto last_acting_primary = pgs_by_epoch.first;
4884 for (auto& pgs: pgs_by_epoch.second) {
4885 if (pgs.second.count(spgid)) {
// Same primary as before: keep the original epoch so the create message
// is not needlessly re-tagged (and re-sent) to the same OSD.
4886 if (last_acting_primary == acting_primary) {
4887 mapped = pgs.first;
4888 } else {
4889 dout(20) << __func__ << " " << pgid << " "
4890 << " acting_primary:" << last_acting_primary
4891 << " -> " << acting_primary << dendl;
4892 // note epoch if the target of the create message changed.
4893 mapped = mapping.get_epoch();
4894 }
4895 break;
4896 } else {
4897 // newly creating
4898 mapped = mapping.get_epoch();
4899 }
4900 }
4901 }
4902 dout(10) << __func__ << " will instruct osd." << acting_primary
4903 << " to create " << pgid << "@" << mapped << dendl;
4904 new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
4905 }
4906 creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
4907 creating_pgs_epoch = mapping.get_epoch();
4908 }
4909
// Send pending pg-create messages to osd.<osd> over the given
// connection, covering epochs >= 'next' in creating_pgs_by_osd_epoch.
// Returns the epoch the osd's subscription is current through (last
// epoch sent + 1), or 'next' unchanged when there is nothing to send
// or the creating_pgs index is stale.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // pre-nautilus osds only understand the legacy MOSDPGCreate message
  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// lazily allocate the (single) legacy message
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.create_epoch, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
	// lazily allocate the (single) new-style message
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				     create->second.create_stamp));
	if (create->second.history.epoch_created) {
	  dout(20) << __func__ << " " << pg << " " << create->second.history
		   << " " << create->second.past_intervals << dendl;
	  m->pg_extra.emplace(pg, make_pair(create->second.history,
					    create->second.past_intervals));
	}
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  // at most one of m/oldm was populated above
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4981
4982 // TICK
4983
4984
// Periodic monitor housekeeping.  On every monitor: reload the osdmap
// manifest and let the priority cache manager retune/rebalance memory.
// On the leader only: mark osds down on beacon timeout, process
// failure reports, auto-mark long-down (or destroyed) osds out, expire
// blacklist entries, prune purged snaps, refresh pool status, and
// propose a new map if any of that produced a pending change.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which is leader-only work
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down that have not sent a beacon recently
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      // how long this osd has been down
      utime_t down = now;
      down -= i->second;
      // advance now: we may erase down_pending_out[o] below
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// live-but-down osds use the (possibly laggy-scaled) grace period;
	// destroyed osds use their own fixed interval
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      // osd was marked out (or is no longer down/in): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5149
5150 void OSDMonitor::_set_new_cache_sizes()
5151 {
5152 uint64_t cache_size = 0;
5153 int64_t inc_alloc = 0;
5154 int64_t full_alloc = 0;
5155 int64_t kv_alloc = 0;
5156
5157 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5158 cache_size = pcm->get_tuned_mem();
5159 inc_alloc = inc_cache->get_committed_size();
5160 full_alloc = full_cache->get_committed_size();
5161 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5162 }
5163
5164 inc_osd_cache.set_bytes(inc_alloc);
5165 full_osd_cache.set_bytes(full_alloc);
5166
5167 dout(1) << __func__ << " cache_size:" << cache_size
5168 << " inc_alloc: " << inc_alloc
5169 << " full_alloc: " << full_alloc
5170 << " kv_alloc: " << kv_alloc
5171 << dendl;
5172 }
5173
// Mark down any up osd that has not reported within
// mon_osd_report_timeout seconds.  'last_osd_report' records when each
// osd was last heard from; osds with no entry have their timer started
// here, and entries for nonexistent osds are dropped.  Returns true if
// at least one osd was newly marked down (so the caller should propose
// the pending map).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE: new_state entries are applied as an xor of state bits,
	// so setting CEPH_OSD_UP here flips the osd's up flag off,
	// marking it down (matching the log messages above).
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5212
5213 static void dump_cpu_list(Formatter *f, const char *name,
5214 const string& strlist)
5215 {
5216 cpu_set_t cpu_set;
5217 size_t cpu_set_size;
5218 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5219 return;
5220 }
5221 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5222 f->open_array_section(name);
5223 for (auto cpu : cpus) {
5224 f->dump_int("cpu", cpu);
5225 }
5226 f->close_section();
5227 }
5228
// Dump the monitor's osdmap-related state into the given formatter:
// the full osdmap, per-osd metadata, the last-clean-epoch bookkeeping,
// the committed osdmap version range, the crush map, and the osdmap
// manifest when one has been loaded.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // per-osd metadata for every existing osd; errors are ignored
  // (NULL error stream) so one bad osd doesn't abort the dump
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5277
namespace {
  // The set of properties that "osd pool get" knows how to report.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the members of 'first' that do not also appear in 'second'
  // (plain set difference; neither argument is modified).
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> kept;
      for (const auto choice : first) {
	if (second.count(choice) == 0) {
	  // hint: we iterate in ascending order, so the end is right
	  kept.insert(kept.end(), choice);
	}
      }
      return kept;
    }
}
5311
5312
5313 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5314 {
5315 op->mark_osdmon_event(__func__);
5316 auto m = op->get_req<MMonCommand>();
5317 int r = 0;
5318 bufferlist rdata;
5319 stringstream ss, ds;
5320
5321 cmdmap_t cmdmap;
5322 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5323 string rs = ss.str();
5324 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5325 return true;
5326 }
5327
5328 MonSession *session = op->get_session();
5329 if (!session) {
5330 derr << __func__ << " no session" << dendl;
5331 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5332 return true;
5333 }
5334
5335 string prefix;
5336 cmd_getval(cmdmap, "prefix", prefix);
5337
5338 string format;
5339 cmd_getval(cmdmap, "format", format, string("plain"));
5340 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5341
5342 if (prefix == "osd stat") {
5343 if (f) {
5344 f->open_object_section("osdmap");
5345 osdmap.print_summary(f.get(), ds, "", true);
5346 f->close_section();
5347 f->flush(rdata);
5348 } else {
5349 osdmap.print_summary(nullptr, ds, "", true);
5350 rdata.append(ds);
5351 }
5352 }
5353 else if (prefix == "osd dump" ||
5354 prefix == "osd tree" ||
5355 prefix == "osd tree-from" ||
5356 prefix == "osd ls" ||
5357 prefix == "osd getmap" ||
5358 prefix == "osd getcrushmap" ||
5359 prefix == "osd ls-tree" ||
5360 prefix == "osd info") {
5361 string val;
5362
5363 epoch_t epoch = 0;
5364 int64_t epochnum;
5365 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5366 epoch = epochnum;
5367
5368 bufferlist osdmap_bl;
5369 int err = get_version_full(epoch, osdmap_bl);
5370 if (err == -ENOENT) {
5371 r = -ENOENT;
5372 ss << "there is no map for epoch " << epoch;
5373 goto reply;
5374 }
5375 ceph_assert(err == 0);
5376 ceph_assert(osdmap_bl.length());
5377
5378 OSDMap *p;
5379 if (epoch == osdmap.get_epoch()) {
5380 p = &osdmap;
5381 } else {
5382 p = new OSDMap;
5383 p->decode(osdmap_bl);
5384 }
5385
5386 auto sg = make_scope_guard([&] {
5387 if (p != &osdmap) {
5388 delete p;
5389 }
5390 });
5391
5392 if (prefix == "osd dump") {
5393 stringstream ds;
5394 if (f) {
5395 f->open_object_section("osdmap");
5396 p->dump(f.get());
5397 f->close_section();
5398 f->flush(ds);
5399 } else {
5400 p->print(ds);
5401 }
5402 rdata.append(ds);
5403 if (!f)
5404 ds << " ";
5405 } else if (prefix == "osd ls") {
5406 if (f) {
5407 f->open_array_section("osds");
5408 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5409 if (osdmap.exists(i)) {
5410 f->dump_int("osd", i);
5411 }
5412 }
5413 f->close_section();
5414 f->flush(ds);
5415 } else {
5416 bool first = true;
5417 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5418 if (osdmap.exists(i)) {
5419 if (!first)
5420 ds << "\n";
5421 first = false;
5422 ds << i;
5423 }
5424 }
5425 }
5426 rdata.append(ds);
5427 } else if (prefix == "osd info") {
5428 int64_t osd_id;
5429 bool do_single_osd = true;
5430 if (!cmd_getval(cmdmap, "id", osd_id)) {
5431 do_single_osd = false;
5432 }
5433
5434 if (do_single_osd && !osdmap.exists(osd_id)) {
5435 ss << "osd." << osd_id << " does not exist";
5436 r = -EINVAL;
5437 goto reply;
5438 }
5439
5440 if (f) {
5441 if (do_single_osd) {
5442 osdmap.dump_osd(osd_id, f.get());
5443 } else {
5444 osdmap.dump_osds(f.get());
5445 }
5446 f->flush(ds);
5447 } else {
5448 if (do_single_osd) {
5449 osdmap.print_osd(osd_id, ds);
5450 } else {
5451 osdmap.print_osds(ds);
5452 }
5453 }
5454 rdata.append(ds);
5455 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5456 string bucket;
5457 if (prefix == "osd tree-from") {
5458 cmd_getval(cmdmap, "bucket", bucket);
5459 if (!osdmap.crush->name_exists(bucket)) {
5460 ss << "bucket '" << bucket << "' does not exist";
5461 r = -ENOENT;
5462 goto reply;
5463 }
5464 int id = osdmap.crush->get_item_id(bucket);
5465 if (id >= 0) {
5466 ss << "\"" << bucket << "\" is not a bucket";
5467 r = -EINVAL;
5468 goto reply;
5469 }
5470 }
5471
5472 vector<string> states;
5473 cmd_getval(cmdmap, "states", states);
5474 unsigned filter = 0;
5475 for (auto& s : states) {
5476 if (s == "up") {
5477 filter |= OSDMap::DUMP_UP;
5478 } else if (s == "down") {
5479 filter |= OSDMap::DUMP_DOWN;
5480 } else if (s == "in") {
5481 filter |= OSDMap::DUMP_IN;
5482 } else if (s == "out") {
5483 filter |= OSDMap::DUMP_OUT;
5484 } else if (s == "destroyed") {
5485 filter |= OSDMap::DUMP_DESTROYED;
5486 } else {
5487 ss << "unrecognized state '" << s << "'";
5488 r = -EINVAL;
5489 goto reply;
5490 }
5491 }
5492 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5493 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5494 ss << "cannot specify both 'in' and 'out'";
5495 r = -EINVAL;
5496 goto reply;
5497 }
5498 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5499 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5500 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5501 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5502 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5503 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5504 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5505 r = -EINVAL;
5506 goto reply;
5507 }
5508 if (f) {
5509 f->open_object_section("tree");
5510 p->print_tree(f.get(), NULL, filter, bucket);
5511 f->close_section();
5512 f->flush(ds);
5513 } else {
5514 p->print_tree(NULL, &ds, filter, bucket);
5515 }
5516 rdata.append(ds);
5517 } else if (prefix == "osd getmap") {
5518 rdata.append(osdmap_bl);
5519 ss << "got osdmap epoch " << p->get_epoch();
5520 } else if (prefix == "osd getcrushmap") {
5521 p->crush->encode(rdata, mon->get_quorum_con_features());
5522 ss << p->get_crush_version();
5523 } else if (prefix == "osd ls-tree") {
5524 string bucket_name;
5525 cmd_getval(cmdmap, "name", bucket_name);
5526 set<int> osds;
5527 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5528 if (r == -ENOENT) {
5529 ss << "\"" << bucket_name << "\" does not exist";
5530 goto reply;
5531 } else if (r < 0) {
5532 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5533 goto reply;
5534 }
5535
5536 if (f) {
5537 f->open_array_section("osds");
5538 for (auto &i : osds) {
5539 if (osdmap.exists(i)) {
5540 f->dump_int("osd", i);
5541 }
5542 }
5543 f->close_section();
5544 f->flush(ds);
5545 } else {
5546 bool first = true;
5547 for (auto &i : osds) {
5548 if (osdmap.exists(i)) {
5549 if (!first)
5550 ds << "\n";
5551 first = false;
5552 ds << i;
5553 }
5554 }
5555 }
5556
5557 rdata.append(ds);
5558 }
5559 } else if (prefix == "osd getmaxosd") {
5560 if (f) {
5561 f->open_object_section("getmaxosd");
5562 f->dump_unsigned("epoch", osdmap.get_epoch());
5563 f->dump_int("max_osd", osdmap.get_max_osd());
5564 f->close_section();
5565 f->flush(rdata);
5566 } else {
5567 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5568 rdata.append(ds);
5569 }
5570 } else if (prefix == "osd utilization") {
5571 string out;
5572 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5573 if (f)
5574 f->flush(rdata);
5575 else
5576 rdata.append(out);
5577 r = 0;
5578 goto reply;
5579 } else if (prefix == "osd find") {
5580 int64_t osd;
5581 if (!cmd_getval(cmdmap, "id", osd)) {
5582 ss << "unable to parse osd id value '"
5583 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5584 r = -EINVAL;
5585 goto reply;
5586 }
5587 if (!osdmap.exists(osd)) {
5588 ss << "osd." << osd << " does not exist";
5589 r = -ENOENT;
5590 goto reply;
5591 }
5592 string format;
5593 cmd_getval(cmdmap, "format", format);
5594 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5595 f->open_object_section("osd_location");
5596 f->dump_int("osd", osd);
5597 f->dump_object("addrs", osdmap.get_addrs(osd));
5598 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5599
5600 // try to identify host, pod/container name, etc.
5601 map<string,string> m;
5602 load_metadata(osd, m, nullptr);
5603 if (auto p = m.find("hostname"); p != m.end()) {
5604 f->dump_string("host", p->second);
5605 }
5606 for (auto& k : {
5607 "pod_name", "pod_namespace", // set by rook
5608 "container_name" // set by cephadm, ceph-ansible
5609 }) {
5610 if (auto p = m.find(k); p != m.end()) {
5611 f->dump_string(k, p->second);
5612 }
5613 }
5614
5615 // crush is helpful too
5616 f->open_object_section("crush_location");
5617 map<string,string> loc = osdmap.crush->get_full_location(osd);
5618 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5619 f->dump_string(p->first.c_str(), p->second);
5620 f->close_section();
5621 f->close_section();
5622 f->flush(rdata);
5623 } else if (prefix == "osd metadata") {
5624 int64_t osd = -1;
5625 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5626 !cmd_getval(cmdmap, "id", osd)) {
5627 ss << "unable to parse osd id value '"
5628 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5629 r = -EINVAL;
5630 goto reply;
5631 }
5632 if (osd >= 0 && !osdmap.exists(osd)) {
5633 ss << "osd." << osd << " does not exist";
5634 r = -ENOENT;
5635 goto reply;
5636 }
5637 string format;
5638 cmd_getval(cmdmap, "format", format);
5639 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5640 if (osd >= 0) {
5641 f->open_object_section("osd_metadata");
5642 f->dump_unsigned("id", osd);
5643 r = dump_osd_metadata(osd, f.get(), &ss);
5644 if (r < 0)
5645 goto reply;
5646 f->close_section();
5647 } else {
5648 r = 0;
5649 f->open_array_section("osd_metadata");
5650 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5651 if (osdmap.exists(i)) {
5652 f->open_object_section("osd");
5653 f->dump_unsigned("id", i);
5654 r = dump_osd_metadata(i, f.get(), NULL);
5655 if (r == -EINVAL || r == -ENOENT) {
5656 // Drop error, continue to get other daemons' metadata
5657 dout(4) << "No metadata for osd." << i << dendl;
5658 r = 0;
5659 } else if (r < 0) {
5660 // Unexpected error
5661 goto reply;
5662 }
5663 f->close_section();
5664 }
5665 }
5666 f->close_section();
5667 }
5668 f->flush(rdata);
5669 } else if (prefix == "osd versions") {
5670 if (!f)
5671 f.reset(Formatter::create("json-pretty"));
5672 count_metadata("ceph_version", f.get());
5673 f->flush(rdata);
5674 r = 0;
5675 } else if (prefix == "osd count-metadata") {
5676 if (!f)
5677 f.reset(Formatter::create("json-pretty"));
5678 string field;
5679 cmd_getval(cmdmap, "property", field);
5680 count_metadata(field, f.get());
5681 f->flush(rdata);
5682 r = 0;
5683 } else if (prefix == "osd numa-status") {
5684 TextTable tbl;
5685 if (f) {
5686 f->open_array_section("osds");
5687 } else {
5688 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5689 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5690 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5691 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5692 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5693 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5694 }
5695 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5696 if (osdmap.exists(i)) {
5697 map<string,string> m;
5698 ostringstream err;
5699 if (load_metadata(i, m, &err) < 0) {
5700 continue;
5701 }
5702 string host;
5703 auto p = m.find("hostname");
5704 if (p != m.end()) {
5705 host = p->second;
5706 }
5707 if (f) {
5708 f->open_object_section("osd");
5709 f->dump_int("osd", i);
5710 f->dump_string("host", host);
5711 for (auto n : { "network_numa_node", "objectstore_numa_node",
5712 "numa_node" }) {
5713 p = m.find(n);
5714 if (p != m.end()) {
5715 f->dump_int(n, atoi(p->second.c_str()));
5716 }
5717 }
5718 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5719 p = m.find(n);
5720 if (p != m.end()) {
5721 list<string> ls = get_str_list(p->second, ",");
5722 f->open_array_section(n);
5723 for (auto node : ls) {
5724 f->dump_int("node", atoi(node.c_str()));
5725 }
5726 f->close_section();
5727 }
5728 }
5729 for (auto n : { "numa_node_cpus" }) {
5730 p = m.find(n);
5731 if (p != m.end()) {
5732 dump_cpu_list(f.get(), n, p->second);
5733 }
5734 }
5735 f->close_section();
5736 } else {
5737 tbl << i;
5738 tbl << host;
5739 p = m.find("network_numa_nodes");
5740 if (p != m.end()) {
5741 tbl << p->second;
5742 } else {
5743 tbl << "-";
5744 }
5745 p = m.find("objectstore_numa_nodes");
5746 if (p != m.end()) {
5747 tbl << p->second;
5748 } else {
5749 tbl << "-";
5750 }
5751 p = m.find("numa_node");
5752 auto q = m.find("numa_node_cpus");
5753 if (p != m.end() && q != m.end()) {
5754 tbl << p->second;
5755 tbl << q->second;
5756 } else {
5757 tbl << "-";
5758 tbl << "-";
5759 }
5760 tbl << TextTable::endrow;
5761 }
5762 }
5763 }
5764 if (f) {
5765 f->close_section();
5766 f->flush(rdata);
5767 } else {
5768 rdata.append(stringify(tbl));
5769 }
5770 } else if (prefix == "osd map") {
5771 string poolstr, objstr, namespacestr;
5772 cmd_getval(cmdmap, "pool", poolstr);
5773 cmd_getval(cmdmap, "object", objstr);
5774 cmd_getval(cmdmap, "nspace", namespacestr);
5775
5776 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5777 if (pool < 0) {
5778 ss << "pool " << poolstr << " does not exist";
5779 r = -ENOENT;
5780 goto reply;
5781 }
5782 object_locator_t oloc(pool, namespacestr);
5783 object_t oid(objstr);
5784 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5785 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5786 vector<int> up, acting;
5787 int up_p, acting_p;
5788 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5789
5790 string fullobjname;
5791 if (!namespacestr.empty())
5792 fullobjname = namespacestr + string("/") + oid.name;
5793 else
5794 fullobjname = oid.name;
5795 if (f) {
5796 f->open_object_section("osd_map");
5797 f->dump_unsigned("epoch", osdmap.get_epoch());
5798 f->dump_string("pool", poolstr);
5799 f->dump_int("pool_id", pool);
5800 f->dump_stream("objname") << fullobjname;
5801 f->dump_stream("raw_pgid") << pgid;
5802 f->dump_stream("pgid") << mpgid;
5803 f->open_array_section("up");
5804 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5805 f->dump_int("osd", *p);
5806 f->close_section();
5807 f->dump_int("up_primary", up_p);
5808 f->open_array_section("acting");
5809 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5810 f->dump_int("osd", *p);
5811 f->close_section();
5812 f->dump_int("acting_primary", acting_p);
5813 f->close_section(); // osd_map
5814 f->flush(rdata);
5815 } else {
5816 ds << "osdmap e" << osdmap.get_epoch()
5817 << " pool '" << poolstr << "' (" << pool << ")"
5818 << " object '" << fullobjname << "' ->"
5819 << " pg " << pgid << " (" << mpgid << ")"
5820 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5821 << pg_vector_string(acting) << ", p" << acting_p << ")";
5822 rdata.append(ds);
5823 }
5824
5825 } else if (prefix == "pg map") {
5826 pg_t pgid;
5827 string pgidstr;
5828 cmd_getval(cmdmap, "pgid", pgidstr);
5829 if (!pgid.parse(pgidstr.c_str())) {
5830 ss << "invalid pgid '" << pgidstr << "'";
5831 r = -EINVAL;
5832 goto reply;
5833 }
5834 vector<int> up, acting;
5835 if (!osdmap.have_pg_pool(pgid.pool())) {
5836 ss << "pg '" << pgidstr << "' does not exist";
5837 r = -ENOENT;
5838 goto reply;
5839 }
5840 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5841 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5842 if (f) {
5843 f->open_object_section("pg_map");
5844 f->dump_unsigned("epoch", osdmap.get_epoch());
5845 f->dump_stream("raw_pgid") << pgid;
5846 f->dump_stream("pgid") << mpgid;
5847 f->open_array_section("up");
5848 for (auto osd : up) {
5849 f->dump_int("up_osd", osd);
5850 }
5851 f->close_section();
5852 f->open_array_section("acting");
5853 for (auto osd : acting) {
5854 f->dump_int("acting_osd", osd);
5855 }
5856 f->close_section();
5857 f->close_section();
5858 f->flush(rdata);
5859 } else {
5860 ds << "osdmap e" << osdmap.get_epoch()
5861 << " pg " << pgid << " (" << mpgid << ")"
5862 << " -> up " << up << " acting " << acting;
5863 rdata.append(ds);
5864 }
5865 goto reply;
5866
5867 } else if (prefix == "osd lspools") {
5868 if (f)
5869 f->open_array_section("pools");
5870 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5871 p != osdmap.pools.end();
5872 ++p) {
5873 if (f) {
5874 f->open_object_section("pool");
5875 f->dump_int("poolnum", p->first);
5876 f->dump_string("poolname", osdmap.pool_name[p->first]);
5877 f->close_section();
5878 } else {
5879 ds << p->first << ' ' << osdmap.pool_name[p->first];
5880 if (next(p) != osdmap.pools.end()) {
5881 ds << '\n';
5882 }
5883 }
5884 }
5885 if (f) {
5886 f->close_section();
5887 f->flush(ds);
5888 }
5889 rdata.append(ds);
5890 } else if (prefix == "osd blacklist ls") {
5891 if (f)
5892 f->open_array_section("blacklist");
5893
5894 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5895 p != osdmap.blacklist.end();
5896 ++p) {
5897 if (f) {
5898 f->open_object_section("entry");
5899 f->dump_string("addr", p->first.get_legacy_str());
5900 f->dump_stream("until") << p->second;
5901 f->close_section();
5902 } else {
5903 stringstream ss;
5904 string s;
5905 ss << p->first << " " << p->second;
5906 getline(ss, s);
5907 s += "\n";
5908 rdata.append(s);
5909 }
5910 }
5911 if (f) {
5912 f->close_section();
5913 f->flush(rdata);
5914 }
5915 ss << "listed " << osdmap.blacklist.size() << " entries";
5916
5917 } else if (prefix == "osd pool ls") {
5918 string detail;
5919 cmd_getval(cmdmap, "detail", detail);
5920 if (!f && detail == "detail") {
5921 ostringstream ss;
5922 osdmap.print_pools(ss);
5923 rdata.append(ss.str());
5924 } else {
5925 if (f)
5926 f->open_array_section("pools");
5927 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5928 it != osdmap.get_pools().end();
5929 ++it) {
5930 if (f) {
5931 if (detail == "detail") {
5932 f->open_object_section("pool");
5933 f->dump_int("pool_id", it->first);
5934 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5935 it->second.dump(f.get());
5936 f->close_section();
5937 } else {
5938 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5939 }
5940 } else {
5941 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5942 }
5943 }
5944 if (f) {
5945 f->close_section();
5946 f->flush(rdata);
5947 }
5948 }
5949
5950 } else if (prefix == "osd crush get-tunable") {
5951 string tunable;
5952 cmd_getval(cmdmap, "tunable", tunable);
5953 ostringstream rss;
5954 if (f)
5955 f->open_object_section("tunable");
5956 if (tunable == "straw_calc_version") {
5957 if (f)
5958 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5959 else
5960 rss << osdmap.crush->get_straw_calc_version() << "\n";
5961 } else {
5962 r = -EINVAL;
5963 goto reply;
5964 }
5965 if (f) {
5966 f->close_section();
5967 f->flush(rdata);
5968 } else {
5969 rdata.append(rss.str());
5970 }
5971 r = 0;
5972
5973 } else if (prefix == "osd pool get") {
5974 string poolstr;
5975 cmd_getval(cmdmap, "pool", poolstr);
5976 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5977 if (pool < 0) {
5978 ss << "unrecognized pool '" << poolstr << "'";
5979 r = -ENOENT;
5980 goto reply;
5981 }
5982
5983 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5984 string var;
5985 cmd_getval(cmdmap, "var", var);
5986
5987 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5988 const choices_map_t ALL_CHOICES = {
5989 {"size", SIZE},
5990 {"min_size", MIN_SIZE},
5991 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
5992 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5993 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
5994 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5995 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5996 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5997 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5998 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5999 {"use_gmt_hitset", USE_GMT_HITSET},
6000 {"target_max_objects", TARGET_MAX_OBJECTS},
6001 {"target_max_bytes", TARGET_MAX_BYTES},
6002 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6003 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6004 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6005 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6006 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6007 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6008 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6009 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6010 {"fast_read", FAST_READ},
6011 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6012 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6013 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6014 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6015 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6016 {"recovery_priority", RECOVERY_PRIORITY},
6017 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6018 {"scrub_priority", SCRUB_PRIORITY},
6019 {"compression_mode", COMPRESSION_MODE},
6020 {"compression_algorithm", COMPRESSION_ALGORITHM},
6021 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6022 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6023 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6024 {"csum_type", CSUM_TYPE},
6025 {"csum_max_block", CSUM_MAX_BLOCK},
6026 {"csum_min_block", CSUM_MIN_BLOCK},
6027 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6028 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6029 {"pg_num_min", PG_NUM_MIN},
6030 {"target_size_bytes", TARGET_SIZE_BYTES},
6031 {"target_size_ratio", TARGET_SIZE_RATIO},
6032 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6033 };
6034
6035 typedef std::set<osd_pool_get_choices> choices_set_t;
6036
6037 const choices_set_t ONLY_TIER_CHOICES = {
6038 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6039 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6040 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6041 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6042 MIN_READ_RECENCY_FOR_PROMOTE,
6043 MIN_WRITE_RECENCY_FOR_PROMOTE,
6044 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6045 };
6046 const choices_set_t ONLY_ERASURE_CHOICES = {
6047 EC_OVERWRITES, ERASURE_CODE_PROFILE
6048 };
6049
6050 choices_set_t selected_choices;
6051 if (var == "all") {
6052 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6053 it != ALL_CHOICES.end(); ++it) {
6054 selected_choices.insert(it->second);
6055 }
6056
6057 if(!p->is_tier()) {
6058 selected_choices = subtract_second_from_first(selected_choices,
6059 ONLY_TIER_CHOICES);
6060 }
6061
6062 if(!p->is_erasure()) {
6063 selected_choices = subtract_second_from_first(selected_choices,
6064 ONLY_ERASURE_CHOICES);
6065 }
6066 } else /* var != "all" */ {
6067 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6068 osd_pool_get_choices selected = found->second;
6069
6070 if (!p->is_tier() &&
6071 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6072 ss << "pool '" << poolstr
6073 << "' is not a tier pool: variable not applicable";
6074 r = -EACCES;
6075 goto reply;
6076 }
6077
6078 if (!p->is_erasure() &&
6079 ONLY_ERASURE_CHOICES.find(selected)
6080 != ONLY_ERASURE_CHOICES.end()) {
6081 ss << "pool '" << poolstr
6082 << "' is not a erasure pool: variable not applicable";
6083 r = -EACCES;
6084 goto reply;
6085 }
6086
6087 if (pool_opts_t::is_opt_name(var) &&
6088 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6089 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6090 r = -ENOENT;
6091 goto reply;
6092 }
6093
6094 selected_choices.insert(selected);
6095 }
6096
6097 if (f) {
6098 f->open_object_section("pool");
6099 f->dump_string("pool", poolstr);
6100 f->dump_int("pool_id", pool);
6101 for(choices_set_t::const_iterator it = selected_choices.begin();
6102 it != selected_choices.end(); ++it) {
6103 choices_map_t::const_iterator i;
6104 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6105 if (i->second == *it) {
6106 break;
6107 }
6108 }
6109 ceph_assert(i != ALL_CHOICES.end());
6110 switch(*it) {
6111 case PG_NUM:
6112 f->dump_int("pg_num", p->get_pg_num());
6113 break;
6114 case PGP_NUM:
6115 f->dump_int("pgp_num", p->get_pgp_num());
6116 break;
6117 case SIZE:
6118 f->dump_int("size", p->get_size());
6119 break;
6120 case MIN_SIZE:
6121 f->dump_int("min_size", p->get_min_size());
6122 break;
6123 case CRUSH_RULE:
6124 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6125 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6126 p->get_crush_rule()));
6127 } else {
6128 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6129 }
6130 break;
6131 case EC_OVERWRITES:
6132 f->dump_bool("allow_ec_overwrites",
6133 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6134 break;
6135 case PG_AUTOSCALE_MODE:
6136 f->dump_string("pg_autoscale_mode",
6137 pg_pool_t::get_pg_autoscale_mode_name(
6138 p->pg_autoscale_mode));
6139 break;
6140 case HASHPSPOOL:
6141 case NODELETE:
6142 case NOPGCHANGE:
6143 case NOSIZECHANGE:
6144 case WRITE_FADVISE_DONTNEED:
6145 case NOSCRUB:
6146 case NODEEP_SCRUB:
6147 f->dump_bool(i->first.c_str(),
6148 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6149 break;
6150 case HIT_SET_PERIOD:
6151 f->dump_int("hit_set_period", p->hit_set_period);
6152 break;
6153 case HIT_SET_COUNT:
6154 f->dump_int("hit_set_count", p->hit_set_count);
6155 break;
6156 case HIT_SET_TYPE:
6157 f->dump_string("hit_set_type",
6158 HitSet::get_type_name(p->hit_set_params.get_type()));
6159 break;
6160 case HIT_SET_FPP:
6161 {
6162 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6163 BloomHitSet::Params *bloomp =
6164 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6165 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6166 } else if(var != "all") {
6167 f->close_section();
6168 ss << "hit set is not of type Bloom; " <<
6169 "invalid to get a false positive rate!";
6170 r = -EINVAL;
6171 goto reply;
6172 }
6173 }
6174 break;
6175 case USE_GMT_HITSET:
6176 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6177 break;
6178 case TARGET_MAX_OBJECTS:
6179 f->dump_unsigned("target_max_objects", p->target_max_objects);
6180 break;
6181 case TARGET_MAX_BYTES:
6182 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6183 break;
6184 case CACHE_TARGET_DIRTY_RATIO:
6185 f->dump_unsigned("cache_target_dirty_ratio_micro",
6186 p->cache_target_dirty_ratio_micro);
6187 f->dump_float("cache_target_dirty_ratio",
6188 ((float)p->cache_target_dirty_ratio_micro/1000000));
6189 break;
6190 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6191 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6192 p->cache_target_dirty_high_ratio_micro);
6193 f->dump_float("cache_target_dirty_high_ratio",
6194 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6195 break;
6196 case CACHE_TARGET_FULL_RATIO:
6197 f->dump_unsigned("cache_target_full_ratio_micro",
6198 p->cache_target_full_ratio_micro);
6199 f->dump_float("cache_target_full_ratio",
6200 ((float)p->cache_target_full_ratio_micro/1000000));
6201 break;
6202 case CACHE_MIN_FLUSH_AGE:
6203 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6204 break;
6205 case CACHE_MIN_EVICT_AGE:
6206 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6207 break;
6208 case ERASURE_CODE_PROFILE:
6209 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6210 break;
6211 case MIN_READ_RECENCY_FOR_PROMOTE:
6212 f->dump_int("min_read_recency_for_promote",
6213 p->min_read_recency_for_promote);
6214 break;
6215 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6216 f->dump_int("min_write_recency_for_promote",
6217 p->min_write_recency_for_promote);
6218 break;
6219 case FAST_READ:
6220 f->dump_int("fast_read", p->fast_read);
6221 break;
6222 case HIT_SET_GRADE_DECAY_RATE:
6223 f->dump_int("hit_set_grade_decay_rate",
6224 p->hit_set_grade_decay_rate);
6225 break;
6226 case HIT_SET_SEARCH_LAST_N:
6227 f->dump_int("hit_set_search_last_n",
6228 p->hit_set_search_last_n);
6229 break;
6230 case SCRUB_MIN_INTERVAL:
6231 case SCRUB_MAX_INTERVAL:
6232 case DEEP_SCRUB_INTERVAL:
6233 case RECOVERY_PRIORITY:
6234 case RECOVERY_OP_PRIORITY:
6235 case SCRUB_PRIORITY:
6236 case COMPRESSION_MODE:
6237 case COMPRESSION_ALGORITHM:
6238 case COMPRESSION_REQUIRED_RATIO:
6239 case COMPRESSION_MAX_BLOB_SIZE:
6240 case COMPRESSION_MIN_BLOB_SIZE:
6241 case CSUM_TYPE:
6242 case CSUM_MAX_BLOCK:
6243 case CSUM_MIN_BLOCK:
6244 case FINGERPRINT_ALGORITHM:
6245 case PG_NUM_MIN:
6246 case TARGET_SIZE_BYTES:
6247 case TARGET_SIZE_RATIO:
6248 case PG_AUTOSCALE_BIAS:
6249 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6250 if (p->opts.is_set(key)) {
6251 if(*it == CSUM_TYPE) {
6252 int64_t val;
6253 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6254 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6255 } else {
6256 p->opts.dump(i->first, f.get());
6257 }
6258 }
6259 break;
6260 }
6261 }
6262 f->close_section();
6263 f->flush(rdata);
6264 } else /* !f */ {
6265 for(choices_set_t::const_iterator it = selected_choices.begin();
6266 it != selected_choices.end(); ++it) {
6267 choices_map_t::const_iterator i;
6268 switch(*it) {
6269 case PG_NUM:
6270 ss << "pg_num: " << p->get_pg_num() << "\n";
6271 break;
6272 case PGP_NUM:
6273 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6274 break;
6275 case SIZE:
6276 ss << "size: " << p->get_size() << "\n";
6277 break;
6278 case MIN_SIZE:
6279 ss << "min_size: " << p->get_min_size() << "\n";
6280 break;
6281 case CRUSH_RULE:
6282 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6283 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6284 p->get_crush_rule()) << "\n";
6285 } else {
6286 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6287 }
6288 break;
6289 case PG_AUTOSCALE_MODE:
6290 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6291 p->pg_autoscale_mode) <<"\n";
6292 break;
6293 case HIT_SET_PERIOD:
6294 ss << "hit_set_period: " << p->hit_set_period << "\n";
6295 break;
6296 case HIT_SET_COUNT:
6297 ss << "hit_set_count: " << p->hit_set_count << "\n";
6298 break;
6299 case HIT_SET_TYPE:
6300 ss << "hit_set_type: " <<
6301 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6302 break;
6303 case HIT_SET_FPP:
6304 {
6305 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6306 BloomHitSet::Params *bloomp =
6307 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6308 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6309 } else if(var != "all") {
6310 ss << "hit set is not of type Bloom; " <<
6311 "invalid to get a false positive rate!";
6312 r = -EINVAL;
6313 goto reply;
6314 }
6315 }
6316 break;
6317 case USE_GMT_HITSET:
6318 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6319 break;
6320 case TARGET_MAX_OBJECTS:
6321 ss << "target_max_objects: " << p->target_max_objects << "\n";
6322 break;
6323 case TARGET_MAX_BYTES:
6324 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6325 break;
6326 case CACHE_TARGET_DIRTY_RATIO:
6327 ss << "cache_target_dirty_ratio: "
6328 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6329 break;
6330 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6331 ss << "cache_target_dirty_high_ratio: "
6332 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6333 break;
6334 case CACHE_TARGET_FULL_RATIO:
6335 ss << "cache_target_full_ratio: "
6336 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6337 break;
6338 case CACHE_MIN_FLUSH_AGE:
6339 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6340 break;
6341 case CACHE_MIN_EVICT_AGE:
6342 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6343 break;
6344 case ERASURE_CODE_PROFILE:
6345 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6346 break;
6347 case MIN_READ_RECENCY_FOR_PROMOTE:
6348 ss << "min_read_recency_for_promote: " <<
6349 p->min_read_recency_for_promote << "\n";
6350 break;
6351 case HIT_SET_GRADE_DECAY_RATE:
6352 ss << "hit_set_grade_decay_rate: " <<
6353 p->hit_set_grade_decay_rate << "\n";
6354 break;
6355 case HIT_SET_SEARCH_LAST_N:
6356 ss << "hit_set_search_last_n: " <<
6357 p->hit_set_search_last_n << "\n";
6358 break;
6359 case EC_OVERWRITES:
6360 ss << "allow_ec_overwrites: " <<
6361 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6362 "\n";
6363 break;
6364 case HASHPSPOOL:
6365 case NODELETE:
6366 case NOPGCHANGE:
6367 case NOSIZECHANGE:
6368 case WRITE_FADVISE_DONTNEED:
6369 case NOSCRUB:
6370 case NODEEP_SCRUB:
6371 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6372 if (i->second == *it)
6373 break;
6374 }
6375 ceph_assert(i != ALL_CHOICES.end());
6376 ss << i->first << ": " <<
6377 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6378 "true" : "false") << "\n";
6379 break;
6380 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6381 ss << "min_write_recency_for_promote: " <<
6382 p->min_write_recency_for_promote << "\n";
6383 break;
6384 case FAST_READ:
6385 ss << "fast_read: " << p->fast_read << "\n";
6386 break;
6387 case SCRUB_MIN_INTERVAL:
6388 case SCRUB_MAX_INTERVAL:
6389 case DEEP_SCRUB_INTERVAL:
6390 case RECOVERY_PRIORITY:
6391 case RECOVERY_OP_PRIORITY:
6392 case SCRUB_PRIORITY:
6393 case COMPRESSION_MODE:
6394 case COMPRESSION_ALGORITHM:
6395 case COMPRESSION_REQUIRED_RATIO:
6396 case COMPRESSION_MAX_BLOB_SIZE:
6397 case COMPRESSION_MIN_BLOB_SIZE:
6398 case CSUM_TYPE:
6399 case CSUM_MAX_BLOCK:
6400 case CSUM_MIN_BLOCK:
6401 case FINGERPRINT_ALGORITHM:
6402 case PG_NUM_MIN:
6403 case TARGET_SIZE_BYTES:
6404 case TARGET_SIZE_RATIO:
6405 case PG_AUTOSCALE_BIAS:
6406 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6407 if (i->second == *it)
6408 break;
6409 }
6410 ceph_assert(i != ALL_CHOICES.end());
6411 {
6412 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6413 if (p->opts.is_set(key)) {
6414 if(key == pool_opts_t::CSUM_TYPE) {
6415 int64_t val;
6416 p->opts.get(key, &val);
6417 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6418 } else {
6419 ss << i->first << ": " << p->opts.get(key) << "\n";
6420 }
6421 }
6422 }
6423 break;
6424 }
6425 rdata.append(ss.str());
6426 ss.str("");
6427 }
6428 }
6429 r = 0;
6430 } else if (prefix == "osd pool get-quota") {
6431 string pool_name;
6432 cmd_getval(cmdmap, "pool", pool_name);
6433
6434 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6435 if (poolid < 0) {
6436 ceph_assert(poolid == -ENOENT);
6437 ss << "unrecognized pool '" << pool_name << "'";
6438 r = -ENOENT;
6439 goto reply;
6440 }
6441 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6442 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6443 const object_stat_sum_t& sum = pstat->stats.sum;
6444 if (f) {
6445 f->open_object_section("pool_quotas");
6446 f->dump_string("pool_name", pool_name);
6447 f->dump_unsigned("pool_id", poolid);
6448 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6449 f->dump_int("current_num_objects", sum.num_objects);
6450 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6451 f->dump_int("current_num_bytes", sum.num_bytes);
6452 f->close_section();
6453 f->flush(rdata);
6454 } else {
6455 stringstream rs;
6456 rs << "quotas for pool '" << pool_name << "':\n"
6457 << " max objects: ";
6458 if (p->quota_max_objects == 0)
6459 rs << "N/A";
6460 else {
6461 rs << si_u_t(p->quota_max_objects) << " objects";
6462 rs << " (current num objects: " << sum.num_objects << " objects)";
6463 }
6464 rs << "\n"
6465 << " max bytes : ";
6466 if (p->quota_max_bytes == 0)
6467 rs << "N/A";
6468 else {
6469 rs << byte_u_t(p->quota_max_bytes);
6470 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6471 }
6472 rdata.append(rs.str());
6473 }
6474 rdata.append("\n");
6475 r = 0;
6476 } else if (prefix == "osd crush rule list" ||
6477 prefix == "osd crush rule ls") {
6478 if (f) {
6479 f->open_array_section("rules");
6480 osdmap.crush->list_rules(f.get());
6481 f->close_section();
6482 f->flush(rdata);
6483 } else {
6484 ostringstream ss;
6485 osdmap.crush->list_rules(&ss);
6486 rdata.append(ss.str());
6487 }
6488 } else if (prefix == "osd crush rule ls-by-class") {
6489 string class_name;
6490 cmd_getval(cmdmap, "class", class_name);
6491 if (class_name.empty()) {
6492 ss << "no class specified";
6493 r = -EINVAL;
6494 goto reply;
6495 }
6496 set<int> rules;
6497 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6498 if (r < 0) {
6499 ss << "failed to get rules by class '" << class_name << "'";
6500 goto reply;
6501 }
6502 if (f) {
6503 f->open_array_section("rules");
6504 for (auto &rule: rules) {
6505 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6506 }
6507 f->close_section();
6508 f->flush(rdata);
6509 } else {
6510 ostringstream rs;
6511 for (auto &rule: rules) {
6512 rs << osdmap.crush->get_rule_name(rule) << "\n";
6513 }
6514 rdata.append(rs.str());
6515 }
6516 } else if (prefix == "osd crush rule dump") {
6517 string name;
6518 cmd_getval(cmdmap, "name", name);
6519 string format;
6520 cmd_getval(cmdmap, "format", format);
6521 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6522 if (name == "") {
6523 f->open_array_section("rules");
6524 osdmap.crush->dump_rules(f.get());
6525 f->close_section();
6526 } else {
6527 int ruleno = osdmap.crush->get_rule_id(name);
6528 if (ruleno < 0) {
6529 ss << "unknown crush rule '" << name << "'";
6530 r = ruleno;
6531 goto reply;
6532 }
6533 osdmap.crush->dump_rule(ruleno, f.get());
6534 }
6535 ostringstream rs;
6536 f->flush(rs);
6537 rs << "\n";
6538 rdata.append(rs.str());
6539 } else if (prefix == "osd crush dump") {
6540 string format;
6541 cmd_getval(cmdmap, "format", format);
6542 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6543 f->open_object_section("crush_map");
6544 osdmap.crush->dump(f.get());
6545 f->close_section();
6546 ostringstream rs;
6547 f->flush(rs);
6548 rs << "\n";
6549 rdata.append(rs.str());
6550 } else if (prefix == "osd crush show-tunables") {
6551 string format;
6552 cmd_getval(cmdmap, "format", format);
6553 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6554 f->open_object_section("crush_map_tunables");
6555 osdmap.crush->dump_tunables(f.get());
6556 f->close_section();
6557 ostringstream rs;
6558 f->flush(rs);
6559 rs << "\n";
6560 rdata.append(rs.str());
6561 } else if (prefix == "osd crush tree") {
6562 string shadow;
6563 cmd_getval(cmdmap, "shadow", shadow);
6564 bool show_shadow = shadow == "--show-shadow";
6565 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6566 if (f) {
6567 f->open_object_section("crush_tree");
6568 osdmap.crush->dump_tree(nullptr,
6569 f.get(),
6570 osdmap.get_pool_names(),
6571 show_shadow);
6572 f->close_section();
6573 f->flush(rdata);
6574 } else {
6575 ostringstream ss;
6576 osdmap.crush->dump_tree(&ss,
6577 nullptr,
6578 osdmap.get_pool_names(),
6579 show_shadow);
6580 rdata.append(ss.str());
6581 }
6582 } else if (prefix == "osd crush ls") {
6583 string name;
6584 if (!cmd_getval(cmdmap, "node", name)) {
6585 ss << "no node specified";
6586 r = -EINVAL;
6587 goto reply;
6588 }
6589 if (!osdmap.crush->name_exists(name)) {
6590 ss << "node '" << name << "' does not exist";
6591 r = -ENOENT;
6592 goto reply;
6593 }
6594 int id = osdmap.crush->get_item_id(name);
6595 list<int> result;
6596 if (id >= 0) {
6597 result.push_back(id);
6598 } else {
6599 int num = osdmap.crush->get_bucket_size(id);
6600 for (int i = 0; i < num; ++i) {
6601 result.push_back(osdmap.crush->get_bucket_item(id, i));
6602 }
6603 }
6604 if (f) {
6605 f->open_array_section("items");
6606 for (auto i : result) {
6607 f->dump_string("item", osdmap.crush->get_item_name(i));
6608 }
6609 f->close_section();
6610 f->flush(rdata);
6611 } else {
6612 ostringstream ss;
6613 for (auto i : result) {
6614 ss << osdmap.crush->get_item_name(i) << "\n";
6615 }
6616 rdata.append(ss.str());
6617 }
6618 r = 0;
6619 } else if (prefix == "osd crush class ls") {
6620 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6621 f->open_array_section("crush_classes");
6622 for (auto i : osdmap.crush->class_name)
6623 f->dump_string("class", i.second);
6624 f->close_section();
6625 f->flush(rdata);
6626 } else if (prefix == "osd crush class ls-osd") {
6627 string name;
6628 cmd_getval(cmdmap, "class", name);
6629 set<int> osds;
6630 osdmap.crush->get_devices_by_class(name, &osds);
6631 if (f) {
6632 f->open_array_section("osds");
6633 for (auto &osd: osds)
6634 f->dump_int("osd", osd);
6635 f->close_section();
6636 f->flush(rdata);
6637 } else {
6638 bool first = true;
6639 for (auto &osd : osds) {
6640 if (!first)
6641 ds << "\n";
6642 first = false;
6643 ds << osd;
6644 }
6645 rdata.append(ds);
6646 }
6647 } else if (prefix == "osd crush get-device-class") {
6648 vector<string> idvec;
6649 cmd_getval(cmdmap, "ids", idvec);
6650 map<int, string> class_by_osd;
6651 for (auto& id : idvec) {
6652 ostringstream ts;
6653 long osd = parse_osd_id(id.c_str(), &ts);
6654 if (osd < 0) {
6655 ss << "unable to parse osd id:'" << id << "'";
6656 r = -EINVAL;
6657 goto reply;
6658 }
6659 auto device_class = osdmap.crush->get_item_class(osd);
6660 if (device_class)
6661 class_by_osd[osd] = device_class;
6662 else
6663 class_by_osd[osd] = ""; // no class
6664 }
6665 if (f) {
6666 f->open_array_section("osd_device_classes");
6667 for (auto& i : class_by_osd) {
6668 f->open_object_section("osd_device_class");
6669 f->dump_int("osd", i.first);
6670 f->dump_string("device_class", i.second);
6671 f->close_section();
6672 }
6673 f->close_section();
6674 f->flush(rdata);
6675 } else {
6676 if (class_by_osd.size() == 1) {
6677 // for single input, make a clean output
6678 ds << class_by_osd.begin()->second;
6679 } else {
6680 // note that we do not group osds by class here
6681 for (auto it = class_by_osd.begin();
6682 it != class_by_osd.end();
6683 it++) {
6684 ds << "osd." << it->first << ' ' << it->second;
6685 if (next(it) != class_by_osd.end())
6686 ds << '\n';
6687 }
6688 }
6689 rdata.append(ds);
6690 }
6691 } else if (prefix == "osd erasure-code-profile ls") {
6692 const auto &profiles = osdmap.get_erasure_code_profiles();
6693 if (f)
6694 f->open_array_section("erasure-code-profiles");
6695 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6696 if (f)
6697 f->dump_string("profile", i->first.c_str());
6698 else
6699 rdata.append(i->first + "\n");
6700 }
6701 if (f) {
6702 f->close_section();
6703 ostringstream rs;
6704 f->flush(rs);
6705 rs << "\n";
6706 rdata.append(rs.str());
6707 }
6708 } else if (prefix == "osd crush weight-set ls") {
6709 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6710 if (f) {
6711 f->open_array_section("weight_sets");
6712 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6713 f->dump_string("pool", "(compat)");
6714 }
6715 for (auto& i : osdmap.crush->choose_args) {
6716 if (i.first >= 0) {
6717 f->dump_string("pool", osdmap.get_pool_name(i.first));
6718 }
6719 }
6720 f->close_section();
6721 f->flush(rdata);
6722 } else {
6723 ostringstream rs;
6724 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6725 rs << "(compat)\n";
6726 }
6727 for (auto& i : osdmap.crush->choose_args) {
6728 if (i.first >= 0) {
6729 rs << osdmap.get_pool_name(i.first) << "\n";
6730 }
6731 }
6732 rdata.append(rs.str());
6733 }
6734 } else if (prefix == "osd crush weight-set dump") {
6735 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6736 "json-pretty"));
6737 osdmap.crush->dump_choose_args(f.get());
6738 f->flush(rdata);
6739 } else if (prefix == "osd erasure-code-profile get") {
6740 string name;
6741 cmd_getval(cmdmap, "name", name);
6742 if (!osdmap.has_erasure_code_profile(name)) {
6743 ss << "unknown erasure code profile '" << name << "'";
6744 r = -ENOENT;
6745 goto reply;
6746 }
6747 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6748 if (f)
6749 f->open_object_section("profile");
6750 for (map<string,string>::const_iterator i = profile.begin();
6751 i != profile.end();
6752 ++i) {
6753 if (f)
6754 f->dump_string(i->first.c_str(), i->second.c_str());
6755 else
6756 rdata.append(i->first + "=" + i->second + "\n");
6757 }
6758 if (f) {
6759 f->close_section();
6760 ostringstream rs;
6761 f->flush(rs);
6762 rs << "\n";
6763 rdata.append(rs.str());
6764 }
6765 } else if (prefix == "osd pool application get") {
6766 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6767 "json-pretty"));
6768 string pool_name;
6769 cmd_getval(cmdmap, "pool", pool_name);
6770 string app;
6771 cmd_getval(cmdmap, "app", app);
6772 string key;
6773 cmd_getval(cmdmap, "key", key);
6774
6775 if (pool_name.empty()) {
6776 // all
6777 f->open_object_section("pools");
6778 for (const auto &pool : osdmap.pools) {
6779 std::string name("<unknown>");
6780 const auto &pni = osdmap.pool_name.find(pool.first);
6781 if (pni != osdmap.pool_name.end())
6782 name = pni->second;
6783 f->open_object_section(name.c_str());
6784 for (auto &app_pair : pool.second.application_metadata) {
6785 f->open_object_section(app_pair.first.c_str());
6786 for (auto &kv_pair : app_pair.second) {
6787 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6788 }
6789 f->close_section();
6790 }
6791 f->close_section(); // name
6792 }
6793 f->close_section(); // pools
6794 f->flush(rdata);
6795 } else {
6796 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6797 if (pool < 0) {
6798 ss << "unrecognized pool '" << pool_name << "'";
6799 r = -ENOENT;
6800 goto reply;
6801 }
6802 auto p = osdmap.get_pg_pool(pool);
6803 // filter by pool
6804 if (app.empty()) {
6805 f->open_object_section(pool_name.c_str());
6806 for (auto &app_pair : p->application_metadata) {
6807 f->open_object_section(app_pair.first.c_str());
6808 for (auto &kv_pair : app_pair.second) {
6809 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6810 }
6811 f->close_section(); // application
6812 }
6813 f->close_section(); // pool_name
6814 f->flush(rdata);
6815 goto reply;
6816 }
6817
6818 auto app_it = p->application_metadata.find(app);
6819 if (app_it == p->application_metadata.end()) {
6820 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6821 r = -ENOENT;
6822 goto reply;
6823 }
6824 // filter by pool + app
6825 if (key.empty()) {
6826 f->open_object_section(app_it->first.c_str());
6827 for (auto &kv_pair : app_it->second) {
6828 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6829 }
6830 f->close_section(); // application
6831 f->flush(rdata);
6832 goto reply;
6833 }
6834 // filter by pool + app + key
6835 auto key_it = app_it->second.find(key);
6836 if (key_it == app_it->second.end()) {
6837 ss << "application '" << app << "' on pool '" << pool_name
6838 << "' does not have key '" << key << "'";
6839 r = -ENOENT;
6840 goto reply;
6841 }
6842 ss << key_it->second << "\n";
6843 rdata.append(ss.str());
6844 ss.str("");
6845 }
6846 } else if (prefix == "osd get-require-min-compat-client") {
6847 ss << osdmap.require_min_compat_client << std::endl;
6848 rdata.append(ss.str());
6849 ss.str("");
6850 goto reply;
6851 } else if (prefix == "osd pool application enable" ||
6852 prefix == "osd pool application disable" ||
6853 prefix == "osd pool application set" ||
6854 prefix == "osd pool application rm") {
6855 bool changed = false;
6856 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6857 if (r != 0) {
6858 // Error, reply.
6859 goto reply;
6860 } else if (changed) {
6861 // Valid mutation, proceed to prepare phase
6862 return false;
6863 } else {
6864 // Idempotent case, reply
6865 goto reply;
6866 }
6867 } else {
6868 // try prepare update
6869 return false;
6870 }
6871
6872 reply:
6873 string rs;
6874 getline(ss, rs);
6875 mon->reply_command(op, r, rs, rdata, get_last_committed());
6876 return true;
6877 }
6878
6879 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6880 {
6881 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6882 osdmap.get_pg_pool(pool_id));
6883 ceph_assert(pool);
6884 pool->set_flag(flags);
6885 }
6886
6887 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6888 {
6889 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6890 osdmap.get_pg_pool(pool_id));
6891 ceph_assert(pool);
6892 pool->unset_flag(flags);
6893 }
6894
6895 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6896 {
6897 char k[80];
6898 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6899 return k;
6900 }
6901
6902 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6903 {
6904 char k[80];
6905 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6906 (unsigned long long)pool, (unsigned long long)snap);
6907 return k;
6908 }
6909
// Encode the value for a purged-snap record covering [snap, snap+num)
// into *v, and return the store key it should be written under.
//
// @param pool   pool id
// @param snap   first snap in the purged interval
// @param num    number of snaps in the interval
// @param epoch  epoch at which the interval was purged (encoded in *v)
// @param v      [out] encoded value: begin snap, end snap, epoch
// @return the key for this record (keyed on the *last* snap; see below)
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // key on the *last* snap of the interval so that we can use forward
  // iteration only (lower_bound) to find the interval containing a snap.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
6921
6922
// Look up whether a pool's snap has already been recorded as purged in
// the mon store.  Records are keyed on the *last* snap of each purged
// interval (see make_purged_snap_key_value), so a lower_bound on the key
// for 'snap' lands on the only record whose interval could contain it.
//
// @param pool   pool id
// @param snap   snap id to look for
// @param begin  [out] start of the purged interval containing 'snap'
// @param end    [out] end (exclusive) of that interval
// @return 0 if 'snap' lies inside a recorded purged interval (and
//         *begin/*end are filled in), -ENOENT otherwise
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  // lower_bound may have run off the end of the purged_snap_* records
  // into keys with a different prefix
  if (it->key().find("purged_snap_") != 0) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we landed on; it may belong
  // to the next pool rather than the one we asked about
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value encodes [begin, end) of the purged interval
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
6972
// Record the interval [start, end) of a pool's snaps as purged in the
// given transaction, coalescing with any immediately adjacent purged
// intervals already in the store so records stay maximal.
//
// NOTE(review): the 'epoch' argument is never used below; the value
// encoded into the record is pending_inc.epoch — confirm intent.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // is there an existing purged interval ending right where ours starts?
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // ...and/or one beginning right where ours ends?
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // adjacent on both sides: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // adjacent only on the left: extend the earlier record forward
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // adjacent only on the right: extend the later record backward.
    // records are keyed on their last snap, so the new record lands on
    // the same key and simply overwrites the old one.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no neighbors: write a fresh record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7028
// Move snap intervals that the OSDs (via the mgr stat digest) report as
// purged into pending_inc.new_purged_snaps, at most
// mon_max_snap_prune_per_epoch snaps per epoch.
//
// @return true if anything was queued for pruning in the pending map
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // a configured value of 0 falls back to a large fixed batch
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7107
// Compare each pool's usage (from the mgr stat digest) against its quota
// and set or clear the FULL_QUOTA/FULL flags in the pending map
// accordingly.
//
// @return true if any pool flags were changed in pending_inc
bool OSDMonitor::update_pools_status()
{
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is full when either configured quota (bytes or objects)
    // has been reached; a quota of 0 means "no quota"
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked full-by-quota; clear if it has drained
      if (pool_is_full)
	continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently marked; set flags if it has filled up
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7168
7169 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7170 {
7171 op->mark_osdmon_event(__func__);
7172 auto m = op->get_req<MPoolOp>();
7173 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7174 MonSession *session = op->get_session();
7175 if (!session)
7176 return -EPERM;
7177 string erasure_code_profile;
7178 stringstream ss;
7179 string rule_name;
7180 int ret = 0;
7181 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7182 0, 0, 0, 0, 0, 0.0,
7183 erasure_code_profile,
7184 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7185 &ss);
7186
7187 if (ret < 0) {
7188 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7189 }
7190 return ret;
7191 }
7192
7193 int OSDMonitor::crush_rename_bucket(const string& srcname,
7194 const string& dstname,
7195 ostream *ss)
7196 {
7197 int ret;
7198 //
7199 // Avoid creating a pending crush if it does not already exists and
7200 // the rename would fail.
7201 //
7202 if (!_have_pending_crush()) {
7203 ret = _get_stable_crush().can_rename_bucket(srcname,
7204 dstname,
7205 ss);
7206 if (ret)
7207 return ret;
7208 }
7209
7210 CrushWrapper newcrush;
7211 _get_pending_crush(newcrush);
7212
7213 ret = newcrush.rename_bucket(srcname,
7214 dstname,
7215 ss);
7216 if (ret)
7217 return ret;
7218
7219 pending_inc.crush.clear();
7220 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7221 *ss << "renamed bucket " << srcname << " into " << dstname;
7222 return 0;
7223 }
7224
7225 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7226 {
7227 string replacement = "";
7228
7229 if (plugin == "jerasure_generic" ||
7230 plugin == "jerasure_sse3" ||
7231 plugin == "jerasure_sse4" ||
7232 plugin == "jerasure_neon") {
7233 replacement = "jerasure";
7234 } else if (plugin == "shec_generic" ||
7235 plugin == "shec_sse3" ||
7236 plugin == "shec_sse4" ||
7237 plugin == "shec_neon") {
7238 replacement = "shec";
7239 }
7240
7241 if (replacement != "") {
7242 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7243 << plugin << " that has been deprecated. Please use "
7244 << replacement << " instead." << dendl;
7245 }
7246 }
7247
7248 int OSDMonitor::normalize_profile(const string& profilename,
7249 ErasureCodeProfile &profile,
7250 bool force,
7251 ostream *ss)
7252 {
7253 ErasureCodeInterfaceRef erasure_code;
7254 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7255 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7256 check_legacy_ec_plugin(plugin->second, profilename);
7257 int err = instance.factory(plugin->second,
7258 g_conf().get_val<std::string>("erasure_code_dir"),
7259 profile, &erasure_code, ss);
7260 if (err) {
7261 return err;
7262 }
7263
7264 err = erasure_code->init(profile, ss);
7265 if (err) {
7266 return err;
7267 }
7268
7269 auto it = profile.find("stripe_unit");
7270 if (it != profile.end()) {
7271 string err_str;
7272 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7273 if (!err_str.empty()) {
7274 *ss << "could not parse stripe_unit '" << it->second
7275 << "': " << err_str << std::endl;
7276 return -EINVAL;
7277 }
7278 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7279 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7280 if (chunk_size != stripe_unit) {
7281 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7282 << "alignment. Would be padded to " << chunk_size
7283 << std::endl;
7284 return -EINVAL;
7285 }
7286 if ((stripe_unit % 4096) != 0 && !force) {
7287 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7288 << "use --force to override this check" << std::endl;
7289 return -EINVAL;
7290 }
7291 }
7292 return 0;
7293 }
7294
// Create (or find) the crush rule for an erasure coded pool based on an
// erasure code profile.
//
// Return values reflect where the rule was found or created:
//  -EEXIST   rule already exists in the committed crush map; *rule set
//  -EALREADY rule exists in the pending crush map; *rule set, caller
//            must wait for the pending map to be proposed
//   0        rule created in the pending crush map; *rule set
//  <0        other error (e.g. the profile's plugin failed to load)
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already committed
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    // pending, not yet proposed
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // let the plugin build a rule appropriate for its layout
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    return 0;
  }
}
7331
// Instantiate the erasure code implementation for a named profile from
// the committed osdmap.
//
// @param erasure_code_profile  profile name to look up
// @param erasure_code          [out] instantiated plugin interface
// @param ss                    human readable error message, if any
// @return 0 on success; -EAGAIN if the profile is still pending and the
//         caller should retry after it commits; negative errno otherwise
int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
				 ErasureCodeInterfaceRef *erasure_code,
				 ostream *ss) const
{
  // a profile staged in pending_inc is not usable until it commits
  if (pending_inc.has_erasure_code_profile(erasure_code_profile))
    return -EAGAIN;
  ErasureCodeProfile profile =
    osdmap.get_erasure_code_profile(erasure_code_profile);
  ErasureCodeProfile::const_iterator plugin =
    profile.find("plugin");
  if (plugin == profile.end()) {
    *ss << "cannot determine the erasure code plugin"
	<< " because there is no 'plugin' entry in the erasure_code_profile "
	<< profile << std::endl;
    return -EINVAL;
  }
  check_legacy_ec_plugin(plugin->second, erasure_code_profile);
  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
  return instance.factory(plugin->second,
			  g_conf().get_val<std::string>("erasure_code_dir"),
			  profile, erasure_code, ss);
}
7354
7355 int OSDMonitor::check_cluster_features(uint64_t features,
7356 stringstream &ss)
7357 {
7358 stringstream unsupported_ss;
7359 int unsupported_count = 0;
7360 if ((mon->get_quorum_con_features() & features) != features) {
7361 unsupported_ss << "the monitor cluster";
7362 ++unsupported_count;
7363 }
7364
7365 set<int32_t> up_osds;
7366 osdmap.get_up_osds(up_osds);
7367 for (set<int32_t>::iterator it = up_osds.begin();
7368 it != up_osds.end(); ++it) {
7369 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7370 if ((xi.features & features) != features) {
7371 if (unsupported_count > 0)
7372 unsupported_ss << ", ";
7373 unsupported_ss << "osd." << *it;
7374 unsupported_count ++;
7375 }
7376 }
7377
7378 if (unsupported_count > 0) {
7379 ss << "features " << features << " unsupported by: "
7380 << unsupported_ss.str();
7381 return -ENOTSUP;
7382 }
7383
7384 // check pending osd state, too!
7385 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7386 pending_inc.new_xinfo.begin();
7387 p != pending_inc.new_xinfo.end(); ++p) {
7388 const osd_xinfo_t &xi = p->second;
7389 if ((xi.features & features) != features) {
7390 dout(10) << __func__ << " pending osd." << p->first
7391 << " features are insufficient; retry" << dendl;
7392 return -EAGAIN;
7393 }
7394 }
7395
7396 return 0;
7397 }
7398
// Check whether a proposed crush map can be applied without requiring
// features beyond what connected clients (require_min_compat_client)
// and the mon/osd cluster support.
//
// @param newcrush  the candidate crush map
// @param ss        human readable reason when rejected
// @return true if the crush map is acceptable, false otherwise
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // build a scratch osdmap with the candidate crush map applied
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7432
7433 bool OSDMonitor::erasure_code_profile_in_use(
7434 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7435 const string &profile,
7436 ostream *ss)
7437 {
7438 bool found = false;
7439 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7440 p != pools.end();
7441 ++p) {
7442 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7443 *ss << osdmap.pool_name[p->first] << " ";
7444 found = true;
7445 }
7446 }
7447 if (found) {
7448 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7449 }
7450 return found;
7451 }
7452
// Build an erasure code profile map from "key=value" tokens supplied on
// the command line, layered on top of the configured default profile.
//
// If the user selects a plugin different from the default profile's
// plugin, the default entries are discarded entirely and only the
// user-supplied pairs are kept (mixing defaults from one plugin with
// another plugin's settings would be meaningless).
//
// @param erasure_code_profile       "key=value" (or bare "key") tokens
// @param erasure_code_profile_map   [out] resulting profile map
// @param ss                         human readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // start from the configured default profile
  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
				    get_json_str_map,
				    *ss,
				    erasure_code_profile_map,
				    true);
  if (r)
    return r;
  ceph_assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // bare key with no '=': record it with an empty value
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      const string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      // the old "ruleset-*" property names were renamed to "crush-*"
      if (key.find("ruleset-") == 0) {
	*ss << "property '" << key << "' is no longer supported; try "
	    << "'crush-" << key.substr(8) << "' instead";
	return -EINVAL;
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // different plugin than the default: drop the default entries
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
7493
7494 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7495 const string &erasure_code_profile,
7496 uint8_t repl_size,
7497 unsigned *size, unsigned *min_size,
7498 ostream *ss)
7499 {
7500 int err = 0;
7501 switch (pool_type) {
7502 case pg_pool_t::TYPE_REPLICATED:
7503 if (repl_size == 0) {
7504 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7505 }
7506 *size = repl_size;
7507 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7508 break;
7509 case pg_pool_t::TYPE_ERASURE:
7510 {
7511 ErasureCodeInterfaceRef erasure_code;
7512 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7513 if (err == 0) {
7514 *size = erasure_code->get_chunk_count();
7515 *min_size =
7516 erasure_code->get_data_chunk_count() +
7517 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7518 assert(*min_size <= *size);
7519 assert(*min_size >= erasure_code->get_data_chunk_count());
7520 }
7521 }
7522 break;
7523 default:
7524 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7525 err = -EINVAL;
7526 break;
7527 }
7528 return err;
7529 }
7530
// Compute the stripe width for a new pool.  Replicated pools ignore it;
// for erasure pools it is data_chunks * chunk_size, where the stripe
// unit comes from the profile (if set) or the configured default.
//
// @param pool_type             TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile consulted for erasure pools
// @param stripe_width          [out] computed stripe width
// @param ss                    human readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      // profile overrides the config default; the value was validated
      // when the profile was stored, hence the assert on parse success
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7569
// Resolve the crush rule for a new pool.
//
// If *crush_rule is already >= 0, just verify it exists.  Otherwise,
// for replicated pools use the named rule (or the configured default
// when the name is empty); for erasure pools create or find a rule from
// the erasure code profile, which may require waiting (-EAGAIN) for a
// pending crush change to commit.
//
// @param pool_type             TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile used to create erasure rules
// @param rule_name             requested rule name, may be empty
// @param crush_rule            [in,out] rule id; <0 means "resolve it"
// @param ss                    human readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
                << "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// map crush_rule_create_erasure's three success-ish outcomes:
	// 0/-EALREADY mean the rule is only pending, so retry later;
	// -EEXIST means it is committed and usable now.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7631
7632 int OSDMonitor::get_crush_rule(const string &rule_name,
7633 int *crush_rule,
7634 ostream *ss)
7635 {
7636 int ret;
7637 ret = osdmap.crush->get_rule_id(rule_name);
7638 if (ret != -ENOENT) {
7639 // found it, use it
7640 *crush_rule = ret;
7641 } else {
7642 CrushWrapper newcrush;
7643 _get_pending_crush(newcrush);
7644
7645 ret = newcrush.get_rule_id(rule_name);
7646 if (ret != -ENOENT) {
7647 // found it, wait for it to be proposed
7648 dout(20) << __func__ << ": rule " << rule_name
7649 << " try again" << dendl;
7650 return -EAGAIN;
7651 } else {
7652 // Cannot find it , return error
7653 *ss << "specified rule " << rule_name << " doesn't exist";
7654 return ret;
7655 }
7656 }
7657 return 0;
7658 }
7659
7660 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7661 {
7662 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7663 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7664 auto max_pgs = max_pgs_per_osd * num_osds;
7665 uint64_t projected = 0;
7666 if (pool < 0) {
7667 projected += pg_num * size;
7668 }
7669 for (const auto& i : osdmap.get_pools()) {
7670 if (i.first == pool) {
7671 projected += pg_num * size;
7672 } else {
7673 projected += i.second.get_pg_num_target() * i.second.get_size();
7674 }
7675 }
7676 if (projected > max_pgs) {
7677 if (pool >= 0) {
7678 *ss << "pool id " << pool;
7679 }
7680 *ss << " pg_num " << pg_num << " size " << size
7681 << " would mean " << projected
7682 << " total pgs, which exceeds max " << max_pgs
7683 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7684 << " * num_in_osds " << num_osds << ")";
7685 return -ERANGE;
7686 }
7687 return 0;
7688 }
7689
7690 /**
7691 * @param name The name of the new pool
7692 * @param crush_rule The crush rule to use. If <0, will use the system default
7693 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7694 * @param pg_num The pg_num to use. If set to 0, will use the system default
7695 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7696 * @param repl_size Replication factor, or 0 for default
7697 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7698 * @param pool_type TYPE_ERASURE, or TYPE_REP
7699 * @param expected_num_objects expected number of objects on the pool
7700 * @param fast_read fast read type.
7701 * @param ss human readable error message, if any.
7702 *
7703 * @return 0 on success, negative errno on failure.
7704 */
7705 int OSDMonitor::prepare_new_pool(string& name,
7706 int crush_rule,
7707 const string &crush_rule_name,
7708 unsigned pg_num, unsigned pgp_num,
7709 unsigned pg_num_min,
7710 const uint64_t repl_size,
7711 const uint64_t target_size_bytes,
7712 const float target_size_ratio,
7713 const string &erasure_code_profile,
7714 const unsigned pool_type,
7715 const uint64_t expected_num_objects,
7716 FastReadType fast_read,
7717 const string& pg_autoscale_mode,
7718 ostream *ss)
7719 {
7720 if (name.length() == 0)
7721 return -EINVAL;
7722 if (pg_num == 0)
7723 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
7724 if (pgp_num == 0)
7725 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
7726 if (!pgp_num)
7727 pgp_num = pg_num;
7728 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
7729 *ss << "'pg_num' must be greater than 0 and less than or equal to "
7730 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
7731 << " (you may adjust 'mon max pool pg num' for higher values)";
7732 return -ERANGE;
7733 }
7734 if (pgp_num > pg_num) {
7735 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7736 << ", which in this case is " << pg_num;
7737 return -ERANGE;
7738 }
7739 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
7740 *ss << "'fast_read' can only apply to erasure coding pool";
7741 return -EINVAL;
7742 }
7743 int r;
7744 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
7745 crush_rule_name, &crush_rule, ss);
7746 if (r) {
7747 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7748 return r;
7749 }
7750 if (g_conf()->mon_osd_crush_smoke_test) {
7751 CrushWrapper newcrush;
7752 _get_pending_crush(newcrush);
7753 ostringstream err;
7754 CrushTester tester(newcrush, err);
7755 tester.set_min_x(0);
7756 tester.set_max_x(50);
7757 tester.set_rule(crush_rule);
7758 auto start = ceph::coarse_mono_clock::now();
7759 r = tester.test_with_fork(g_conf()->mon_lease);
7760 auto duration = ceph::coarse_mono_clock::now() - start;
7761 if (r < 0) {
7762 dout(10) << "tester.test_with_fork returns " << r
7763 << ": " << err.str() << dendl;
7764 *ss << "crush test failed with " << r << ": " << err.str();
7765 return r;
7766 }
7767 dout(10) << __func__ << " crush smoke test duration: "
7768 << duration << dendl;
7769 }
7770 unsigned size, min_size;
7771 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
7772 &size, &min_size, ss);
7773 if (r) {
7774 dout(10) << "prepare_pool_size returns " << r << dendl;
7775 return r;
7776 }
7777 r = check_pg_num(-1, pg_num, size, ss);
7778 if (r) {
7779 dout(10) << "check_pg_num returns " << r << dendl;
7780 return r;
7781 }
7782
7783 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7784 return -EINVAL;
7785 }
7786
7787 uint32_t stripe_width = 0;
7788 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
7789 if (r) {
7790 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7791 return r;
7792 }
7793
7794 bool fread = false;
7795 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7796 switch (fast_read) {
7797 case FAST_READ_OFF:
7798 fread = false;
7799 break;
7800 case FAST_READ_ON:
7801 fread = true;
7802 break;
7803 case FAST_READ_DEFAULT:
7804 fread = g_conf()->osd_pool_default_ec_fast_read;
7805 break;
7806 default:
7807 *ss << "invalid fast_read setting: " << fast_read;
7808 return -EINVAL;
7809 }
7810 }
7811
7812 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
7813 p != pending_inc.new_pool_names.end();
7814 ++p) {
7815 if (p->second == name)
7816 return 0;
7817 }
7818
7819 if (-1 == pending_inc.new_pool_max)
7820 pending_inc.new_pool_max = osdmap.pool_max;
7821 int64_t pool = ++pending_inc.new_pool_max;
7822 pg_pool_t empty;
7823 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
7824 pi->create_time = ceph_clock_now();
7825 pi->type = pool_type;
7826 pi->fast_read = fread;
7827 pi->flags = g_conf()->osd_pool_default_flags;
7828 if (g_conf()->osd_pool_default_flag_hashpspool)
7829 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
7830 if (g_conf()->osd_pool_default_flag_nodelete)
7831 pi->set_flag(pg_pool_t::FLAG_NODELETE);
7832 if (g_conf()->osd_pool_default_flag_nopgchange)
7833 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
7834 if (g_conf()->osd_pool_default_flag_nosizechange)
7835 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
7836 pi->set_flag(pg_pool_t::FLAG_CREATING);
7837 if (g_conf()->osd_pool_use_gmt_hitset)
7838 pi->use_gmt_hitset = true;
7839 else
7840 pi->use_gmt_hitset = false;
7841
7842 pi->size = size;
7843 pi->min_size = min_size;
7844 pi->crush_rule = crush_rule;
7845 pi->expected_num_objects = expected_num_objects;
7846 pi->object_hash = CEPH_STR_HASH_RJENKINS;
7847
7848 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7849 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7850 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7851 pi->pg_autoscale_mode = m;
7852 } else {
7853 pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
7854 }
7855 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7856 pi->set_pg_num(
7857 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7858 : pg_num);
7859 pi->set_pg_num_pending(pi->get_pg_num());
7860 pi->set_pg_num_target(pg_num);
7861 pi->set_pgp_num(pi->get_pg_num());
7862 pi->set_pgp_num_target(pgp_num);
7863 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7864 pg_num_min) {
7865 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7866 }
7867 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7868 pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7869 pi->pg_autoscale_mode = m;
7870 }
7871
7872 pi->last_change = pending_inc.epoch;
7873 pi->auid = 0;
7874
7875 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7876 pi->erasure_code_profile = erasure_code_profile;
7877 } else {
7878 pi->erasure_code_profile = "";
7879 }
7880 pi->stripe_width = stripe_width;
7881
7882 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7883 target_size_bytes) {
7884 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7885 // larger than int32_t max.
7886 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7887 }
7888 if (target_size_ratio > 0.0 &&
7889 osdmap.require_osd_release >= ceph_release_t::nautilus) {
7890 // only store for nautilus+, just to be consistent and tidy.
7891 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7892 }
7893
7894 pi->cache_target_dirty_ratio_micro =
7895 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7896 pi->cache_target_dirty_high_ratio_micro =
7897 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7898 pi->cache_target_full_ratio_micro =
7899 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7900 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7901 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7902
7903 pending_inc.new_pool_names[pool] = name;
7904 return 0;
7905 }
7906
7907 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7908 {
7909 op->mark_osdmon_event(__func__);
7910 ostringstream ss;
7911 if (pending_inc.new_flags < 0)
7912 pending_inc.new_flags = osdmap.get_flags();
7913 pending_inc.new_flags |= flag;
7914 ss << OSDMap::get_flag_string(flag) << " is set";
7915 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7916 get_last_committed() + 1));
7917 return true;
7918 }
7919
7920 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7921 {
7922 op->mark_osdmon_event(__func__);
7923 ostringstream ss;
7924 if (pending_inc.new_flags < 0)
7925 pending_inc.new_flags = osdmap.get_flags();
7926 pending_inc.new_flags &= ~flag;
7927 ss << OSDMap::get_flag_string(flag) << " is unset";
7928 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7929 get_last_committed() + 1));
7930 return true;
7931 }
7932
/**
 * Handle "osd pool set <pool> <var> <val>": update a single field of an
 * existing pool and stage the result in the pending incremental map.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val")
 * @param ss human-readable success or error message for the client
 * @return 0 on success (pool staged in pending_inc.new_pools),
 *         negative errno on error
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // start from the committed pool, but fold in any change already staged
  // in this proposal so consecutive commands compose correctly.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor).  parse out int or float values from the
  // string as needed.  however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;  // parse errors; checked per-variable below
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  cmd_getval(cmdmap, "val", val);

  // variables parsed with SI suffixes (base 1000) vs IEC suffixes (base 1024)
  auto si_options = {
    "target_max_objects"
  };
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // cache-tier-only variables are rejected on pools that are not tiers
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    // verify the per-osd pg limit would not be exceeded at the new size
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // min_size is re-derived from the new size
    p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // for EC pools, min_size must cover at least the data chunks (k)
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // directly adjust the concrete pg_num (normally the mgr drives this
    // toward pg_num_target)
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      // split
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // merge: staged via pg_num_pending, nautilus+ only
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    // set the *target* pg_num; on nautilus+ the mgr converges pg_num_actual
    // and pgp_num toward it gradually
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    // directly adjust the concrete pgp_num
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    // set the *target* pgp_num
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // like the flags above, but changing it remaps every PG, so require
    // explicit confirmation
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // one-way switch: can only be enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // one-way switch: overwrites cannot be disabled once enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // stored in micro units (uf = f * 1e6, computed above)
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // generic pool option, stored in p.opts; a value of "unset" (or 0 for
    // int/double options) removes the option.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
	ss << "error parsing unit value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "must set require_osd_release to nautilus or "
	   << "later before setting target_size_bytes";
	return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // after per-option validation, store/unset according to the option's
    // declared type
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending incremental
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8578
8579 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8580 const cmdmap_t& cmdmap,
8581 stringstream& ss)
8582 {
8583 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8584 }
8585
8586 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8587 const cmdmap_t& cmdmap,
8588 stringstream& ss,
8589 bool *modified)
8590 {
8591 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8592 }
8593
8594
8595 /**
8596 * Common logic for preprocess and prepare phases of pool application
8597 * tag commands. In preprocess mode we're only detecting invalid
8598 * commands, and determining whether it was a modification or a no-op.
8599 * In prepare mode we're actually updating the pending state.
8600 */
8601 int OSDMonitor::_command_pool_application(const string &prefix,
8602 const cmdmap_t& cmdmap,
8603 stringstream& ss,
8604 bool *modified,
8605 bool preparing)
8606 {
8607 string pool_name;
8608 cmd_getval(cmdmap, "pool", pool_name);
8609 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8610 if (pool < 0) {
8611 ss << "unrecognized pool '" << pool_name << "'";
8612 return -ENOENT;
8613 }
8614
8615 pg_pool_t p = *osdmap.get_pg_pool(pool);
8616 if (preparing) {
8617 if (pending_inc.new_pools.count(pool)) {
8618 p = pending_inc.new_pools[pool];
8619 }
8620 }
8621
8622 string app;
8623 cmd_getval(cmdmap, "app", app);
8624 bool app_exists = (p.application_metadata.count(app) > 0);
8625
8626 string key;
8627 cmd_getval(cmdmap, "key", key);
8628 if (key == "all") {
8629 ss << "key cannot be 'all'";
8630 return -EINVAL;
8631 }
8632
8633 string value;
8634 cmd_getval(cmdmap, "value", value);
8635 if (value == "all") {
8636 ss << "value cannot be 'all'";
8637 return -EINVAL;
8638 }
8639
8640 if (boost::algorithm::ends_with(prefix, "enable")) {
8641 if (app.empty()) {
8642 ss << "application name must be provided";
8643 return -EINVAL;
8644 }
8645
8646 if (p.is_tier()) {
8647 ss << "application must be enabled on base tier";
8648 return -EINVAL;
8649 }
8650
8651 bool force = false;
8652 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8653
8654 if (!app_exists && !p.application_metadata.empty() && !force) {
8655 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8656 << "application; pass --yes-i-really-mean-it to proceed anyway";
8657 return -EPERM;
8658 }
8659
8660 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8661 ss << "too many enabled applications on pool '" << pool_name << "'; "
8662 << "max " << MAX_POOL_APPLICATIONS;
8663 return -EINVAL;
8664 }
8665
8666 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8667 ss << "application name '" << app << "' too long; max length "
8668 << MAX_POOL_APPLICATION_LENGTH;
8669 return -EINVAL;
8670 }
8671
8672 if (!app_exists) {
8673 p.application_metadata[app] = {};
8674 }
8675 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8676
8677 } else if (boost::algorithm::ends_with(prefix, "disable")) {
8678 bool force = false;
8679 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8680
8681 if (!force) {
8682 ss << "Are you SURE? Disabling an application within a pool might result "
8683 << "in loss of application functionality; pass "
8684 << "--yes-i-really-mean-it to proceed anyway";
8685 return -EPERM;
8686 }
8687
8688 if (!app_exists) {
8689 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8690 << "'";
8691 return 0; // idempotent
8692 }
8693
8694 p.application_metadata.erase(app);
8695 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8696
8697 } else if (boost::algorithm::ends_with(prefix, "set")) {
8698 if (p.is_tier()) {
8699 ss << "application metadata must be set on base tier";
8700 return -EINVAL;
8701 }
8702
8703 if (!app_exists) {
8704 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8705 << "'";
8706 return -ENOENT;
8707 }
8708
8709 string key;
8710 cmd_getval(cmdmap, "key", key);
8711
8712 if (key.empty()) {
8713 ss << "key must be provided";
8714 return -EINVAL;
8715 }
8716
8717 auto &app_keys = p.application_metadata[app];
8718 if (app_keys.count(key) == 0 &&
8719 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8720 ss << "too many keys set for application '" << app << "' on pool '"
8721 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8722 return -EINVAL;
8723 }
8724
8725 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8726 ss << "key '" << app << "' too long; max length "
8727 << MAX_POOL_APPLICATION_LENGTH;
8728 return -EINVAL;
8729 }
8730
8731 string value;
8732 cmd_getval(cmdmap, "value", value);
8733 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
8734 ss << "value '" << value << "' too long; max length "
8735 << MAX_POOL_APPLICATION_LENGTH;
8736 return -EINVAL;
8737 }
8738
8739 p.application_metadata[app][key] = value;
8740 ss << "set application '" << app << "' key '" << key << "' to '"
8741 << value << "' on pool '" << pool_name << "'";
8742 } else if (boost::algorithm::ends_with(prefix, "rm")) {
8743 if (!app_exists) {
8744 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8745 << "'";
8746 return -ENOENT;
8747 }
8748
8749 string key;
8750 cmd_getval(cmdmap, "key", key);
8751 auto it = p.application_metadata[app].find(key);
8752 if (it == p.application_metadata[app].end()) {
8753 ss << "application '" << app << "' on pool '" << pool_name
8754 << "' does not have key '" << key << "'";
8755 return 0; // idempotent
8756 }
8757
8758 p.application_metadata[app].erase(it);
8759 ss << "removed application '" << app << "' key '" << key << "' on pool '"
8760 << pool_name << "'";
8761 } else {
8762 ceph_abort();
8763 }
8764
8765 if (preparing) {
8766 p.last_change = pending_inc.epoch;
8767 pending_inc.new_pools[pool] = p;
8768 }
8769
8770 // Because we fell through this far, we didn't hit no-op cases,
8771 // so pool was definitely modified
8772 if (modified != nullptr) {
8773 *modified = true;
8774 }
8775
8776 return 0;
8777 }
8778
8779 int OSDMonitor::_prepare_command_osd_crush_remove(
8780 CrushWrapper &newcrush,
8781 int32_t id,
8782 int32_t ancestor,
8783 bool has_ancestor,
8784 bool unlink_only)
8785 {
8786 int err = 0;
8787
8788 if (has_ancestor) {
8789 err = newcrush.remove_item_under(cct, id, ancestor,
8790 unlink_only);
8791 } else {
8792 err = newcrush.remove_item(cct, id, unlink_only);
8793 }
8794 return err;
8795 }
8796
// Stage the (already-modified) crush map into the pending incremental,
// replacing any crush payload previously staged in this proposal.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  // encode with the quorum's feature set so all monitors can decode it
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8802
8803 int OSDMonitor::prepare_command_osd_crush_remove(
8804 CrushWrapper &newcrush,
8805 int32_t id,
8806 int32_t ancestor,
8807 bool has_ancestor,
8808 bool unlink_only)
8809 {
8810 int err = _prepare_command_osd_crush_remove(
8811 newcrush, id, ancestor,
8812 has_ancestor, unlink_only);
8813
8814 if (err < 0)
8815 return err;
8816
8817 ceph_assert(err == 0);
8818 do_osd_crush_remove(newcrush);
8819
8820 return 0;
8821 }
8822
8823 int OSDMonitor::prepare_command_osd_remove(int32_t id)
8824 {
8825 if (osdmap.is_up(id)) {
8826 return -EBUSY;
8827 }
8828
8829 pending_inc.new_state[id] = osdmap.get_state(id);
8830 pending_inc.new_uuid[id] = uuid_d();
8831 pending_metadata_rm.insert(id);
8832 pending_metadata.erase(id);
8833
8834 return 0;
8835 }
8836
8837 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8838 {
8839 ceph_assert(existing_id);
8840 *existing_id = -1;
8841
8842 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8843 if (!osdmap.exists(i) &&
8844 pending_inc.new_up_client.count(i) == 0 &&
8845 (pending_inc.new_state.count(i) == 0 ||
8846 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8847 *existing_id = i;
8848 return -1;
8849 }
8850 }
8851
8852 if (pending_inc.new_max_osd < 0) {
8853 return osdmap.get_max_osd();
8854 }
8855 return pending_inc.new_max_osd;
8856 }
8857
// Stage the creation of an osd in the pending incremental, either reusing
// the id bound to `uuid`, honoring a caller-supplied `id`, or allocating a
// fresh one. Validation is presumed to have happened already (see
// validate_osd_create()); invariants are enforced with asserts.
// On return *new_id holds the id used. A non-empty device_class is also
// staged into the pending crush map (failures there are non-fatal, for
// replay idempotency).
8858 void OSDMonitor::do_osd_create(
8859 const int32_t id,
8860 const uuid_d& uuid,
8861 const string& device_class,
8862 int32_t* new_id)
8863 {
8864 dout(10) << __func__ << " uuid " << uuid << dendl;
8865 ceph_assert(new_id);
8866
8867 // We presume validation has been performed prior to calling this
8868 // function. We assert with prejudice.
8869
8870 int32_t allocated_id = -1; // declare here so we can jump
8871 int32_t existing_id = -1;
8872 if (!uuid.is_zero()) {
// if the uuid is already bound to an id, reuse it (idempotent replay);
// otherwise honor an explicitly requested id.
8873 existing_id = osdmap.identify_osd(uuid);
8874 if (existing_id >= 0) {
8875 ceph_assert(id < 0 || id == existing_id);
8876 *new_id = existing_id;
8877 goto out;
8878 } else if (id >= 0) {
8879 // uuid does not exist, and id has been provided, so just create
8880 // the new osd.id
8881 *new_id = id;
8882 goto out;
8883 }
8884 }
8885
8886 // allocate a new id
8887 allocated_id = _allocate_osd_id(&existing_id);
8888 dout(10) << __func__ << " allocated id " << allocated_id
8889 << " existing id " << existing_id << dendl;
8890 if (existing_id >= 0) {
// reusing a free slot below max_osd; mark the weight OUT so the new
// osd does not receive data before it boots and marks itself in.
8891 ceph_assert(existing_id < osdmap.get_max_osd());
8892 ceph_assert(allocated_id < 0);
8893 pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
8894 *new_id = existing_id;
8895 } else if (allocated_id >= 0) {
8896 ceph_assert(existing_id < 0);
8897 // raise max_osd
8898 if (pending_inc.new_max_osd < 0) {
8899 pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
8900 } else {
8901 ++pending_inc.new_max_osd;
8902 }
8903 *new_id = pending_inc.new_max_osd - 1;
8904 ceph_assert(*new_id == allocated_id);
8905 } else {
8906 ceph_abort_msg("unexpected condition");
8907 }
8908
8909 out:
8910 if (device_class.size()) {
// stage the device class in a working copy of the pending crush map
8911 CrushWrapper newcrush;
8912 _get_pending_crush(newcrush);
8913 if (newcrush.get_max_devices() < *new_id + 1) {
8914 newcrush.set_max_devices(*new_id + 1);
8915 }
8916 string name = string("osd.") + stringify(*new_id);
8917 if (!newcrush.item_exists(*new_id)) {
8918 newcrush.set_item_name(*new_id, name);
8919 }
8920 ostringstream ss;
8921 int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
8922 if (r < 0) {
8923 derr << __func__ << " failed to set " << name << " device_class "
8924 << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
8925 << dendl;
8926 // non-fatal... this might be a replay and we want to be idempotent.
8927 } else {
8928 dout(20) << __func__ << " set " << name << " device_class " << device_class
8929 << dendl;
8930 pending_inc.crush.clear();
8931 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8932 }
8933 } else {
8934 dout(20) << __func__ << " no device_class" << dendl;
8935 }
8936
8937 dout(10) << __func__ << " using id " << *new_id << dendl;
// make sure max_osd covers the chosen id (it may have come from the
// uuid/explicit-id paths above, which skip the allocation logic)
8938 if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
8939 pending_inc.new_max_osd = *new_id + 1;
8940 }
8941
8942 pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
8943 if (!uuid.is_zero())
8944 pending_inc.new_uuid[*new_id] = uuid;
8945 }
8946
// Check whether an osd with the given id/uuid may be created.
//
// Return values:
//   0        - creation can proceed (or there was nothing to validate)
//   EEXIST   - (positive!) osd already exists with matching id/uuid; the
//              operation would be idempotent and *existing_id is set
//   -EAGAIN  - an osd with this uuid or id is still pending in the
//              uncommitted incremental; caller should retry later
//   -EEXIST  - uuid is already bound to a different id
//   -EINVAL  - id exists (when check_osd_exists) but with another uuid
8947 int OSDMonitor::validate_osd_create(
8948 const int32_t id,
8949 const uuid_d& uuid,
8950 const bool check_osd_exists,
8951 int32_t* existing_id,
8952 stringstream& ss)
8953 {
8954
8955 dout(10) << __func__ << " id " << id << " uuid " << uuid
8956 << " check_osd_exists " << check_osd_exists << dendl;
8957
8958 ceph_assert(existing_id);
8959
8960 if (id < 0 && uuid.is_zero()) {
8961 // we have nothing to validate
8962 *existing_id = -1;
8963 return 0;
8964 } else if (uuid.is_zero()) {
8965 // we have an id but we will ignore it - because that's what
8966 // `osd create` does.
8967 return 0;
8968 }
8969
8970 /*
8971 * This function will be used to validate whether we are able to
8972 * create a new osd when the `uuid` is specified.
8973 *
8974 * It will be used by both `osd create` and `osd new`, as the checks
8975 * are basically the same when it pertains to osd id and uuid validation.
8976 * However, `osd create` presumes an `uuid` is optional, for legacy
8977 * reasons, while `osd new` requires the `uuid` to be provided. This
8978 * means that `osd create` will not be idempotent if an `uuid` is not
8979 * provided, but we will always guarantee the idempotency of `osd new`.
8980 */
8981
8982 ceph_assert(!uuid.is_zero());
8983 if (pending_inc.identify_osd(uuid) >= 0) {
8984 // osd is about to exist
8985 return -EAGAIN;
8986 }
8987
8988 int32_t i = osdmap.identify_osd(uuid);
8989 if (i >= 0) {
8990 // osd already exists
8991 if (id >= 0 && i != id) {
8992 ss << "uuid " << uuid << " already in use for different id " << i;
8993 return -EEXIST;
8994 }
8995 // return a positive errno to distinguish between a blocking error
8996 // and an error we consider to not be a problem (i.e., this would be
8997 // an idempotent operation).
8998 *existing_id = i;
8999 return EEXIST;
9000 }
9001 // i < 0
9002 if (id >= 0) {
9003 if (pending_inc.new_state.count(id)) {
9004 // osd is about to exist
9005 return -EAGAIN;
9006 }
9007 // we may not care if an osd exists if we are recreating a previously
9008 // destroyed osd.
9009 if (check_osd_exists && osdmap.exists(id)) {
9010 ss << "id " << id << " already in use and does not match uuid "
9011 << uuid;
9012 return -EINVAL;
9013 }
9014 }
9015 return 0;
9016 }
9017
9018 int OSDMonitor::prepare_command_osd_create(
9019 const int32_t id,
9020 const uuid_d& uuid,
9021 int32_t* existing_id,
9022 stringstream& ss)
9023 {
9024 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9025 ceph_assert(existing_id);
9026 if (osdmap.is_destroyed(id)) {
9027 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9028 "instead.";
9029 return -EINVAL;
9030 }
9031
9032 if (uuid.is_zero()) {
9033 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9034 }
9035
9036 return validate_osd_create(id, uuid, true, existing_id, ss);
9037 }
9038
// Handle `osd new`: create a brand new osd, or recreate a previously
// destroyed one, together with its cephx secret and (optionally) the
// dm-crypt lockbox key. `uuid` is mandatory; `id` is optional. The
// command is idempotent when the osd already exists and all supplied
// secrets match.
//
// Returns 0 on success, positive EEXIST when the request was a no-op,
// or a negative errno on error. The caller is responsible for proposing
// the pending maps (paxos must be plugged).
9039 int OSDMonitor::prepare_command_osd_new(
9040 MonOpRequestRef op,
9041 const cmdmap_t& cmdmap,
9042 const map<string,string>& params,
9043 stringstream &ss,
9044 Formatter *f)
9045 {
9046 uuid_d uuid;
9047 string uuidstr;
9048 int64_t id = -1;
9049
9050 ceph_assert(paxos->is_plugged());
9051
9052 dout(10) << __func__ << " " << op << dendl;
9053
9054 /* validate command. abort now if something's wrong. */
9055
9056 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9057 *
9058 * If `id` is not specified, we will identify any existing osd based
9059 * on `uuid`. Operation will be idempotent iff secrets match.
9060 *
9061 * If `id` is specified, we will identify any existing osd based on
9062 * `uuid` and match against `id`. If they match, operation will be
9063 * idempotent iff secrets match.
9064 *
9065 * `-i secrets.json` will be optional. If supplied, will be used
9066 * to check for idempotency when `id` and `uuid` match.
9067 *
9068 * If `id` is not specified, and `uuid` does not exist, an id will
9069 * be found or allocated for the osd.
9070 *
9071 * If `id` is specified, and the osd has been previously marked
9072 * as destroyed, then the `id` will be reused.
9073 */
9074 if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
9075 ss << "requires the OSD's UUID to be specified.";
9076 return -EINVAL;
9077 } else if (!uuid.parse(uuidstr.c_str())) {
9078 ss << "invalid UUID value '" << uuidstr << "'.";
9079 return -EINVAL;
9080 }
9081
9082 if (cmd_getval(cmdmap, "id", id) &&
9083 (id < 0)) {
9084 ss << "invalid OSD id; must be greater or equal than zero.";
9085 return -EINVAL;
9086 }
9087
9088 // are we running an `osd create`-like command, or recreating
9089 // a previously destroyed osd?
9090
9091 bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
9092
9093 // we will care about `id` to assess whether osd is `destroyed`, or
9094 // to create a new osd.
9095 // we will need an `id` by the time we reach auth.
9096
9097 int32_t existing_id = -1;
9098 int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
9099 &existing_id, ss);
9100
9101 bool may_be_idempotent = false;
9102 if (err == EEXIST) {
9103 // this is idempotent from the osdmon's point-of-view
9104 may_be_idempotent = true;
9105 ceph_assert(existing_id >= 0);
9106 id = existing_id;
9107 } else if (err < 0) {
9108 return err;
9109 }
9110
9111 if (!may_be_idempotent) {
9112 // idempotency is out of the window. We are either creating a new
9113 // osd or recreating a destroyed osd.
9114 //
9115 // We now need to figure out if we have an `id` (and if it's valid),
9116 // of find an `id` if we don't have one.
9117
9118 // NOTE: we need to consider the case where the `id` is specified for
9119 // `osd create`, and we must honor it. So this means checking if
9120 // the `id` is destroyed, and if so assume the destroy; otherwise,
9121 // check if it `exists` - in which case we complain about not being
9122 // `destroyed`. In the end, if nothing fails, we must allow the
9123 // creation, so that we are compatible with `create`.
9124 if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
9125 dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
9126 ss << "OSD " << id << " has not yet been destroyed";
9127 return -EINVAL;
9128 } else if (id < 0) {
9129 // find an `id`
9130 id = _allocate_osd_id(&existing_id);
9131 if (id < 0) {
// _allocate_osd_id() reported a reusable slot via existing_id
9132 ceph_assert(existing_id >= 0);
9133 id = existing_id;
9134 }
9135 dout(10) << __func__ << " found id " << id << " to use" << dendl;
9136 } else if (id >= 0 && osdmap.is_destroyed(id)) {
9137 dout(10) << __func__ << " recreating osd." << id << dendl;
9138 } else {
9139 dout(10) << __func__ << " creating new osd." << id << dendl;
9140 }
9141 } else {
9142 ceph_assert(id >= 0);
9143 ceph_assert(osdmap.exists(id));
9144 }
9145
9146 // we are now able to either create a brand new osd or reuse an existing
9147 // osd that has been previously destroyed.
9148
9149 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9150
9151 if (may_be_idempotent && params.empty()) {
9152 // nothing to do, really.
9153 dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
9154 ceph_assert(id >= 0);
9155 if (f) {
9156 f->open_object_section("created_osd");
9157 f->dump_int("osdid", id);
9158 f->close_section();
9159 } else {
9160 ss << id;
9161 }
9162 return EEXIST;
9163 }
9164
9165 string device_class;
9166 auto p = params.find("crush_device_class");
9167 if (p != params.end()) {
9168 device_class = p->second;
9169 dout(20) << __func__ << " device_class will be " << device_class << dendl;
9170 }
9171 string cephx_secret, lockbox_secret, dmcrypt_key;
9172 bool has_lockbox = false;
9173 bool has_secrets = params.count("cephx_secret")
9174 || params.count("cephx_lockbox_secret")
9175 || params.count("dmcrypt_key");
9176
9177 ConfigKeyService *svc = nullptr;
9178 AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
9179
// validate the supplied secrets against authmon / config-key store
// before we stage any change, so a mismatch aborts with no side effects.
9180 if (has_secrets) {
9181 if (params.count("cephx_secret") == 0) {
9182 ss << "requires a cephx secret.";
9183 return -EINVAL;
9184 }
9185 cephx_secret = params.at("cephx_secret");
9186
9187 bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
9188 bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
9189
9190 dout(10) << __func__ << " has lockbox " << has_lockbox_secret
9191 << " dmcrypt " << has_dmcrypt_key << dendl;
9192
// lockbox secret and dm-crypt key must be supplied together or not at all
9193 if (has_lockbox_secret && has_dmcrypt_key) {
9194 has_lockbox = true;
9195 lockbox_secret = params.at("cephx_lockbox_secret");
9196 dmcrypt_key = params.at("dmcrypt_key");
9197 } else if (!has_lockbox_secret != !has_dmcrypt_key) {
9198 ss << "requires both a cephx lockbox secret and a dm-crypt key.";
9199 return -EINVAL;
9200 }
9201
9202 dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
9203
9204 err = mon->authmon()->validate_osd_new(id, uuid,
9205 cephx_secret,
9206 lockbox_secret,
9207 cephx_entity,
9208 lockbox_entity,
9209 ss);
9210 if (err < 0) {
9211 return err;
9212 } else if (may_be_idempotent && err != EEXIST) {
9213 // for this to be idempotent, `id` should already be >= 0; no need
9214 // to use validate_id.
9215 ceph_assert(id >= 0);
9216 ss << "osd." << id << " exists but secrets do not match";
9217 return -EEXIST;
9218 }
9219
9220 if (has_lockbox) {
9221 svc = (ConfigKeyService*)mon->config_key_service;
9222 err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
9223 if (err < 0) {
9224 return err;
9225 } else if (may_be_idempotent && err != EEXIST) {
9226 ceph_assert(id >= 0);
9227 ss << "osd." << id << " exists but dm-crypt key does not match.";
9228 return -EEXIST;
9229 }
9230 }
9231 }
9232 ceph_assert(!has_secrets || !cephx_secret.empty());
9233 ceph_assert(!has_lockbox || !lockbox_secret.empty());
9234
9235 if (may_be_idempotent) {
9236 // we have nothing to do for either the osdmon or the authmon,
9237 // and we have no lockbox - so the config key service will not be
9238 // touched. This is therefore an idempotent operation, and we can
9239 // just return right away.
9240 dout(10) << __func__ << " idempotent -- no op." << dendl;
9241 ceph_assert(id >= 0);
9242 if (f) {
9243 f->open_object_section("created_osd");
9244 f->dump_int("osdid", id);
9245 f->close_section();
9246 } else {
9247 ss << id;
9248 }
9249 return EEXIST;
9250 }
9251 ceph_assert(!may_be_idempotent);
9252
9253 // perform updates.
9254 if (has_secrets) {
9255 ceph_assert(!cephx_secret.empty());
9256 ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
9257 (!lockbox_secret.empty() && !dmcrypt_key.empty()));
9258
9259 err = mon->authmon()->do_osd_new(cephx_entity,
9260 lockbox_entity,
9261 has_lockbox);
9262 ceph_assert(0 == err);
9263
9264 if (has_lockbox) {
9265 ceph_assert(nullptr != svc);
9266 svc->do_osd_new(uuid, dmcrypt_key);
9267 }
9268 }
9269
9270 if (is_recreate_destroyed) {
// reuse the destroyed id directly rather than going through
// do_osd_create(); stage state flags by hand.
9271 ceph_assert(id >= 0);
9272 ceph_assert(osdmap.is_destroyed(id));
9273 pending_inc.new_weight[id] = CEPH_OSD_OUT;
9274 pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
9275 if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
9276 pending_inc.new_state[id] |= CEPH_OSD_NEW;
9277 }
9278 if (osdmap.get_state(id) & CEPH_OSD_UP) {
9279 // due to http://tracker.ceph.com/issues/20751 some clusters may
9280 // have UP set for non-existent OSDs; make sure it is cleared
9281 // for a newly created osd.
9282 pending_inc.new_state[id] |= CEPH_OSD_UP;
9283 }
9284 pending_inc.new_uuid[id] = uuid;
9285 } else {
9286 ceph_assert(id >= 0);
9287 int32_t new_id = -1;
9288 do_osd_create(id, uuid, device_class, &new_id);
9289 ceph_assert(new_id >= 0);
9290 ceph_assert(id == new_id);
9291 }
9292
9293 if (f) {
9294 f->open_object_section("created_osd");
9295 f->dump_int("osdid", id);
9296 f->close_section();
9297 } else {
9298 ss << id;
9299 }
9300
9301 return 0;
9302 }
9303
9304 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9305 {
9306 op->mark_osdmon_event(__func__);
9307 auto m = op->get_req<MMonCommand>();
9308 stringstream ss;
9309 cmdmap_t cmdmap;
9310 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9311 string rs = ss.str();
9312 mon->reply_command(op, -EINVAL, rs, get_last_committed());
9313 return true;
9314 }
9315
9316 MonSession *session = op->get_session();
9317 if (!session) {
9318 derr << __func__ << " no session" << dendl;
9319 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
9320 return true;
9321 }
9322
9323 return prepare_command_impl(op, cmdmap);
9324 }
9325
9326 static int parse_reweights(CephContext *cct,
9327 const cmdmap_t& cmdmap,
9328 const OSDMap& osdmap,
9329 map<int32_t, uint32_t>* weights)
9330 {
9331 string weights_str;
9332 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9333 return -EINVAL;
9334 }
9335 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9336 json_spirit::mValue json_value;
9337 if (!json_spirit::read(weights_str, json_value)) {
9338 return -EINVAL;
9339 }
9340 if (json_value.type() != json_spirit::obj_type) {
9341 return -EINVAL;
9342 }
9343 const auto obj = json_value.get_obj();
9344 try {
9345 for (auto& osd_weight : obj) {
9346 auto osd_id = std::stoi(osd_weight.first);
9347 if (!osdmap.exists(osd_id)) {
9348 return -ENOENT;
9349 }
9350 if (osd_weight.second.type() != json_spirit::str_type) {
9351 return -EINVAL;
9352 }
9353 auto weight = std::stoul(osd_weight.second.get_str());
9354 weights->insert({osd_id, weight});
9355 }
9356 } catch (const std::logic_error& e) {
9357 return -EINVAL;
9358 }
9359 return 0;
9360 }
9361
// Stage the destruction of an osd: remove its auth entities and its
// config-key (dm-crypt lockbox) data, then mark it DESTROYED in the
// pending incremental while keeping the id reserved for reuse.
//
// Returns 0 on success (idempotent if already destroyed), -ENOENT if the
// osd does not exist, or a negative errno from auth validation. The
// caller is responsible for proposing (paxos must be plugged).
9362 int OSDMonitor::prepare_command_osd_destroy(
9363 int32_t id,
9364 stringstream& ss)
9365 {
9366 ceph_assert(paxos->is_plugged());
9367
9368 // we check if the osd exists for the benefit of `osd purge`, which may
9369 // have previously removed the osd. If the osd does not exist, return
9370 // -ENOENT to convey this, and let the caller deal with it.
9371 //
9372 // we presume that all auth secrets and config keys were removed prior
9373 // to this command being called. if they exist by now, we also assume
9374 // they must have been created by some other command and do not pertain
9375 // to this non-existent osd.
9376 if (!osdmap.exists(id)) {
9377 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
9378 return -ENOENT;
9379 }
9380
9381 uuid_d uuid = osdmap.get_uuid(id);
9382 dout(10) << __func__ << " destroying osd." << id
9383 << " uuid " << uuid << dendl;
9384
9385 // if it has been destroyed, we assume our work here is done.
9386 if (osdmap.is_destroyed(id)) {
9387 ss << "destroyed osd." << id;
9388 return 0;
9389 }
9390
9391 EntityName cephx_entity, lockbox_entity;
9392 bool idempotent_auth = false, idempotent_cks = false;
9393
// validate both side-effecting services first; -ENOENT means the data
// is already gone and the corresponding removal can be skipped.
9394 int err = mon->authmon()->validate_osd_destroy(id, uuid,
9395 cephx_entity,
9396 lockbox_entity,
9397 ss);
9398 if (err < 0) {
9399 if (err == -ENOENT) {
9400 idempotent_auth = true;
9401 } else {
9402 return err;
9403 }
9404 }
9405
9406 ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
9407 err = svc->validate_osd_destroy(id, uuid);
9408 if (err < 0) {
9409 ceph_assert(err == -ENOENT);
9410 err = 0;
9411 idempotent_cks = true;
9412 }
9413
9414 if (!idempotent_auth) {
9415 err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
9416 ceph_assert(0 == err);
9417 }
9418
9419 if (!idempotent_cks) {
9420 svc->do_osd_destroy(id, uuid);
9421 }
9422
// mark DESTROYED (id stays reserved) and wipe the uuid binding
9423 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
9424 pending_inc.new_uuid[id] = uuid_d();
9425
9426 // we can only propose_pending() once per service, otherwise we'll be
9427 // defying PaxosService and all laws of nature. Therefore, as we may
9428 // be used during 'osd purge', let's keep the caller responsible for
9429 // proposing.
9430 ceph_assert(err == 0);
9431 return 0;
9432 }
9433
// Stage a full purge of an osd: destroy it (auth + config-key + DESTROYED
// flag), remove it from the osdmap, and remove it from crush.
//
// Returns 0 on success, -ENOENT if the osd is already fully gone
// (idempotent case), or a negative errno from the crush/destroy steps.
// The caller is responsible for proposing (paxos must be plugged).
9434 int OSDMonitor::prepare_command_osd_purge(
9435 int32_t id,
9436 stringstream& ss)
9437 {
9438 ceph_assert(paxos->is_plugged());
9439 dout(10) << __func__ << " purging osd." << id << dendl;
9440
9441 ceph_assert(!osdmap.is_up(id));
9442
9443 /*
9444 * This may look a bit weird, but this is what's going to happen:
9445 *
9446 * 1. we make sure that removing from crush works
9447 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9448 * error, then we abort the whole operation, as no updates
9449 * have been made. However, this function will have
9450 * side-effects, thus we need to make sure that all operations
9451 * performed henceforth will *always* succeed.
9452 * 3. we call `prepare_command_osd_remove()`. Although this
9453 * function can return an error, it currently only checks if the
9454 * osd is up - and we have made sure that it is not so, so there
9455 * is no conflict, and it is effectively an update.
9456 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9457 * the crush update we delayed from before.
9458 */
9459
9460 CrushWrapper newcrush;
9461 _get_pending_crush(newcrush);
9462
9463 bool may_be_idempotent = false;
9464
// step 1: dry-run the crush removal on a working copy; -ENOENT means the
// item is already gone, which is fine (possible replay).
9465 int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
9466 if (err == -ENOENT) {
9467 err = 0;
9468 may_be_idempotent = true;
9469 } else if (err < 0) {
9470 ss << "error removing osd." << id << " from crush";
9471 return err;
9472 }
9473
9474 // no point destroying the osd again if it has already been marked destroyed
9475 if (!osdmap.is_destroyed(id)) {
9476 err = prepare_command_osd_destroy(id, ss);
9477 if (err < 0) {
9478 if (err == -ENOENT) {
9479 err = 0;
9480 } else {
9481 return err;
9482 }
9483 } else {
9484 may_be_idempotent = false;
9485 }
9486 }
9487 ceph_assert(0 == err);
9488
// every step so far was a no-op and the osd is gone: report idempotency
9489 if (may_be_idempotent && !osdmap.exists(id)) {
9490 dout(10) << __func__ << " osd." << id << " does not exist and "
9491 << "we are idempotent." << dendl;
9492 return -ENOENT;
9493 }
9494
9495 err = prepare_command_osd_remove(id);
9496 // we should not be busy, as we should have made sure this id is not up.
9497 ceph_assert(0 == err);
9498
// step 4: stage the crush removal we validated in step 1
9499 do_osd_crush_remove(newcrush);
9500 return 0;
9501 }
9502
9503 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9504 const cmdmap_t& cmdmap)
9505 {
9506 op->mark_osdmon_event(__func__);
9507 auto m = op->get_req<MMonCommand>();
9508 bool ret = false;
9509 stringstream ss;
9510 string rs;
9511 bufferlist rdata;
9512 int err = 0;
9513
9514 string format;
9515 cmd_getval(cmdmap, "format", format, string("plain"));
9516 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9517
9518 string prefix;
9519 cmd_getval(cmdmap, "prefix", prefix);
9520
9521 int64_t osdid;
9522 string osd_name;
9523 bool osdid_present = false;
9524 if (prefix != "osd pg-temp" &&
9525 prefix != "osd pg-upmap" &&
9526 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9527 osdid_present = cmd_getval(cmdmap, "id", osdid);
9528 }
9529 if (osdid_present) {
9530 ostringstream oss;
9531 oss << "osd." << osdid;
9532 osd_name = oss.str();
9533 }
9534
9535 // Even if there's a pending state with changes that could affect
9536 // a command, considering that said state isn't yet committed, we
9537 // just don't care about those changes if the command currently being
9538 // handled acts as a no-op against the current committed state.
9539 // In a nutshell, we assume this command happens *before*.
9540 //
9541 // Let me make this clearer:
9542 //
9543 // - If we have only one client, and that client issues some
9544 // operation that would conflict with this operation but is
9545 // still on the pending state, then we would be sure that said
9546 // operation wouldn't have returned yet, so the client wouldn't
9547 // issue this operation (unless the client didn't wait for the
9548 // operation to finish, and that would be the client's own fault).
9549 //
9550 // - If we have more than one client, each client will observe
9551 // whatever is the state at the moment of the commit. So, if we
9552 // have two clients, one issuing an unlink and another issuing a
9553 // link, and if the link happens while the unlink is still on the
9554 // pending state, from the link's point-of-view this is a no-op.
9555 // If different clients are issuing conflicting operations and
9556 // they care about that, then the clients should make sure they
9557 // enforce some kind of concurrency mechanism -- from our
9558 // perspective that's what Douglas Adams would call an SEP.
9559 //
9560 // This should be used as a general guideline for most commands handled
9561 // in this function. Adapt as you see fit, but please bear in mind that
9562 // this is the expected behavior.
9563
9564
9565 if (prefix == "osd setcrushmap" ||
9566 (prefix == "osd crush set" && !osdid_present)) {
9567 if (pending_inc.crush.length()) {
9568 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9569 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9570 return true;
9571 }
9572 dout(10) << "prepare_command setting new crush map" << dendl;
9573 bufferlist data(m->get_data());
9574 CrushWrapper crush;
9575 try {
9576 auto bl = data.cbegin();
9577 crush.decode(bl);
9578 }
9579 catch (const std::exception &e) {
9580 err = -EINVAL;
9581 ss << "Failed to parse crushmap: " << e.what();
9582 goto reply;
9583 }
9584
9585 int64_t prior_version = 0;
9586 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9587 if (prior_version == osdmap.get_crush_version() - 1) {
9588 // see if we are a resend of the last update. this is imperfect
9589 // (multiple racing updaters may not both get reliable success)
9590 // but we expect crush updaters (via this interface) to be rare-ish.
9591 bufferlist current, proposed;
9592 osdmap.crush->encode(current, mon->get_quorum_con_features());
9593 crush.encode(proposed, mon->get_quorum_con_features());
9594 if (current.contents_equal(proposed)) {
9595 dout(10) << __func__
9596 << " proposed matches current and version equals previous"
9597 << dendl;
9598 err = 0;
9599 ss << osdmap.get_crush_version();
9600 goto reply;
9601 }
9602 }
9603 if (prior_version != osdmap.get_crush_version()) {
9604 err = -EPERM;
9605 ss << "prior_version " << prior_version << " != crush version "
9606 << osdmap.get_crush_version();
9607 goto reply;
9608 }
9609 }
9610
9611 if (crush.has_legacy_rule_ids()) {
9612 err = -EINVAL;
9613 ss << "crush maps with ruleset != ruleid are no longer allowed";
9614 goto reply;
9615 }
9616 if (!validate_crush_against_features(&crush, ss)) {
9617 err = -EINVAL;
9618 goto reply;
9619 }
9620
9621 err = osdmap.validate_crush_rules(&crush, &ss);
9622 if (err < 0) {
9623 goto reply;
9624 }
9625
9626 if (g_conf()->mon_osd_crush_smoke_test) {
9627 // sanity check: test some inputs to make sure this map isn't
9628 // totally broken
9629 dout(10) << " testing map" << dendl;
9630 stringstream ess;
9631 CrushTester tester(crush, ess);
9632 tester.set_min_x(0);
9633 tester.set_max_x(50);
9634 auto start = ceph::coarse_mono_clock::now();
9635 int r = tester.test_with_fork(g_conf()->mon_lease);
9636 auto duration = ceph::coarse_mono_clock::now() - start;
9637 if (r < 0) {
9638 dout(10) << " tester.test_with_fork returns " << r
9639 << ": " << ess.str() << dendl;
9640 ss << "crush smoke test failed with " << r << ": " << ess.str();
9641 err = r;
9642 goto reply;
9643 }
9644 dout(10) << __func__ << " crush somke test duration: "
9645 << duration << ", result: " << ess.str() << dendl;
9646 }
9647
9648 pending_inc.crush = data;
9649 ss << osdmap.get_crush_version() + 1;
9650 goto update;
9651
9652 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9653 CrushWrapper newcrush;
9654 _get_pending_crush(newcrush);
9655 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9656 int bid = -1 - b;
9657 if (newcrush.bucket_exists(bid) &&
9658 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9659 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9660 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9661 }
9662 }
9663 if (!validate_crush_against_features(&newcrush, ss)) {
9664 err = -EINVAL;
9665 goto reply;
9666 }
9667 pending_inc.crush.clear();
9668 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9669 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9670 get_last_committed() + 1));
9671 return true;
9672 } else if (prefix == "osd crush set-device-class") {
9673 string device_class;
9674 if (!cmd_getval(cmdmap, "class", device_class)) {
9675 err = -EINVAL; // no value!
9676 goto reply;
9677 }
9678
9679 bool stop = false;
9680 vector<string> idvec;
9681 cmd_getval(cmdmap, "ids", idvec);
9682 CrushWrapper newcrush;
9683 _get_pending_crush(newcrush);
9684 set<int> updated;
9685 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9686 set<int> osds;
9687 // wildcard?
9688 if (j == 0 &&
9689 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9690 osdmap.get_all_osds(osds);
9691 stop = true;
9692 } else {
9693 // try traditional single osd way
9694 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9695 if (osd < 0) {
9696 // ss has reason for failure
9697 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9698 err = -EINVAL;
9699 continue;
9700 }
9701 osds.insert(osd);
9702 }
9703
9704 for (auto &osd : osds) {
9705 if (!osdmap.exists(osd)) {
9706 ss << "osd." << osd << " does not exist. ";
9707 continue;
9708 }
9709
9710 ostringstream oss;
9711 oss << "osd." << osd;
9712 string name = oss.str();
9713
9714 if (newcrush.get_max_devices() < osd + 1) {
9715 newcrush.set_max_devices(osd + 1);
9716 }
9717 string action;
9718 if (newcrush.item_exists(osd)) {
9719 action = "updating";
9720 } else {
9721 action = "creating";
9722 newcrush.set_item_name(osd, name);
9723 }
9724
9725 dout(5) << action << " crush item id " << osd << " name '" << name
9726 << "' device_class '" << device_class << "'"
9727 << dendl;
9728 err = newcrush.update_device_class(osd, device_class, name, &ss);
9729 if (err < 0) {
9730 goto reply;
9731 }
9732 if (err == 0 && !_have_pending_crush()) {
9733 if (!stop) {
9734 // for single osd only, wildcard makes too much noise
9735 ss << "set-device-class item id " << osd << " name '" << name
9736 << "' device_class '" << device_class << "': no change. ";
9737 }
9738 } else {
9739 updated.insert(osd);
9740 }
9741 }
9742 }
9743
9744 if (!updated.empty()) {
9745 pending_inc.crush.clear();
9746 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9747 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9748 getline(ss, rs);
9749 wait_for_finished_proposal(op,
9750 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9751 return true;
9752 }
9753
9754 } else if (prefix == "osd crush rm-device-class") {
9755 bool stop = false;
9756 vector<string> idvec;
9757 cmd_getval(cmdmap, "ids", idvec);
9758 CrushWrapper newcrush;
9759 _get_pending_crush(newcrush);
9760 set<int> updated;
9761
9762 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9763 set<int> osds;
9764
9765 // wildcard?
9766 if (j == 0 &&
9767 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9768 osdmap.get_all_osds(osds);
9769 stop = true;
9770 } else {
9771 // try traditional single osd way
9772 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9773 if (osd < 0) {
9774 // ss has reason for failure
9775 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9776 err = -EINVAL;
9777 goto reply;
9778 }
9779 osds.insert(osd);
9780 }
9781
9782 for (auto &osd : osds) {
9783 if (!osdmap.exists(osd)) {
9784 ss << "osd." << osd << " does not exist. ";
9785 continue;
9786 }
9787
9788 auto class_name = newcrush.get_item_class(osd);
9789 if (!class_name) {
9790 ss << "osd." << osd << " belongs to no class, ";
9791 continue;
9792 }
9793 // note that we do not verify if class_is_in_use here
9794 // in case the device is misclassified and user wants
9795 // to overridely reset...
9796
9797 err = newcrush.remove_device_class(cct, osd, &ss);
9798 if (err < 0) {
9799 // ss has reason for failure
9800 goto reply;
9801 }
9802 updated.insert(osd);
9803 }
9804 }
9805
9806 if (!updated.empty()) {
9807 pending_inc.crush.clear();
9808 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9809 ss << "done removing class of osd(s): " << updated;
9810 getline(ss, rs);
9811 wait_for_finished_proposal(op,
9812 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9813 return true;
9814 }
// handler: "osd crush class create <class>" — register a new device class id
// in the pending crush map (idempotent if the class already exists).
9815 } else if (prefix == "osd crush class create") {
9816 string device_class;
9817 if (!cmd_getval(cmdmap, "class", device_class)) {
9818 err = -EINVAL; // no value!
9819 goto reply;
9820 }
// Device classes require the luminous on-disk/protocol features.
9821 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9822 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9823 << "luminous' before using crush device classes";
9824 err = -EPERM;
9825 goto reply;
9826 }
// Fast path: nothing pending and class already committed — plain reply.
9827 if (!_have_pending_crush() &&
9828 _get_stable_crush().class_exists(device_class)) {
9829 ss << "class '" << device_class << "' already exists";
9830 goto reply;
9831 }
9832 CrushWrapper newcrush;
9833 _get_pending_crush(newcrush);
// Already created in the pending map — wait for that proposal instead.
9834 if (newcrush.class_exists(device_class)) {
9835 ss << "class '" << device_class << "' already exists";
9836 goto update;
9837 }
9838 int class_id = newcrush.get_or_create_class_id(device_class);
9839 pending_inc.crush.clear();
9840 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9841 ss << "created class " << device_class << " with id " << class_id
9842 << " to crush map";
9843 goto update;
// handler: "osd crush class rm <class>" — remove a device class after
// verifying nothing (rules, erasure-code profiles) still references it.
9844 } else if (prefix == "osd crush class rm") {
9845 string device_class;
9846 if (!cmd_getval(cmdmap, "class", device_class)) {
9847 err = -EINVAL; // no value!
9848 goto reply;
9849 }
9850 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9851 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9852 << "luminous' before using crush device classes";
9853 err = -EPERM;
9854 goto reply;
9855 }
9856
// Idempotent: removing a class that never existed succeeds with err = 0.
9857 if (!osdmap.crush->class_exists(device_class)) {
9858 err = 0;
9859 goto reply;
9860 }
9861
9862 CrushWrapper newcrush;
9863 _get_pending_crush(newcrush);
9864 if (!newcrush.class_exists(device_class)) {
9865 err = 0; // make command idempotent
9866 goto wait;
9867 }
9868 int class_id = newcrush.get_class_id(device_class);
9869 stringstream ts;
9870 if (newcrush.class_is_in_use(class_id, &ts)) {
9871 err = -EBUSY;
9872 ss << "class '" << device_class << "' " << ts.str();
9873 goto reply;
9874 }
9875
9876 // check if class is used by any erasure-code-profiles
// Merge committed and pending profiles so a profile created in the same
// epoch still blocks the removal.
9877 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9878 osdmap.get_erasure_code_profiles();
9879 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9880 #ifdef HAVE_STDLIB_MAP_SPLICING
9881 ec_profiles.merge(old_ec_profiles);
9882 #else
9883 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9884 make_move_iterator(end(old_ec_profiles)));
9885 #endif
9886 list<string> referenced_by;
9887 for (auto &i: ec_profiles) {
9888 for (auto &j: i.second) {
9889 if ("crush-device-class" == j.first && device_class == j.second) {
9890 referenced_by.push_back(i.first);
9891 }
9892 }
9893 }
9894 if (!referenced_by.empty()) {
9895 err = -EBUSY;
9896 ss << "class '" << device_class
9897 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9898 goto reply;
9899 }
9900
// Detach the class from every device that carries it; the class name itself
// is removed directly only when no devices belong to it.
9901 set<int> osds;
9902 newcrush.get_devices_by_class(device_class, &osds);
9903 for (auto& p: osds) {
9904 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9905 if (err < 0) {
9906 // ss has reason for failure
9907 goto reply;
9908 }
9909 }
9910
9911 if (osds.empty()) {
9912 // empty class, remove directly
9913 err = newcrush.remove_class_name(device_class);
9914 if (err < 0) {
9915 ss << "class '" << device_class << "' cannot be removed '"
9916 << cpp_strerror(err) << "'";
9917 goto reply;
9918 }
9919 }
9920
9921 pending_inc.crush.clear();
9922 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9923 ss << "removed class " << device_class << " with id " << class_id
9924 << " from crush map";
9925 goto update;
// handler: "osd crush class rename <src> <dst>" — rename a device class,
// treating an already-applied rename as success for replay idempotency.
9926 } else if (prefix == "osd crush class rename") {
9927 string srcname, dstname;
9928 if (!cmd_getval(cmdmap, "srcname", srcname)) {
9929 err = -EINVAL;
9930 goto reply;
9931 }
9932 if (!cmd_getval(cmdmap, "dstname", dstname)) {
9933 err = -EINVAL;
9934 goto reply;
9935 }
9936
9937 CrushWrapper newcrush;
9938 _get_pending_crush(newcrush);
9939 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9940 // suppose this is a replay and return success
9941 // so command is idempotent
9942 ss << "already renamed to '" << dstname << "'";
9943 err = 0;
9944 goto reply;
9945 }
9946
9947 err = newcrush.rename_class(srcname, dstname);
9948 if (err < 0) {
9949 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9950 << cpp_strerror(err);
9951 goto reply;
9952 }
9953
9954 pending_inc.crush.clear();
9955 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9956 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9957 goto update;
// handler: "osd crush add-bucket <name> <type> [loc...]" — create a new
// (empty) bucket and optionally move it to the given crush location.
9958 } else if (prefix == "osd crush add-bucket") {
9959 // os crush add-bucket <name> <type>
9960 string name, typestr;
9961 vector<string> argvec;
9962 cmd_getval(cmdmap, "name", name);
9963 cmd_getval(cmdmap, "type", typestr);
9964 cmd_getval(cmdmap, "args", argvec);
9965 map<string,string> loc;
9966 if (!argvec.empty()) {
9967 CrushWrapper::parse_loc_map(argvec, &loc);
9968 dout(0) << "will create and move bucket '" << name
9969 << "' to location " << loc << dendl;
9970 }
9971
9972 if (!_have_pending_crush() &&
9973 _get_stable_crush().name_exists(name)) {
9974 ss << "bucket '" << name << "' already exists";
9975 goto reply;
9976 }
9977
9978 CrushWrapper newcrush;
9979 _get_pending_crush(newcrush);
9980
9981 if (newcrush.name_exists(name)) {
9982 ss << "bucket '" << name << "' already exists";
9983 goto update;
9984 }
// Type must exist and must not be 0 (type 0 is the device/leaf type).
9985 int type = newcrush.get_type_id(typestr);
9986 if (type < 0) {
9987 ss << "type '" << typestr << "' does not exist";
9988 err = -EINVAL;
9989 goto reply;
9990 }
9991 if (type == 0) {
9992 ss << "type '" << typestr << "' is for devices, not buckets";
9993 err = -EINVAL;
9994 goto reply;
9995 }
9996 int bucketno;
9997 err = newcrush.add_bucket(0, 0,
9998 CRUSH_HASH_DEFAULT, type, 0, NULL,
9999 NULL, &bucketno);
10000 if (err < 0) {
10001 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10002 goto reply;
10003 }
10004 err = newcrush.set_item_name(bucketno, name);
10005 if (err < 0) {
10006 ss << "error setting bucket name to '" << name << "'";
10007 goto reply;
10008 }
10009
// If a location was given and the bucket is not already there, move it.
10010 if (!loc.empty()) {
10011 if (!newcrush.check_item_loc(cct, bucketno, loc,
10012 (int *)NULL)) {
10013 err = newcrush.move_bucket(cct, bucketno, loc);
10014 if (err < 0) {
10015 ss << "error moving bucket '" << name << "' to location " << loc;
10016 goto reply;
10017 }
10018 } else {
10019 ss << "no need to move item id " << bucketno << " name '" << name
10020 << "' to location " << loc << " in crush map";
10021 }
10022 }
10023
10024 pending_inc.crush.clear();
10025 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10026 if (loc.empty()) {
10027 ss << "added bucket " << name << " type " << typestr
10028 << " to crush map";
10029 } else {
10030 ss << "added bucket " << name << " type " << typestr
10031 << " to location " << loc;
10032 }
10033 goto update;
// handler: "osd crush rename-bucket <src> <dst>" — delegate to
// crush_rename_bucket(); -EALREADY is mapped to success for idempotency.
10034 } else if (prefix == "osd crush rename-bucket") {
10035 string srcname, dstname;
10036 cmd_getval(cmdmap, "srcname", srcname);
10037 cmd_getval(cmdmap, "dstname", dstname);
10038
10039 err = crush_rename_bucket(srcname, dstname, &ss);
10040 if (err == -EALREADY) // equivalent to success for idempotency
10041 err = 0;
10042 if (err)
10043 goto reply;
10044 else
10045 goto update;
// handlers: "osd crush weight-set create[/-compat]" — create a per-pool (or
// backward-compatible default) choose_args weight-set in the crush map.
10046 } else if (prefix == "osd crush weight-set create" ||
10047 prefix == "osd crush weight-set create-compat") {
10048 CrushWrapper newcrush;
10049 _get_pending_crush(newcrush);
10050 int64_t pool;
10051 int positions;
// Weight-sets only make sense for straw2 buckets.
10052 if (newcrush.has_non_straw2_buckets()) {
10053 ss << "crush map contains one or more bucket(s) that are not straw2";
10054 err = -EPERM;
10055 goto reply;
10056 }
10057 if (prefix == "osd crush weight-set create") {
// Per-pool weight-sets need luminous-capable clients.
10058 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10059 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10060 ss << "require_min_compat_client "
10061 << osdmap.require_min_compat_client
10062 << " < luminous, which is required for per-pool weight-sets. "
10063 << "Try 'ceph osd set-require-min-compat-client luminous' "
10064 << "before using the new interface";
10065 err = -EPERM;
10066 goto reply;
10067 }
10068 string poolname, mode;
10069 cmd_getval(cmdmap, "pool", poolname);
10070 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10071 if (pool < 0) {
10072 ss << "pool '" << poolname << "' not found";
10073 err = -ENOENT;
10074 goto reply;
10075 }
10076 cmd_getval(cmdmap, "mode", mode);
10077 if (mode != "flat" && mode != "positional") {
10078 ss << "unrecognized weight-set mode '" << mode << "'";
10079 err = -EINVAL;
10080 goto reply;
10081 }
// "flat" = one weight per item; "positional" = one weight per replica slot.
10082 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10083 } else {
10084 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10085 positions = 1;
10086 }
10087 if (!newcrush.create_choose_args(pool, positions)) {
10088 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10089 ss << "compat weight-set already created";
10090 } else {
10091 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10092 << "' already created";
10093 }
10094 goto reply;
10095 }
10096 pending_inc.crush.clear();
10097 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10098 goto update;
10099
// handlers: "osd crush weight-set rm[/-compat]" — drop the choose_args
// weight-set for a pool (or the compat default set).
10100 } else if (prefix == "osd crush weight-set rm" ||
10101 prefix == "osd crush weight-set rm-compat") {
10102 CrushWrapper newcrush;
10103 _get_pending_crush(newcrush);
10104 int64_t pool;
10105 if (prefix == "osd crush weight-set rm") {
10106 string poolname;
10107 cmd_getval(cmdmap, "pool", poolname);
10108 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10109 if (pool < 0) {
10110 ss << "pool '" << poolname << "' not found";
10111 err = -ENOENT;
10112 goto reply;
10113 }
10114 } else {
10115 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10116 }
10117 newcrush.rm_choose_args(pool);
10118 pending_inc.crush.clear();
10119 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10120 goto update;
10121
// handlers: "osd crush weight-set reweight[/-compat] <pool> <item> <w...>" —
// set the weight-set values for one item; the number of weights supplied
// must match the weight-set's position count.
10122 } else if (prefix == "osd crush weight-set reweight" ||
10123 prefix == "osd crush weight-set reweight-compat") {
10124 string poolname, item;
10125 vector<double> weight;
10126 cmd_getval(cmdmap, "pool", poolname);
10127 cmd_getval(cmdmap, "item", item);
10128 cmd_getval(cmdmap, "weight", weight);
10129 CrushWrapper newcrush;
10130 _get_pending_crush(newcrush);
10131 int64_t pool;
10132 if (prefix == "osd crush weight-set reweight") {
10133 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10134 if (pool < 0) {
10135 ss << "pool '" << poolname << "' not found";
10136 err = -ENOENT;
10137 goto reply;
10138 }
10139 if (!newcrush.have_choose_args(pool)) {
10140 ss << "no weight-set for pool '" << poolname << "'";
10141 err = -ENOENT;
10142 goto reply;
10143 }
10144 auto arg_map = newcrush.choose_args_get(pool);
10145 int positions = newcrush.get_choose_args_positions(arg_map);
10146 if (weight.size() != (size_t)positions) {
10147 ss << "must specify exact " << positions << " weight values";
10148 err = -EINVAL;
10149 goto reply;
10150 }
10151 } else {
10152 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10153 if (!newcrush.have_choose_args(pool)) {
10154 ss << "no backward-compatible weight-set";
10155 err = -ENOENT;
10156 goto reply;
10157 }
10158 }
10159 if (!newcrush.name_exists(item)) {
10160 ss << "item '" << item << "' does not exist";
10161 err = -ENOENT;
10162 goto reply;
10163 }
10164 err = newcrush.choose_args_adjust_item_weightf(
10165 cct,
10166 newcrush.choose_args_get(pool),
10167 newcrush.get_item_id(item),
10168 weight,
10169 &ss);
10170 if (err < 0) {
10171 goto reply;
10172 }
10173 err = 0;
10174 pending_inc.crush.clear();
10175 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10176 goto update;
// handlers: "osd crush set|add <OsdName> <weight> <loc...>" — place or
// update an existing osd in the crush hierarchy at the given location.
10177 } else if (osdid_present &&
10178 (prefix == "osd crush set" || prefix == "osd crush add")) {
10179 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10180 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10181 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10182
10183 if (!osdmap.exists(osdid)) {
10184 err = -ENOENT;
10185 ss << osd_name
10186 << " does not exist. Create it before updating the crush map";
10187 goto reply;
10188 }
10189
10190 double weight;
10191 if (!cmd_getval(cmdmap, "weight", weight)) {
10192 ss << "unable to parse weight value '"
10193 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10194 err = -EINVAL;
10195 goto reply;
10196 }
10197
10198 string args;
10199 vector<string> argvec;
10200 cmd_getval(cmdmap, "args", argvec);
10201 map<string,string> loc;
10202 CrushWrapper::parse_loc_map(argvec, &loc);
10203
// "set" requires the item to already exist in the committed crush map.
10204 if (prefix == "osd crush set"
10205 && !_get_stable_crush().item_exists(osdid)) {
10206 err = -ENOENT;
10207 ss << "unable to set item id " << osdid << " name '" << osd_name
10208 << "' weight " << weight << " at location " << loc
10209 << ": does not exist";
10210 goto reply;
10211 }
10212
10213 dout(5) << "adding/updating crush item id " << osdid << " name '"
10214 << osd_name << "' weight " << weight << " at location "
10215 << loc << dendl;
10216 CrushWrapper newcrush;
10217 _get_pending_crush(newcrush);
10218
// "add" of an item already at this location degrades to "set";
// insert_item success is remapped to err=1 so the no-change test below
// (err == 0) only fires for update_item.
10219 string action;
10220 if (prefix == "osd crush set" ||
10221 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10222 action = "set";
10223 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10224 } else {
10225 action = "add";
10226 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10227 if (err == 0)
10228 err = 1;
10229 }
10230
10231 if (err < 0)
10232 goto reply;
10233
10234 if (err == 0 && !_have_pending_crush()) {
10235 ss << action << " item id " << osdid << " name '" << osd_name
10236 << "' weight " << weight << " at location " << loc << ": no change";
10237 goto reply;
10238 }
10239
10240 pending_inc.crush.clear();
10241 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10242 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10243 << weight << " at location " << loc << " to crush map";
10244 getline(ss, rs);
10245 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10246 get_last_committed() + 1));
10247 return true;
10248
// handler: "osd crush create-or-move <OsdName> <initial_weight> <loc...>" —
// insert the osd if missing, or move it; existing weight is kept on move
// (create_or_move_item return 0 means no map change was needed).
10249 } else if (prefix == "osd crush create-or-move") {
10250 do {
10251 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10252 if (!osdmap.exists(osdid)) {
10253 err = -ENOENT;
10254 ss << osd_name
10255 << " does not exist. create it before updating the crush map";
10256 goto reply;
10257 }
10258
10259 double weight;
10260 if (!cmd_getval(cmdmap, "weight", weight)) {
10261 ss << "unable to parse weight value '"
10262 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10263 err = -EINVAL;
10264 goto reply;
10265 }
10266
10267 string args;
10268 vector<string> argvec;
10269 cmd_getval(cmdmap, "args", argvec);
10270 map<string,string> loc;
10271 CrushWrapper::parse_loc_map(argvec, &loc);
10272
10273 dout(0) << "create-or-move crush item name '" << osd_name
10274 << "' initial_weight " << weight << " at location " << loc
10275 << dendl;
10276
10277 CrushWrapper newcrush;
10278 _get_pending_crush(newcrush);
10279
10280 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10281 g_conf()->osd_crush_update_weight_set);
// err == 0: already in place, fall out of the do/while and reply;
// err > 0: map changed, propose the new crush map.
10282 if (err == 0) {
10283 ss << "create-or-move updated item name '" << osd_name
10284 << "' weight " << weight
10285 << " at location " << loc << " to crush map";
10286 break;
10287 }
10288 if (err > 0) {
10289 pending_inc.crush.clear();
10290 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10291 ss << "create-or-move updating item name '" << osd_name
10292 << "' weight " << weight
10293 << " at location " << loc << " to crush map";
10294 getline(ss, rs);
10295 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10296 get_last_committed() + 1));
10297 return true;
10298 }
10299 } while (false);
10300
// handler: "osd crush move <name> <loc...>" — relocate a device (id >= 0)
// or a bucket (id < 0) to the given crush location.
10301 } else if (prefix == "osd crush move") {
10302 do {
10303 // osd crush move <name> <loc1> [<loc2> ...]
10304 string name;
10305 vector<string> argvec;
10306 cmd_getval(cmdmap, "name", name);
10307 cmd_getval(cmdmap, "args", argvec);
10308 map<string,string> loc;
10309 CrushWrapper::parse_loc_map(argvec, &loc);
10310
10311 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10312 CrushWrapper newcrush;
10313 _get_pending_crush(newcrush);
10314
10315 if (!newcrush.name_exists(name)) {
10316 err = -ENOENT;
10317 ss << "item " << name << " does not exist";
10318 break;
10319 }
10320 int id = newcrush.get_item_id(name);
10321
10322 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
// devices move via create_or_move_item (weight 0 = keep existing);
// buckets (negative ids) move via move_bucket.
10323 if (id >= 0) {
10324 err = newcrush.create_or_move_item(
10325 cct, id, 0, name, loc,
10326 g_conf()->osd_crush_update_weight_set);
10327 } else {
10328 err = newcrush.move_bucket(cct, id, loc);
10329 }
10330 if (err >= 0) {
10331 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10332 pending_inc.crush.clear();
10333 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10334 getline(ss, rs);
10335 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10336 get_last_committed() + 1));
10337 return true;
10338 }
10339 } else {
10340 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10341 err = 0;
10342 }
10343 } while (false);
// handler: "osd crush swap-bucket <source> <dest>" — swap the contents of
// two buckets; guarded by --yes-i-really-mean-it unless the source is an
// orphan and both buckets use the same bucket algorithm.
10344 } else if (prefix == "osd crush swap-bucket") {
10345 string source, dest;
10346 cmd_getval(cmdmap, "source", source);
10347 cmd_getval(cmdmap, "dest", dest);
10348
10349 bool force = false;
10350 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10351
10352 CrushWrapper newcrush;
10353 _get_pending_crush(newcrush);
10354 if (!newcrush.name_exists(source)) {
10355 ss << "source item " << source << " does not exist";
10356 err = -ENOENT;
10357 goto reply;
10358 }
10359 if (!newcrush.name_exists(dest)) {
10360 ss << "dest item " << dest << " does not exist";
10361 err = -ENOENT;
10362 goto reply;
10363 }
10364 int sid = newcrush.get_item_id(source);
10365 int did = newcrush.get_item_id(dest);
10366 int sparent;
10367 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10368 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10369 err = -EPERM;
10370 goto reply;
10371 }
10372 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10373 !force) {
10374 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10375 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10376 << "; pass --yes-i-really-mean-it to proceed anyway";
10377 err = -EPERM;
10378 goto reply;
10379 }
10380 int r = newcrush.swap_bucket(cct, sid, did);
10381 if (r < 0) {
10382 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10383 err = r;
10384 goto reply;
10385 }
10386 ss << "swapped bucket of " << source << " to " << dest;
10387 pending_inc.crush.clear();
10388 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10389 wait_for_finished_proposal(op,
10390 new Monitor::C_Command(mon, op, err, ss.str(),
10391 get_last_committed() + 1));
10392 return true;
// handler: "osd crush link <name> <loc...>" — link an existing item at an
// additional crush location (unlike move, the original placement remains).
10393 } else if (prefix == "osd crush link") {
10394 // osd crush link <name> <loc1> [<loc2> ...]
10395 string name;
10396 cmd_getval(cmdmap, "name", name);
10397 vector<string> argvec;
10398 cmd_getval(cmdmap, "args", argvec);
10399 map<string,string> loc;
10400 CrushWrapper::parse_loc_map(argvec, &loc);
10401
10402 // Need an explicit check for name_exists because get_item_id returns
10403 // 0 on unfound.
10404 int id = osdmap.crush->get_item_id(name);
10405 if (!osdmap.crush->name_exists(name)) {
10406 err = -ENOENT;
10407 ss << "item " << name << " does not exist";
10408 goto reply;
10409 } else {
10410 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10411 }
10412 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10413 ss << "no need to move item id " << id << " name '" << name
10414 << "' to location " << loc << " in crush map";
10415 err = 0;
10416 goto reply;
10417 }
10418
10419 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10420 CrushWrapper newcrush;
10421 _get_pending_crush(newcrush);
10422
// Re-check against the pending map: it can differ from the committed one.
10423 if (!newcrush.name_exists(name)) {
10424 err = -ENOENT;
10425 ss << "item " << name << " does not exist";
10426 goto reply;
10427 } else {
10428 int id = newcrush.get_item_id(name);
10429 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10430 err = newcrush.link_bucket(cct, id, loc);
10431 if (err >= 0) {
10432 ss << "linked item id " << id << " name '" << name
10433 << "' to location " << loc << " in crush map";
10434 pending_inc.crush.clear();
10435 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10436 } else {
10437 ss << "cannot link item id " << id << " name '" << name
10438 << "' to location " << loc;
10439 goto reply;
10440 }
10441 } else {
10442 ss << "no need to move item id " << id << " name '" << name
10443 << "' to location " << loc << " in crush map";
10444 err = 0;
10445 }
10446 }
10447 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10448 get_last_committed() + 1));
10449 return true;
// handlers: "osd crush rm|remove|unlink <name> [ancestor]" — remove an item
// from the crush map, or (unlink) only detach it from one/all locations.
10450 } else if (prefix == "osd crush rm" ||
10451 prefix == "osd crush remove" ||
10452 prefix == "osd crush unlink") {
10453 do {
10454 // osd crush rm <id> [ancestor]
10455 CrushWrapper newcrush;
10456 _get_pending_crush(newcrush);
10457
10458 string name;
10459 cmd_getval(cmdmap, "name", name);
10460
// Idempotent: absent in the committed map -> plain success reply;
// absent only in the pending map -> wait for the in-flight proposal.
10461 if (!osdmap.crush->name_exists(name)) {
10462 err = 0;
10463 ss << "device '" << name << "' does not appear in the crush map";
10464 break;
10465 }
10466 if (!newcrush.name_exists(name)) {
10467 err = 0;
10468 ss << "device '" << name << "' does not appear in the crush map";
10469 getline(ss, rs);
10470 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10471 get_last_committed() + 1));
10472 return true;
10473 }
10474 int id = newcrush.get_item_id(name);
10475 int ancestor = 0;
10476
10477 bool unlink_only = prefix == "osd crush unlink";
10478 string ancestor_str;
10479 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10480 if (!newcrush.name_exists(ancestor_str)) {
10481 err = -ENOENT;
10482 ss << "ancestor item '" << ancestor_str
10483 << "' does not appear in the crush map";
10484 break;
10485 }
10486 ancestor = newcrush.get_item_id(ancestor_str);
10487 }
10488
10489 err = prepare_command_osd_crush_remove(
10490 newcrush,
10491 id, ancestor,
10492 (ancestor < 0), unlink_only);
10493
10494 if (err == -ENOENT) {
10495 ss << "item " << id << " does not appear in that position";
10496 err = 0;
10497 break;
10498 }
10499 if (err == 0) {
// Full removal also clears any per-node flags recorded for the item.
10500 if (!unlink_only)
10501 pending_inc.new_crush_node_flags[id] = 0;
10502 ss << "removed item id " << id << " name '" << name << "' from crush map";
10503 getline(ss, rs);
10504 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10505 get_last_committed() + 1));
10506 return true;
10507 }
10508 } while (false);
10509
// handler: "osd crush reweight-all" — recompute bucket weights across the
// whole hierarchy and propose the result.
10510 } else if (prefix == "osd crush reweight-all") {
10511 CrushWrapper newcrush;
10512 _get_pending_crush(newcrush);
10513
10514 newcrush.reweight(cct);
10515 pending_inc.crush.clear();
10516 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10517 ss << "reweighted crush hierarchy";
10518 getline(ss, rs);
10519 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10520 get_last_committed() + 1));
10521 return true;
// handler: "osd crush reweight <name> <weight>" — set the crush weight of a
// single leaf item (id must be >= 0, i.e. a device, not a bucket).
10522 } else if (prefix == "osd crush reweight") {
10523 // osd crush reweight <name> <weight>
10524 CrushWrapper newcrush;
10525 _get_pending_crush(newcrush);
10526
10527 string name;
10528 cmd_getval(cmdmap, "name", name);
10529 if (!newcrush.name_exists(name)) {
10530 err = -ENOENT;
10531 ss << "device '" << name << "' does not appear in the crush map";
10532 goto reply;
10533 }
10534
10535 int id = newcrush.get_item_id(name);
10536 if (id < 0) {
10537 ss << "device '" << name << "' is not a leaf in the crush map";
10538 err = -EINVAL;
10539 goto reply;
10540 }
10541 double w;
10542 if (!cmd_getval(cmdmap, "weight", w)) {
10543 ss << "unable to parse weight value '"
10544 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10545 err = -EINVAL;
10546 goto reply;
10547 }
10548
10549 err = newcrush.adjust_item_weightf(cct, id, w,
10550 g_conf()->osd_crush_update_weight_set);
10551 if (err < 0)
10552 goto reply;
10553 pending_inc.crush.clear();
10554 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10555 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10556 << " in crush map";
10557 getline(ss, rs);
10558 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10559 get_last_committed() + 1));
10560 return true;
// handler: "osd crush reweight-subtree <name> <weight>" — like reweight but
// for a bucket subtree (id must be < 0); applies the weight recursively.
10561 } else if (prefix == "osd crush reweight-subtree") {
10562 // osd crush reweight <name> <weight>
10563 CrushWrapper newcrush;
10564 _get_pending_crush(newcrush);
10565
10566 string name;
10567 cmd_getval(cmdmap, "name", name);
10568 if (!newcrush.name_exists(name)) {
10569 err = -ENOENT;
10570 ss << "device '" << name << "' does not appear in the crush map";
10571 goto reply;
10572 }
10573
10574 int id = newcrush.get_item_id(name);
10575 if (id >= 0) {
10576 ss << "device '" << name << "' is not a subtree in the crush map";
10577 err = -EINVAL;
10578 goto reply;
10579 }
10580 double w;
10581 if (!cmd_getval(cmdmap, "weight", w)) {
10582 ss << "unable to parse weight value '"
10583 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10584 err = -EINVAL;
10585 goto reply;
10586 }
10587
10588 err = newcrush.adjust_subtree_weightf(cct, id, w,
10589 g_conf()->osd_crush_update_weight_set);
10590 if (err < 0)
10591 goto reply;
10592 pending_inc.crush.clear();
10593 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10594 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10595 << " in crush map";
10596 getline(ss, rs);
10597 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10598 get_last_committed() + 1));
10599 return true;
// handler: "osd crush tunables <profile>" — apply a named tunables profile
// to the crush map after validating it against the quorum's features.
10600 } else if (prefix == "osd crush tunables") {
10601 CrushWrapper newcrush;
10602 _get_pending_crush(newcrush);
10603
10604 err = 0;
10605 string profile;
10606 cmd_getval(cmdmap, "profile", profile);
10607 if (profile == "legacy" || profile == "argonaut") {
10608 newcrush.set_tunables_legacy();
10609 } else if (profile == "bobtail") {
10610 newcrush.set_tunables_bobtail();
10611 } else if (profile == "firefly") {
10612 newcrush.set_tunables_firefly();
10613 } else if (profile == "hammer") {
10614 newcrush.set_tunables_hammer();
10615 } else if (profile == "jewel") {
10616 newcrush.set_tunables_jewel();
10617 } else if (profile == "optimal") {
10618 newcrush.set_tunables_optimal();
10619 } else if (profile == "default") {
10620 newcrush.set_tunables_default();
10621 } else {
10622 ss << "unrecognized profile '" << profile << "'";
10623 err = -EINVAL;
10624 goto reply;
10625 }
10626
// Reject tunables that connected daemons/clients could not decode.
10627 if (!validate_crush_against_features(&newcrush, ss)) {
10628 err = -EINVAL;
10629 goto reply;
10630 }
10631
10632 pending_inc.crush.clear();
10633 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10634 ss << "adjusted tunables profile to " << profile;
10635 getline(ss, rs);
10636 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10637 get_last_committed() + 1));
10638 return true;
// handler: "osd crush set-tunable <tunable> <value>" — set one individual
// tunable; currently only straw_calc_version (0 or 1) is recognized.
10639 } else if (prefix == "osd crush set-tunable") {
10640 CrushWrapper newcrush;
10641 _get_pending_crush(newcrush);
10642
10643 err = 0;
10644 string tunable;
10645 cmd_getval(cmdmap, "tunable", tunable);
10646
10647 int64_t value = -1;
10648 if (!cmd_getval(cmdmap, "value", value)) {
10649 err = -EINVAL;
10650 ss << "failed to parse integer value "
10651 << cmd_vartype_stringify(cmdmap.at("value"));
10652 goto reply;
10653 }
10654
10655 if (tunable == "straw_calc_version") {
10656 if (value != 0 && value != 1) {
10657 ss << "value must be 0 or 1; got " << value;
10658 err = -EINVAL;
10659 goto reply;
10660 }
10661 newcrush.set_straw_calc_version(value);
10662 } else {
10663 ss << "unrecognized tunable '" << tunable << "'";
10664 err = -EINVAL;
10665 goto reply;
10666 }
10667
10668 if (!validate_crush_against_features(&newcrush, ss)) {
10669 err = -EINVAL;
10670 goto reply;
10671 }
10672
10673 pending_inc.crush.clear();
10674 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10675 ss << "adjusted tunable " << tunable << " to " << value;
10676 getline(ss, rs);
10677 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10678 get_last_committed() + 1));
10679 return true;
10680
// handler: "osd crush rule create-simple <name> <root> <type> [mode]" —
// create a replicated crush rule; mode defaults to "firstn".
10681 } else if (prefix == "osd crush rule create-simple") {
10682 string name, root, type, mode;
10683 cmd_getval(cmdmap, "name", name);
10684 cmd_getval(cmdmap, "root", root);
10685 cmd_getval(cmdmap, "type", type);
10686 cmd_getval(cmdmap, "mode", mode);
10687 if (mode == "")
10688 mode = "firstn";
10689
10690 if (osdmap.crush->rule_exists(name)) {
10691 // The name is uniquely associated to a ruleid and the rule it contains
10692 // From the user point of view, the rule is more meaningfull.
10693 ss << "rule " << name << " already exists";
10694 err = 0;
10695 goto reply;
10696 }
10697
10698 CrushWrapper newcrush;
10699 _get_pending_crush(newcrush);
10700
10701 if (newcrush.rule_exists(name)) {
10702 // The name is uniquely associated to a ruleid and the rule it contains
10703 // From the user point of view, the rule is more meaningfull.
10704 ss << "rule " << name << " already exists";
10705 err = 0;
10706 } else {
// No device-class filter ("") for the simple variant.
10707 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
10708 pg_pool_t::TYPE_REPLICATED, &ss);
10709 if (ruleno < 0) {
10710 err = ruleno;
10711 goto reply;
10712 }
10713
10714 pending_inc.crush.clear();
10715 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10716 }
10717 getline(ss, rs);
10718 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10719 get_last_committed() + 1));
10720 return true;
10721
// handler: "osd crush rule create-replicated <name> <root> <type> [class]" —
// like create-simple but always "firstn" and with an optional device-class
// filter applied to the rule.
10722 } else if (prefix == "osd crush rule create-replicated") {
10723 string name, root, type, device_class;
10724 cmd_getval(cmdmap, "name", name);
10725 cmd_getval(cmdmap, "root", root);
10726 cmd_getval(cmdmap, "type", type);
10727 cmd_getval(cmdmap, "class", device_class);
10728
10729 if (osdmap.crush->rule_exists(name)) {
10730 // The name is uniquely associated to a ruleid and the rule it contains
10731 // From the user point of view, the rule is more meaningfull.
10732 ss << "rule " << name << " already exists";
10733 err = 0;
10734 goto reply;
10735 }
10736
10737 CrushWrapper newcrush;
10738 _get_pending_crush(newcrush);
10739
10740 if (newcrush.rule_exists(name)) {
10741 // The name is uniquely associated to a ruleid and the rule it contains
10742 // From the user point of view, the rule is more meaningfull.
10743 ss << "rule " << name << " already exists";
10744 err = 0;
10745 } else {
10746 int ruleno = newcrush.add_simple_rule(
10747 name, root, type, device_class,
10748 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10749 if (ruleno < 0) {
10750 err = ruleno;
10751 goto reply;
10752 }
10753
10754 pending_inc.crush.clear();
10755 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10756 }
10757 getline(ss, rs);
10758 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10759 get_last_committed() + 1));
10760 return true;
10761
// handler: "osd erasure-code-profile rm <name>" — delete an EC profile
// unless any committed (or pending) pool still uses it.
10762 } else if (prefix == "osd erasure-code-profile rm") {
10763 string name;
10764 cmd_getval(cmdmap, "name", name);
10765
// In use by a pool still being created this epoch: retry after commit.
10766 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10767 goto wait;
10768
10769 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10770 err = -EBUSY;
10771 goto reply;
10772 }
10773
10774 if (osdmap.has_erasure_code_profile(name) ||
10775 pending_inc.new_erasure_code_profiles.count(name)) {
10776 if (osdmap.has_erasure_code_profile(name)) {
10777 pending_inc.old_erasure_code_profiles.push_back(name);
10778 } else {
// Profile only exists in the pending increment: cancel its creation.
10779 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10780 pending_inc.new_erasure_code_profiles.erase(name);
10781 }
10782
10783 getline(ss, rs);
10784 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10785 get_last_committed() + 1));
10786 return true;
10787 } else {
10788 ss << "erasure-code-profile " << name << " does not exist";
10789 err = 0;
10790 goto reply;
10791 }
10792
// handler: "osd erasure-code-profile set <name> [k=v...] [--force]" —
// create or (with --force) overwrite an EC profile; a "plugin" entry is
// mandatory, and profiles are normalized before comparison/storage.
10793 } else if (prefix == "osd erasure-code-profile set") {
10794 string name;
10795 cmd_getval(cmdmap, "name", name);
10796 vector<string> profile;
10797 cmd_getval(cmdmap, "profile", profile);
10798
10799 bool force = false;
10800 cmd_getval(cmdmap, "force", force);
10801
10802 map<string,string> profile_map;
10803 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10804 if (err)
10805 goto reply;
10806 if (profile_map.find("plugin") == profile_map.end()) {
10807 ss << "erasure-code-profile " << profile_map
10808 << " must contain a plugin entry" << std::endl;
10809 err = -EINVAL;
10810 goto reply;
10811 }
10812 string plugin = profile_map["plugin"];
10813
// A same-named profile already queued this epoch: retry after it commits.
10814 if (pending_inc.has_erasure_code_profile(name)) {
10815 dout(20) << "erasure code profile " << name << " try again" << dendl;
10816 goto wait;
10817 } else {
10818 err = normalize_profile(name, profile_map, force, &ss);
10819 if (err)
10820 goto reply;
10821
10822 if (osdmap.has_erasure_code_profile(name)) {
10823 ErasureCodeProfile existing_profile_map =
10824 osdmap.get_erasure_code_profile(name);
10825 err = normalize_profile(name, existing_profile_map, force, &ss);
10826 if (err)
10827 goto reply;
10828
// Identical after normalization: nothing to do (idempotent success).
10829 if (existing_profile_map == profile_map) {
10830 err = 0;
10831 goto reply;
10832 }
10833 if (!force) {
10834 err = -EPERM;
10835 ss << "will not override erasure code profile " << name
10836 << " because the existing profile "
10837 << existing_profile_map
10838 << " is different from the proposed profile "
10839 << profile_map;
10840 goto reply;
10841 }
10842 }
10843
10844 dout(20) << "erasure code profile set " << name << "="
10845 << profile_map << dendl;
10846 pending_inc.set_erasure_code_profile(name, profile_map);
10847 }
10848
10849 getline(ss, rs);
10850 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10851 get_last_committed() + 1));
10852 return true;
10853
10854 } else if (prefix == "osd crush rule create-erasure") {
10855 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
10856 if (err == -EAGAIN)
10857 goto wait;
10858 if (err)
10859 goto reply;
10860 string name, poolstr;
10861 cmd_getval(cmdmap, "name", name);
10862 string profile;
10863 cmd_getval(cmdmap, "profile", profile);
10864 if (profile == "")
10865 profile = "default";
10866 if (profile == "default") {
10867 if (!osdmap.has_erasure_code_profile(profile)) {
10868 if (pending_inc.has_erasure_code_profile(profile)) {
10869 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
10870 goto wait;
10871 }
10872
10873 map<string,string> profile_map;
10874 err = osdmap.get_erasure_code_profile_default(cct,
10875 profile_map,
10876 &ss);
10877 if (err)
10878 goto reply;
10879 err = normalize_profile(name, profile_map, true, &ss);
10880 if (err)
10881 goto reply;
10882 dout(20) << "erasure code profile set " << profile << "="
10883 << profile_map << dendl;
10884 pending_inc.set_erasure_code_profile(profile, profile_map);
10885 goto wait;
10886 }
10887 }
10888
10889 int rule;
10890 err = crush_rule_create_erasure(name, profile, &rule, &ss);
10891 if (err < 0) {
10892 switch(err) {
10893 case -EEXIST: // return immediately
10894 ss << "rule " << name << " already exists";
10895 err = 0;
10896 goto reply;
10897 break;
10898 case -EALREADY: // wait for pending to be proposed
10899 ss << "rule " << name << " already exists";
10900 err = 0;
10901 break;
10902 default: // non recoverable error
10903 goto reply;
10904 break;
10905 }
10906 } else {
10907 ss << "created rule " << name << " at " << rule;
10908 }
10909
10910 getline(ss, rs);
10911 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10912 get_last_committed() + 1));
10913 return true;
10914
  } else if (prefix == "osd crush rule rm") {
    // Remove a CRUSH rule by name, refusing if any pool still uses it.
    string name;
    cmd_getval(cmdmap, "name", name);

    // Absent from the committed map: idempotent success.
    if (!osdmap.crush->rule_exists(name)) {
      ss << "rule " << name << " does not exist";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.rule_exists(name)) {
      // Already removed in the pending crush map; nothing more to do.
      ss << "rule " << name << " does not exist";
      err = 0;
    } else {
      int ruleno = newcrush.get_rule_id(name);
      ceph_assert(ruleno >= 0);

      // make sure it is not in use.
      // FIXME: this is ok in some situations, but let's not bother with that
      // complexity now.
      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
      if (osdmap.crush_rule_in_use(ruleset)) {
        ss << "crush ruleset " << name << " " << ruleset << " is in use";
        err = -EBUSY;
        goto reply;
      }

      err = newcrush.remove_rule(ruleno);
      if (err < 0) {
        goto reply;
      }

      // Stage the full updated crush map in the pending increment.
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rename") {
    // Rename a CRUSH rule.  Designed to be idempotent across replays.
    string srcname;
    string dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);
    if (srcname.empty() || dstname.empty()) {
      ss << "must specify both source rule name and destination rule name";
      err = -EINVAL;
      goto reply;
    }
    if (srcname == dstname) {
      ss << "destination rule name is equal to source rule name";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
      // srcname does not exist and dstname already exists
      // suppose this is a replay and return success
      // (so this command is idempotent)
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_rule(srcname, dstname, &ss);
    if (err < 0) {
      // ss has reason for failure
      goto reply;
    }
    // Stage the full updated crush map in the pending increment.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd setmaxosd") {
    // Set max_osd (the highest allowed OSD id + 1, not the OSD count).
    int64_t newmax;
    if (!cmd_getval(cmdmap, "newmax", newmax)) {
      ss << "unable to parse 'newmax' value '"
         << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
      err = -EINVAL;
      goto reply;
    }

    // Upper bound comes from the mon_max_osd config option.
    if (newmax > g_conf()->mon_max_osd) {
      err = -ERANGE;
      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
         << g_conf()->mon_max_osd << ")";
      goto reply;
    }

    // Don't allow shrinking OSD number as this will cause data loss
    // and may cause kernel crashes.
    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
    if (newmax < osdmap.get_max_osd()) {
      // Check if the OSDs exist between current max and new value.
      // If there are any OSDs exist, then don't allow shrinking number
      // of OSDs.
      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
        if (osdmap.exists(i)) {
          err = -EBUSY;
          ss << "cannot shrink max_osd to " << newmax
             << " because osd." << i << " (and possibly others) still in use";
          goto reply;
        }
      }
    }

    pending_inc.new_max_osd = newmax;
    ss << "set new max_osd = " << pending_inc.new_max_osd;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd set-full-ratio" ||
             prefix == "osd set-backfillfull-ratio" ||
             prefix == "osd set-nearfull-ratio") {
    // Update one of the cluster-wide fullness thresholds.
    // NOTE(review): no range validation here — values outside [0,1] are
    // accepted as-is; confirm whether that is intentional.
    double n;
    if (!cmd_getval(cmdmap, "ratio", n)) {
      ss << "unable to parse 'ratio' value '"
         << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
      err = -EINVAL;
      goto reply;
    }
    if (prefix == "osd set-full-ratio")
      pending_inc.new_full_ratio = n;
    else if (prefix == "osd set-backfillfull-ratio")
      pending_inc.new_backfillfull_ratio = n;
    else if (prefix == "osd set-nearfull-ratio")
      pending_inc.new_nearfull_ratio = n;
    ss << prefix << " " << n;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // Raise (or set) the minimum client release allowed to connect.
    string v;
    cmd_getval(cmdmap, "version", v);
    ceph_release_t vno = ceph_release_from_name(v);
    if (!vno) {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // Apply the pending increment to a scratch map so the check below
    // reflects what the map will look like after this proposal.
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = vno;
    auto mvno = newmap.get_min_compat_client();
    // Features already in use in the map impose a floor on this setting.
    if (vno < mvno) {
      ss << "osdmap current utilizes features that require " << mvno
         << "; cannot set require_min_compat_client below that to " << vno;
      err = -EPERM;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      // Without --yes-i-really-mean-it, refuse if any currently connected
      // client/mds/mgr daemon appears to lack the required features.
      FeatureMap m;
      mon->get_combined_feature_map(&m);
      uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
      bool first = true;
      bool ok = true;
      for (int type : {
            CEPH_ENTITY_TYPE_CLIENT,
            CEPH_ENTITY_TYPE_MDS,
            CEPH_ENTITY_TYPE_MGR }) {
        auto p = m.m.find(type);
        if (p == m.m.end()) {
          continue;
        }
        for (auto& q : p->second) {
          // Feature bits required by the release but absent in this
          // connected entity's feature set.
          uint64_t missing = ~q.first & features;
          if (missing) {
            if (first) {
              ss << "cannot set require_min_compat_client to " << v << ": ";
            } else {
              ss << "; ";
            }
            first = false;
            ss << q.second << " connected " << ceph_entity_type_name(type)
               << "(s) look like " << ceph_release_name(
                 ceph_release_from_features(q.first))
               << " (missing 0x" << std::hex << missing << std::dec << ")";
            ok = false;
          }
        }
      }
      if (!ok) {
        ss << "; add --yes-i-really-mean-it to do it anyway";
        err = -EPERM;
        goto reply;
      }
    }
    ss << "set require_min_compat_client to " << vno;
    pending_inc.new_require_min_compat_client = vno;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    // Pause both reads and writes cluster-wide.
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    // Resume both reads and writes cluster-wide.
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd set") {
    // Set a cluster-wide OSDMap flag by name.
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);

    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else if (key == "pglog_hardlimit") {
      // Irreversible-feature flag: requires up OSDs (or --yes-i-really-mean-it)
      // so we can verify feature support before committing.
      if (!osdmap.get_num_up_osds() && !sure) {
        ss << "Not advisable to continue since no OSDs are up. Pass "
           << "--yes-i-really-mean-it if you really wish to continue.";
        err = -EPERM;
        goto reply;
      }
      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
      // we are reusing a jewel feature bit that was retired in luminous.
      if (osdmap.require_osd_release >= ceph_release_t::luminous &&
          (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
           || sure)) {
        return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
      } else {
        ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd unset") {
    // Clear a cluster-wide OSDMap flag by name.  Note: pglog_hardlimit is
    // not listed here; once set it is not cleared via this command.
    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd require-osd-release") {
    // Raise require_osd_release to the named release after verifying that
    // all mons (and, unless --yes-i-really-mean-it, all up OSDs) support it.
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply;
    }
    ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
    if (!osdmap.get_num_up_osds() && !sure) {
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply;
    }
    // Per-release gates: all mons must advertise the release's mon feature,
    // and all up OSDs must advertise the matching server feature bit.
    if (rel == ceph_release_t::mimic) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_MIMIC)) {
        ss << "not all mons are mimic";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::nautilus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_NAUTILUS)) {
        ss << "not all mons are nautilus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::octopus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_OCTOPUS)) {
        ss << "not all mons are octopus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "not supported for this release yet";
      err = -EPERM;
      goto reply;
    }
    // This setting is a ratchet; it can only move forward.
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
11288 } else if (prefix == "osd down" ||
11289 prefix == "osd out" ||
11290 prefix == "osd in" ||
11291 prefix == "osd rm" ||
11292 prefix == "osd stop") {
11293
11294 bool any = false;
11295 bool stop = false;
11296 bool verbose = true;
11297 bool definitely_dead = false;
11298
11299 vector<string> idvec;
11300 cmd_getval(cmdmap, "ids", idvec);
11301 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11302 derr << "definitely_dead " << (int)definitely_dead << dendl;
11303 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11304 set<int> osds;
11305
11306 // wildcard?
11307 if (j == 0 &&
11308 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11309 if (prefix == "osd in") {
11310 // touch out osds only
11311 osdmap.get_out_existing_osds(osds);
11312 } else {
11313 osdmap.get_all_osds(osds);
11314 }
11315 stop = true;
11316 verbose = false; // so the output is less noisy.
11317 } else {
11318 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11319 if (osd < 0) {
11320 ss << "invalid osd id" << osd;
11321 err = -EINVAL;
11322 continue;
11323 } else if (!osdmap.exists(osd)) {
11324 ss << "osd." << osd << " does not exist. ";
11325 continue;
11326 }
11327
11328 osds.insert(osd);
11329 }
11330
11331 for (auto &osd : osds) {
11332 if (prefix == "osd down") {
11333 if (osdmap.is_down(osd)) {
11334 if (verbose)
11335 ss << "osd." << osd << " is already down. ";
11336 } else {
11337 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11338 ss << "marked down osd." << osd << ". ";
11339 any = true;
11340 }
11341 if (definitely_dead) {
11342 if (!pending_inc.new_xinfo.count(osd)) {
11343 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11344 }
11345 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11346 any = true;
11347 }
11348 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11349 }
11350 } else if (prefix == "osd out") {
11351 if (osdmap.is_out(osd)) {
11352 if (verbose)
11353 ss << "osd." << osd << " is already out. ";
11354 } else {
11355 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11356 if (osdmap.osd_weight[osd]) {
11357 if (pending_inc.new_xinfo.count(osd) == 0) {
11358 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11359 }
11360 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11361 }
11362 ss << "marked out osd." << osd << ". ";
11363 std::ostringstream msg;
11364 msg << "Client " << op->get_session()->entity_name
11365 << " marked osd." << osd << " out";
11366 if (osdmap.is_up(osd)) {
11367 msg << ", while it was still marked up";
11368 } else {
11369 auto period = ceph_clock_now() - down_pending_out[osd];
11370 msg << ", after it was down for " << int(period.sec())
11371 << " seconds";
11372 }
11373
11374 mon->clog->info() << msg.str();
11375 any = true;
11376 }
11377 } else if (prefix == "osd in") {
11378 if (osdmap.is_in(osd)) {
11379 if (verbose)
11380 ss << "osd." << osd << " is already in. ";
11381 } else {
11382 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11383 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11384 if (pending_inc.new_xinfo.count(osd) == 0) {
11385 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11386 }
11387 pending_inc.new_xinfo[osd].old_weight = 0;
11388 } else {
11389 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11390 }
11391 ss << "marked in osd." << osd << ". ";
11392 any = true;
11393 }
11394 } else if (prefix == "osd rm") {
11395 err = prepare_command_osd_remove(osd);
11396
11397 if (err == -EBUSY) {
11398 if (any)
11399 ss << ", ";
11400 ss << "osd." << osd << " is still up; must be down before removal. ";
11401 } else {
11402 ceph_assert(err == 0);
11403 if (any) {
11404 ss << ", osd." << osd;
11405 } else {
11406 ss << "removed osd." << osd;
11407 }
11408 any = true;
11409 }
11410 } else if (prefix == "osd stop") {
11411 if (osdmap.is_stop(osd)) {
11412 if (verbose)
11413 ss << "osd." << osd << " is already stopped. ";
11414 } else if (osdmap.is_down(osd)) {
11415 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11416 ss << "stop down osd." << osd << ". ";
11417 any = true;
11418 } else {
11419 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11420 ss << "stop osd." << osd << ". ";
11421 any = true;
11422 }
11423 }
11424 }
11425 }
11426 if (any) {
11427 getline(ss, rs);
11428 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11429 get_last_committed() + 1));
11430 return true;
11431 }
11432 } else if (prefix == "osd set-group" ||
11433 prefix == "osd unset-group" ||
11434 prefix == "osd add-noup" ||
11435 prefix == "osd add-nodown" ||
11436 prefix == "osd add-noin" ||
11437 prefix == "osd add-noout" ||
11438 prefix == "osd rm-noup" ||
11439 prefix == "osd rm-nodown" ||
11440 prefix == "osd rm-noin" ||
11441 prefix == "osd rm-noout") {
11442 bool do_set = prefix == "osd set-group" ||
11443 prefix.find("add") != string::npos;
11444 string flag_str;
11445 unsigned flags = 0;
11446 vector<string> who;
11447 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11448 cmd_getval(cmdmap, "flags", flag_str);
11449 cmd_getval(cmdmap, "who", who);
11450 vector<string> raw_flags;
11451 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11452 for (auto& f : raw_flags) {
11453 if (f == "noup")
11454 flags |= CEPH_OSD_NOUP;
11455 else if (f == "nodown")
11456 flags |= CEPH_OSD_NODOWN;
11457 else if (f == "noin")
11458 flags |= CEPH_OSD_NOIN;
11459 else if (f == "noout")
11460 flags |= CEPH_OSD_NOOUT;
11461 else {
11462 ss << "unrecognized flag '" << f << "', must be one of "
11463 << "{noup,nodown,noin,noout}";
11464 err = -EINVAL;
11465 goto reply;
11466 }
11467 }
11468 } else {
11469 cmd_getval(cmdmap, "ids", who);
11470 if (prefix.find("noup") != string::npos)
11471 flags = CEPH_OSD_NOUP;
11472 else if (prefix.find("nodown") != string::npos)
11473 flags = CEPH_OSD_NODOWN;
11474 else if (prefix.find("noin") != string::npos)
11475 flags = CEPH_OSD_NOIN;
11476 else if (prefix.find("noout") != string::npos)
11477 flags = CEPH_OSD_NOOUT;
11478 else
11479 ceph_assert(0 == "Unreachable!");
11480 }
11481 if (flags == 0) {
11482 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11483 err = -EINVAL;
11484 goto reply;
11485 }
11486 if (who.empty()) {
11487 ss << "must specify at least one or more targets to set/unset";
11488 err = -EINVAL;
11489 goto reply;
11490 }
11491 set<int> osds;
11492 set<int> crush_nodes;
11493 set<int> device_classes;
11494 for (auto& w : who) {
11495 if (w == "any" || w == "all" || w == "*") {
11496 osdmap.get_all_osds(osds);
11497 break;
11498 }
11499 std::stringstream ts;
11500 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11501 osds.insert(osd);
11502 } else if (osdmap.crush->name_exists(w)) {
11503 crush_nodes.insert(osdmap.crush->get_item_id(w));
11504 } else if (osdmap.crush->class_exists(w)) {
11505 device_classes.insert(osdmap.crush->get_class_id(w));
11506 } else {
11507 ss << "unable to parse osd id or crush node or device class: "
11508 << "\"" << w << "\". ";
11509 }
11510 }
11511 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11512 // ss has reason for failure
11513 err = -EINVAL;
11514 goto reply;
11515 }
11516 bool any = false;
11517 for (auto osd : osds) {
11518 if (!osdmap.exists(osd)) {
11519 ss << "osd." << osd << " does not exist. ";
11520 continue;
11521 }
11522 if (do_set) {
11523 if (flags & CEPH_OSD_NOUP) {
11524 any |= osdmap.is_noup_by_osd(osd) ?
11525 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11526 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11527 }
11528 if (flags & CEPH_OSD_NODOWN) {
11529 any |= osdmap.is_nodown_by_osd(osd) ?
11530 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11531 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11532 }
11533 if (flags & CEPH_OSD_NOIN) {
11534 any |= osdmap.is_noin_by_osd(osd) ?
11535 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11536 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11537 }
11538 if (flags & CEPH_OSD_NOOUT) {
11539 any |= osdmap.is_noout_by_osd(osd) ?
11540 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11541 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11542 }
11543 } else {
11544 if (flags & CEPH_OSD_NOUP) {
11545 any |= osdmap.is_noup_by_osd(osd) ?
11546 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11547 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11548 }
11549 if (flags & CEPH_OSD_NODOWN) {
11550 any |= osdmap.is_nodown_by_osd(osd) ?
11551 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11552 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11553 }
11554 if (flags & CEPH_OSD_NOIN) {
11555 any |= osdmap.is_noin_by_osd(osd) ?
11556 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11557 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11558 }
11559 if (flags & CEPH_OSD_NOOUT) {
11560 any |= osdmap.is_noout_by_osd(osd) ?
11561 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11562 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11563 }
11564 }
11565 }
11566 for (auto& id : crush_nodes) {
11567 auto old_flags = osdmap.get_crush_node_flags(id);
11568 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11569 pending_flags |= old_flags; // adopt existing flags first!
11570 if (do_set) {
11571 pending_flags |= flags;
11572 } else {
11573 pending_flags &= ~flags;
11574 }
11575 any = true;
11576 }
11577 for (auto& id : device_classes) {
11578 auto old_flags = osdmap.get_device_class_flags(id);
11579 auto& pending_flags = pending_inc.new_device_class_flags[id];
11580 pending_flags |= old_flags;
11581 if (do_set) {
11582 pending_flags |= flags;
11583 } else {
11584 pending_flags &= ~flags;
11585 }
11586 any = true;
11587 }
11588 if (any) {
11589 getline(ss, rs);
11590 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11591 get_last_committed() + 1));
11592 return true;
11593 }
  } else if (prefix == "osd pg-temp") {
    // Set (or clear, when no ids are given) the pg_temp mapping for a PG.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // A pg_temp change for this PG is already staged; retry after it lands.
    if (pending_inc.new_pg_temp.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty()) {
      // Empty id list means "clear the pg_temp mapping".
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    // The temp acting set must still satisfy the pool's size bounds.
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Set a temporary primary for a PG (id == -1 clears the override).
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // primary_temp requires clients that understand it (firefly or later).
    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to repeer by perturbing its pg_temp mapping.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change. Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        // Skip the current primary and any OSD that is not up/existing.
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
11746 } else if (prefix == "osd pg-upmap" ||
11747 prefix == "osd rm-pg-upmap" ||
11748 prefix == "osd pg-upmap-items" ||
11749 prefix == "osd rm-pg-upmap-items") {
11750 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
11751 ss << "min_compat_client "
11752 << osdmap.require_min_compat_client
11753 << " < luminous, which is required for pg-upmap. "
11754 << "Try 'ceph osd set-require-min-compat-client luminous' "
11755 << "before using the new interface";
11756 err = -EPERM;
11757 goto reply;
11758 }
11759 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11760 if (err == -EAGAIN)
11761 goto wait;
11762 if (err < 0)
11763 goto reply;
11764 string pgidstr;
11765 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11766 ss << "unable to parse 'pgid' value '"
11767 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11768 err = -EINVAL;
11769 goto reply;
11770 }
11771 pg_t pgid;
11772 if (!pgid.parse(pgidstr.c_str())) {
11773 ss << "invalid pgid '" << pgidstr << "'";
11774 err = -EINVAL;
11775 goto reply;
11776 }
11777 if (!osdmap.pg_exists(pgid)) {
11778 ss << "pg " << pgid << " does not exist";
11779 err = -ENOENT;
11780 goto reply;
11781 }
11782 if (pending_inc.old_pools.count(pgid.pool())) {
11783 ss << "pool of " << pgid << " is pending removal";
11784 err = -ENOENT;
11785 getline(ss, rs);
11786 wait_for_finished_proposal(op,
11787 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11788 return true;
11789 }
11790
11791 enum {
11792 OP_PG_UPMAP,
11793 OP_RM_PG_UPMAP,
11794 OP_PG_UPMAP_ITEMS,
11795 OP_RM_PG_UPMAP_ITEMS,
11796 } option;
11797
11798 if (prefix == "osd pg-upmap") {
11799 option = OP_PG_UPMAP;
11800 } else if (prefix == "osd rm-pg-upmap") {
11801 option = OP_RM_PG_UPMAP;
11802 } else if (prefix == "osd pg-upmap-items") {
11803 option = OP_PG_UPMAP_ITEMS;
11804 } else {
11805 option = OP_RM_PG_UPMAP_ITEMS;
11806 }
11807
11808 // check pending upmap changes
11809 switch (option) {
11810 case OP_PG_UPMAP: // fall through
11811 case OP_RM_PG_UPMAP:
11812 if (pending_inc.new_pg_upmap.count(pgid) ||
11813 pending_inc.old_pg_upmap.count(pgid)) {
11814 dout(10) << __func__ << " waiting for pending update on "
11815 << pgid << dendl;
11816 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11817 return true;
11818 }
11819 break;
11820
11821 case OP_PG_UPMAP_ITEMS: // fall through
11822 case OP_RM_PG_UPMAP_ITEMS:
11823 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11824 pending_inc.old_pg_upmap_items.count(pgid)) {
11825 dout(10) << __func__ << " waiting for pending update on "
11826 << pgid << dendl;
11827 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11828 return true;
11829 }
11830 break;
11831
11832 default:
11833 ceph_abort_msg("invalid option");
11834 }
11835
11836 switch (option) {
11837 case OP_PG_UPMAP:
11838 {
11839 vector<int64_t> id_vec;
11840 if (!cmd_getval(cmdmap, "id", id_vec)) {
11841 ss << "unable to parse 'id' value(s) '"
11842 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11843 err = -EINVAL;
11844 goto reply;
11845 }
11846
11847 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11848 if ((int)id_vec.size() < pool_min_size) {
11849 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11850 << pool_min_size << ")";
11851 err = -EINVAL;
11852 goto reply;
11853 }
11854
11855 int pool_size = osdmap.get_pg_pool_size(pgid);
11856 if ((int)id_vec.size() > pool_size) {
11857 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11858 << pool_size << ")";
11859 err = -EINVAL;
11860 goto reply;
11861 }
11862
11863 vector<int32_t> new_pg_upmap;
11864 for (auto osd : id_vec) {
11865 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11866 ss << "osd." << osd << " does not exist";
11867 err = -ENOENT;
11868 goto reply;
11869 }
11870 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11871 if (it != new_pg_upmap.end()) {
11872 ss << "osd." << osd << " already exists, ";
11873 continue;
11874 }
11875 new_pg_upmap.push_back(osd);
11876 }
11877
11878 if (new_pg_upmap.empty()) {
11879 ss << "no valid upmap items(pairs) is specified";
11880 err = -EINVAL;
11881 goto reply;
11882 }
11883
11884 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11885 new_pg_upmap.begin(), new_pg_upmap.end());
11886 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
11887 }
11888 break;
11889
11890 case OP_RM_PG_UPMAP:
11891 {
11892 pending_inc.old_pg_upmap.insert(pgid);
11893 ss << "clear " << pgid << " pg_upmap mapping";
11894 }
11895 break;
11896
    case OP_PG_UPMAP_ITEMS:
      {
        // Parse a flat list of osd ids as (from, to) remap pairs for this
        // PG's pg_upmap_items entry in the pending increment.
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // Ids must come in (from, to) pairs.
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply;
        }

        // More remap pairs than the pool has replicas cannot all apply.
        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        // 'items' accumulates a human-readable "[from->to,...]" summary
        // for the reply message.
        ostringstream items;
        items << "[";
        // Two-at-a-time walk: '*p++' consumes the "from" id and leaves p
        // on the "to" id; the loop's own ++p then advances to the next
        // pair.  Safe because id_vec.size() was verified even above.
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
          int from = *p++;
          int to = *p;
          if (from == to) {
            // No-op mapping; skip it but mention it in the reply.
            ss << "from osd." << from << " == to osd." << to << ", ";
            continue;
          }
          if (!osdmap.exists(from)) {
            ss << "osd." << from << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          // CRUSH_ITEM_NONE is explicitly permitted as a "to" target.
          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
            ss << "osd." << to << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          pair<int32_t,int32_t> entry = make_pair(from, to);
          // Drop exact duplicate pairs, keeping the first occurrence.
          auto it = std::find(new_pg_upmap_items.begin(),
            new_pg_upmap_items.end(), entry);
          if (it != new_pg_upmap_items.end()) {
            ss << "osd." << from << " -> osd." << to << " already exists, ";
            continue;
          }
          new_pg_upmap_items.push_back(entry);
          items << from << "->" << to << ",";
        }
        string out(items.str());
        out.resize(out.size() - 1); // drop last ','
        out += "]";
        // NOTE(review): if every pair was filtered out, 'out' is derived
        // from just "[" here, but it is never printed in that case -- the
        // empty check below replies with -EINVAL first.

        if (new_pg_upmap_items.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap_items[pgid] =
          mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
      }
      break;
11967
11968 case OP_RM_PG_UPMAP_ITEMS:
11969 {
11970 pending_inc.old_pg_upmap_items.insert(pgid);
11971 ss << "clear " << pgid << " pg_upmap_items mapping";
11972 }
11973 break;
11974
11975 default:
11976 ceph_abort_msg("invalid option");
11977 }
11978
11979 goto update;
11980 } else if (prefix == "osd primary-affinity") {
11981 int64_t id;
11982 if (!cmd_getval(cmdmap, "id", id)) {
11983 ss << "invalid osd id value '"
11984 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11985 err = -EINVAL;
11986 goto reply;
11987 }
11988 double w;
11989 if (!cmd_getval(cmdmap, "weight", w)) {
11990 ss << "unable to parse 'weight' value '"
11991 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11992 err = -EINVAL;
11993 goto reply;
11994 }
11995 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11996 if (ww < 0L) {
11997 ss << "weight must be >= 0";
11998 err = -EINVAL;
11999 goto reply;
12000 }
12001 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12002 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12003 ss << "require_min_compat_client "
12004 << osdmap.require_min_compat_client
12005 << " < firefly, which is required for primary-affinity";
12006 err = -EPERM;
12007 goto reply;
12008 }
12009 if (osdmap.exists(id)) {
12010 pending_inc.new_primary_affinity[id] = ww;
12011 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
12012 getline(ss, rs);
12013 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12014 get_last_committed() + 1));
12015 return true;
12016 } else {
12017 ss << "osd." << id << " does not exist";
12018 err = -ENOENT;
12019 goto reply;
12020 }
12021 } else if (prefix == "osd reweight") {
12022 int64_t id;
12023 if (!cmd_getval(cmdmap, "id", id)) {
12024 ss << "unable to parse osd id value '"
12025 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12026 err = -EINVAL;
12027 goto reply;
12028 }
12029 double w;
12030 if (!cmd_getval(cmdmap, "weight", w)) {
12031 ss << "unable to parse weight value '"
12032 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12033 err = -EINVAL;
12034 goto reply;
12035 }
12036 long ww = (int)((double)CEPH_OSD_IN*w);
12037 if (ww < 0L) {
12038 ss << "weight must be >= 0";
12039 err = -EINVAL;
12040 goto reply;
12041 }
12042 if (osdmap.exists(id)) {
12043 pending_inc.new_weight[id] = ww;
12044 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12045 getline(ss, rs);
12046 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12047 get_last_committed() + 1));
12048 return true;
12049 } else {
12050 ss << "osd." << id << " does not exist";
12051 err = -ENOENT;
12052 goto reply;
12053 }
12054 } else if (prefix == "osd reweightn") {
12055 map<int32_t, uint32_t> weights;
12056 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12057 if (err) {
12058 ss << "unable to parse 'weights' value '"
12059 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12060 goto reply;
12061 }
12062 pending_inc.new_weight.insert(weights.begin(), weights.end());
12063 wait_for_finished_proposal(
12064 op,
12065 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12066 return true;
12067 } else if (prefix == "osd lost") {
12068 int64_t id;
12069 if (!cmd_getval(cmdmap, "id", id)) {
12070 ss << "unable to parse osd id value '"
12071 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12072 err = -EINVAL;
12073 goto reply;
12074 }
12075 bool sure = false;
12076 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12077 if (!sure) {
12078 ss << "are you SURE? this might mean real, permanent data loss. pass "
12079 "--yes-i-really-mean-it if you really do.";
12080 err = -EPERM;
12081 goto reply;
12082 } else if (!osdmap.exists(id)) {
12083 ss << "osd." << id << " does not exist";
12084 err = -ENOENT;
12085 goto reply;
12086 } else if (!osdmap.is_down(id)) {
12087 ss << "osd." << id << " is not down";
12088 err = -EBUSY;
12089 goto reply;
12090 } else {
12091 epoch_t e = osdmap.get_info(id).down_at;
12092 pending_inc.new_lost[id] = e;
12093 ss << "marked osd lost in epoch " << e;
12094 getline(ss, rs);
12095 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12096 get_last_committed() + 1));
12097 return true;
12098 }
12099
12100 } else if (prefix == "osd destroy-actual" ||
12101 prefix == "osd purge-actual" ||
12102 prefix == "osd purge-new") {
12103 /* Destroying an OSD means that we don't expect to further make use of
12104 * the OSDs data (which may even become unreadable after this operation),
12105 * and that we are okay with scrubbing all its cephx keys and config-key
12106 * data (which may include lockbox keys, thus rendering the osd's data
12107 * unreadable).
12108 *
12109 * The OSD will not be removed. Instead, we will mark it as destroyed,
12110 * such that a subsequent call to `create` will not reuse the osd id.
12111 * This will play into being able to recreate the OSD, at the same
12112 * crush location, with minimal data movement.
12113 */
12114
12115 // make sure authmon is writeable.
12116 if (!mon->authmon()->is_writeable()) {
12117 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12118 << "osd destroy" << dendl;
12119 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12120 return false;
12121 }
12122
12123 int64_t id;
12124 if (!cmd_getval(cmdmap, "id", id)) {
12125 auto p = cmdmap.find("id");
12126 if (p == cmdmap.end()) {
12127 ss << "no osd id specified";
12128 } else {
12129 ss << "unable to parse osd id value '"
12130 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12131 }
12132 err = -EINVAL;
12133 goto reply;
12134 }
12135
12136 bool is_destroy = (prefix == "osd destroy-actual");
12137 if (!is_destroy) {
12138 ceph_assert("osd purge-actual" == prefix ||
12139 "osd purge-new" == prefix);
12140 }
12141
12142 bool sure = false;
12143 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12144 if (!sure) {
12145 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12146 << "This will mean real, permanent data loss, as well "
12147 << "as deletion of cephx and lockbox keys. "
12148 << "Pass --yes-i-really-mean-it if you really do.";
12149 err = -EPERM;
12150 goto reply;
12151 } else if (!osdmap.exists(id)) {
12152 ss << "osd." << id << " does not exist";
12153 err = 0; // idempotent
12154 goto reply;
12155 } else if (osdmap.is_up(id)) {
12156 ss << "osd." << id << " is not `down`.";
12157 err = -EBUSY;
12158 goto reply;
12159 } else if (is_destroy && osdmap.is_destroyed(id)) {
12160 ss << "destroyed osd." << id;
12161 err = 0;
12162 goto reply;
12163 }
12164
12165 if (prefix == "osd purge-new" &&
12166 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12167 ss << "osd." << id << " is not new";
12168 err = -EPERM;
12169 goto reply;
12170 }
12171
12172 bool goto_reply = false;
12173
12174 paxos->plug();
12175 if (is_destroy) {
12176 err = prepare_command_osd_destroy(id, ss);
12177 // we checked above that it should exist.
12178 ceph_assert(err != -ENOENT);
12179 } else {
12180 err = prepare_command_osd_purge(id, ss);
12181 if (err == -ENOENT) {
12182 err = 0;
12183 ss << "osd." << id << " does not exist.";
12184 goto_reply = true;
12185 }
12186 }
12187 paxos->unplug();
12188
12189 if (err < 0 || goto_reply) {
12190 goto reply;
12191 }
12192
12193 if (is_destroy) {
12194 ss << "destroyed osd." << id;
12195 } else {
12196 ss << "purged osd." << id;
12197 }
12198
12199 getline(ss, rs);
12200 wait_for_finished_proposal(op,
12201 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12202 force_immediate_propose();
12203 return true;
12204
12205 } else if (prefix == "osd new") {
12206
12207 // make sure authmon is writeable.
12208 if (!mon->authmon()->is_writeable()) {
12209 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12210 << "osd new" << dendl;
12211 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12212 return false;
12213 }
12214
12215 map<string,string> param_map;
12216
12217 bufferlist bl = m->get_data();
12218 string param_json = bl.to_str();
12219 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12220
12221 err = get_json_str_map(param_json, ss, &param_map);
12222 if (err < 0)
12223 goto reply;
12224
12225 dout(20) << __func__ << " osd new params " << param_map << dendl;
12226
12227 paxos->plug();
12228 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12229 paxos->unplug();
12230
12231 if (err < 0) {
12232 goto reply;
12233 }
12234
12235 if (f) {
12236 f->flush(rdata);
12237 } else {
12238 rdata.append(ss);
12239 }
12240
12241 if (err == EEXIST) {
12242 // idempotent operation
12243 err = 0;
12244 goto reply;
12245 }
12246
12247 wait_for_finished_proposal(op,
12248 new Monitor::C_Command(mon, op, 0, rs, rdata,
12249 get_last_committed() + 1));
12250 force_immediate_propose();
12251 return true;
12252
12253 } else if (prefix == "osd create") {
12254
12255 // optional id provided?
12256 int64_t id = -1, cmd_id = -1;
12257 if (cmd_getval(cmdmap, "id", cmd_id)) {
12258 if (cmd_id < 0) {
12259 ss << "invalid osd id value '" << cmd_id << "'";
12260 err = -EINVAL;
12261 goto reply;
12262 }
12263 dout(10) << " osd create got id " << cmd_id << dendl;
12264 }
12265
12266 uuid_d uuid;
12267 string uuidstr;
12268 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12269 if (!uuid.parse(uuidstr.c_str())) {
12270 ss << "invalid uuid value '" << uuidstr << "'";
12271 err = -EINVAL;
12272 goto reply;
12273 }
12274 // we only care about the id if we also have the uuid, to
12275 // ensure the operation's idempotency.
12276 id = cmd_id;
12277 }
12278
12279 int32_t new_id = -1;
12280 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12281 if (err < 0) {
12282 if (err == -EAGAIN) {
12283 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12284 return true;
12285 }
12286 // a check has failed; reply to the user.
12287 goto reply;
12288
12289 } else if (err == EEXIST) {
12290 // this is an idempotent operation; we can go ahead and reply.
12291 if (f) {
12292 f->open_object_section("created_osd");
12293 f->dump_int("osdid", new_id);
12294 f->close_section();
12295 f->flush(rdata);
12296 } else {
12297 ss << new_id;
12298 rdata.append(ss);
12299 }
12300 err = 0;
12301 goto reply;
12302 }
12303
12304 string empty_device_class;
12305 do_osd_create(id, uuid, empty_device_class, &new_id);
12306
12307 if (f) {
12308 f->open_object_section("created_osd");
12309 f->dump_int("osdid", new_id);
12310 f->close_section();
12311 f->flush(rdata);
12312 } else {
12313 ss << new_id;
12314 rdata.append(ss);
12315 }
12316 wait_for_finished_proposal(op,
12317 new Monitor::C_Command(mon, op, 0, rs, rdata,
12318 get_last_committed() + 1));
12319 return true;
12320
12321 } else if (prefix == "osd blacklist clear") {
12322 pending_inc.new_blacklist.clear();
12323 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12324 osdmap.get_blacklist(&blacklist);
12325 for (const auto &entry : blacklist) {
12326 pending_inc.old_blacklist.push_back(entry.first);
12327 }
12328 ss << " removed all blacklist entries";
12329 getline(ss, rs);
12330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12331 get_last_committed() + 1));
12332 return true;
  } else if (prefix == "osd blacklist") {
    // Add or remove an entry in the OSDMap blacklist for a client address.
    string addrstr;
    cmd_getval(cmdmap, "addr", addrstr);
    entity_addr_t addr;
    if (!addr.parse(addrstr.c_str(), 0)) {
      ss << "unable to parse address " << addrstr;
      err = -EINVAL;
      goto reply;
    }
    else {
      if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
        // always blacklist type ANY
        addr.set_type(entity_addr_t::TYPE_ANY);
      } else {
        // Pre-nautilus clusters only understand legacy address types.
        addr.set_type(entity_addr_t::TYPE_LEGACY);
      }

      string blacklistop;
      cmd_getval(cmdmap, "blacklistop", blacklistop);
      if (blacklistop == "add") {
        utime_t expires = ceph_clock_now();
        double d;
        // default one hour
        cmd_getval(cmdmap, "expire", d,
          g_conf()->mon_osd_blacklist_default_expire);
        expires += d;

        pending_inc.new_blacklist[addr] = expires;

        {
          // cancel any pending un-blacklisting request too
          auto it = std::find(pending_inc.old_blacklist.begin(),
            pending_inc.old_blacklist.end(), addr);
          if (it != pending_inc.old_blacklist.end()) {
            pending_inc.old_blacklist.erase(it);
          }
        }

        ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      } else if (blacklistop == "rm") {
        // Removal must undo either a committed entry (old_blacklist) or a
        // not-yet-committed pending addition (new_blacklist).
        if (osdmap.is_blacklisted(addr) ||
            pending_inc.new_blacklist.count(addr)) {
          if (osdmap.is_blacklisted(addr))
            pending_inc.old_blacklist.push_back(addr);
          else
            pending_inc.new_blacklist.erase(addr);
          ss << "un-blacklisting " << addr;
          getline(ss, rs);
          wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                    get_last_committed() + 1));
          return true;
        }
        ss << addr << " isn't blacklisted";
        err = 0;
        goto reply;
      }
      // NOTE(review): a 'blacklistop' that is neither "add" nor "rm" falls
      // through here without setting err or replying -- presumably the
      // command schema restricts the value, or a generic handler after the
      // dispatch chain replies; confirm against the command table.
    }
12394 } else if (prefix == "osd pool mksnap") {
12395 string poolstr;
12396 cmd_getval(cmdmap, "pool", poolstr);
12397 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12398 if (pool < 0) {
12399 ss << "unrecognized pool '" << poolstr << "'";
12400 err = -ENOENT;
12401 goto reply;
12402 }
12403 string snapname;
12404 cmd_getval(cmdmap, "snap", snapname);
12405 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12406 if (p->is_unmanaged_snaps_mode()) {
12407 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12408 err = -EINVAL;
12409 goto reply;
12410 } else if (p->snap_exists(snapname.c_str())) {
12411 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12412 err = 0;
12413 goto reply;
12414 } else if (p->is_tier()) {
12415 ss << "pool " << poolstr << " is a cache tier";
12416 err = -EINVAL;
12417 goto reply;
12418 }
12419 pg_pool_t *pp = 0;
12420 if (pending_inc.new_pools.count(pool))
12421 pp = &pending_inc.new_pools[pool];
12422 if (!pp) {
12423 pp = &pending_inc.new_pools[pool];
12424 *pp = *p;
12425 }
12426 if (pp->snap_exists(snapname.c_str())) {
12427 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12428 } else {
12429 pp->add_snap(snapname.c_str(), ceph_clock_now());
12430 pp->set_snap_epoch(pending_inc.epoch);
12431 ss << "created pool " << poolstr << " snap " << snapname;
12432 }
12433 getline(ss, rs);
12434 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12435 get_last_committed() + 1));
12436 return true;
12437 } else if (prefix == "osd pool rmsnap") {
12438 string poolstr;
12439 cmd_getval(cmdmap, "pool", poolstr);
12440 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12441 if (pool < 0) {
12442 ss << "unrecognized pool '" << poolstr << "'";
12443 err = -ENOENT;
12444 goto reply;
12445 }
12446 string snapname;
12447 cmd_getval(cmdmap, "snap", snapname);
12448 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12449 if (p->is_unmanaged_snaps_mode()) {
12450 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12451 err = -EINVAL;
12452 goto reply;
12453 } else if (!p->snap_exists(snapname.c_str())) {
12454 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12455 err = 0;
12456 goto reply;
12457 }
12458 pg_pool_t *pp = 0;
12459 if (pending_inc.new_pools.count(pool))
12460 pp = &pending_inc.new_pools[pool];
12461 if (!pp) {
12462 pp = &pending_inc.new_pools[pool];
12463 *pp = *p;
12464 }
12465 snapid_t sn = pp->snap_exists(snapname.c_str());
12466 if (sn) {
12467 pp->remove_snap(sn);
12468 pp->set_snap_epoch(pending_inc.epoch);
12469 ss << "removed pool " << poolstr << " snap " << snapname;
12470 } else {
12471 ss << "already removed pool " << poolstr << " snap " << snapname;
12472 }
12473 getline(ss, rs);
12474 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12475 get_last_committed() + 1));
12476 return true;
12477 } else if (prefix == "osd pool create") {
12478 int64_t pg_num, pg_num_min;
12479 int64_t pgp_num;
12480 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12481 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12482 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12483
12484 string pool_type_str;
12485 cmd_getval(cmdmap, "pool_type", pool_type_str);
12486 if (pool_type_str.empty())
12487 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12488
12489 string poolstr;
12490 cmd_getval(cmdmap, "pool", poolstr);
12491 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12492 if (pool_id >= 0) {
12493 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12494 if (pool_type_str != p->get_type_name()) {
12495 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12496 err = -EINVAL;
12497 } else {
12498 ss << "pool '" << poolstr << "' already exists";
12499 err = 0;
12500 }
12501 goto reply;
12502 }
12503
12504 int pool_type;
12505 if (pool_type_str == "replicated") {
12506 pool_type = pg_pool_t::TYPE_REPLICATED;
12507 } else if (pool_type_str == "erasure") {
12508 pool_type = pg_pool_t::TYPE_ERASURE;
12509 } else {
12510 ss << "unknown pool type '" << pool_type_str << "'";
12511 err = -EINVAL;
12512 goto reply;
12513 }
12514
12515 bool implicit_rule_creation = false;
12516 int64_t expected_num_objects = 0;
12517 string rule_name;
12518 cmd_getval(cmdmap, "rule", rule_name);
12519 string erasure_code_profile;
12520 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12521
12522 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12523 if (erasure_code_profile == "")
12524 erasure_code_profile = "default";
12525 //handle the erasure code profile
12526 if (erasure_code_profile == "default") {
12527 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12528 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12529 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12530 goto wait;
12531 }
12532
12533 map<string,string> profile_map;
12534 err = osdmap.get_erasure_code_profile_default(cct,
12535 profile_map,
12536 &ss);
12537 if (err)
12538 goto reply;
12539 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12540 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12541 goto wait;
12542 }
12543 }
12544 if (rule_name == "") {
12545 implicit_rule_creation = true;
12546 if (erasure_code_profile == "default") {
12547 rule_name = "erasure-code";
12548 } else {
12549 dout(1) << "implicitly use rule named after the pool: "
12550 << poolstr << dendl;
12551 rule_name = poolstr;
12552 }
12553 }
12554 cmd_getval(cmdmap, "expected_num_objects",
12555 expected_num_objects, int64_t(0));
12556 } else {
12557 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12558 // and put expected_num_objects to rule field
12559 if (erasure_code_profile != "") { // cmd is from CLI
12560 if (rule_name != "") {
12561 string interr;
12562 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12563 if (interr.length()) {
12564 ss << "error parsing integer value '" << rule_name << "': " << interr;
12565 err = -EINVAL;
12566 goto reply;
12567 }
12568 }
12569 rule_name = erasure_code_profile;
12570 } else { // cmd is well-formed
12571 cmd_getval(cmdmap, "expected_num_objects",
12572 expected_num_objects, int64_t(0));
12573 }
12574 }
12575
12576 if (!implicit_rule_creation && rule_name != "") {
12577 int rule;
12578 err = get_crush_rule(rule_name, &rule, &ss);
12579 if (err == -EAGAIN) {
12580 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12581 return true;
12582 }
12583 if (err)
12584 goto reply;
12585 }
12586
12587 if (expected_num_objects < 0) {
12588 ss << "'expected_num_objects' must be non-negative";
12589 err = -EINVAL;
12590 goto reply;
12591 }
12592
12593 set<int32_t> osds;
12594 osdmap.get_all_osds(osds);
12595 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12596 string type;
12597 if (!get_osd_objectstore_type(osd, &type)) {
12598 return type == "filestore";
12599 } else {
12600 return false;
12601 }
12602 });
12603
12604 if (has_filestore_osd &&
12605 expected_num_objects > 0 &&
12606 cct->_conf->filestore_merge_threshold > 0) {
12607 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12608 err = -EINVAL;
12609 goto reply;
12610 }
12611
12612 if (has_filestore_osd &&
12613 expected_num_objects == 0 &&
12614 cct->_conf->filestore_merge_threshold < 0) {
12615 int osds = osdmap.get_num_osds();
12616 bool sure = false;
12617 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12618 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12619 ss << "For better initial performance on pools expected to store a "
12620 << "large number of objects, consider supplying the "
12621 << "expected_num_objects parameter when creating the pool."
12622 << " Pass --yes-i-really-mean-it to ignore it";
12623 err = -EPERM;
12624 goto reply;
12625 }
12626 }
12627
12628 int64_t fast_read_param;
12629 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12630 FastReadType fast_read = FAST_READ_DEFAULT;
12631 if (fast_read_param == 0)
12632 fast_read = FAST_READ_OFF;
12633 else if (fast_read_param > 0)
12634 fast_read = FAST_READ_ON;
12635
12636 int64_t repl_size = 0;
12637 cmd_getval(cmdmap, "size", repl_size);
12638 int64_t target_size_bytes = 0;
12639 double target_size_ratio = 0.0;
12640 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12641 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12642
12643 string pg_autoscale_mode;
12644 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12645
12646 err = prepare_new_pool(poolstr,
12647 -1, // default crush rule
12648 rule_name,
12649 pg_num, pgp_num, pg_num_min,
12650 repl_size, target_size_bytes, target_size_ratio,
12651 erasure_code_profile, pool_type,
12652 (uint64_t)expected_num_objects,
12653 fast_read,
12654 pg_autoscale_mode,
12655 &ss);
12656 if (err < 0) {
12657 switch(err) {
12658 case -EEXIST:
12659 ss << "pool '" << poolstr << "' already exists";
12660 break;
12661 case -EAGAIN:
12662 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12663 return true;
12664 case -ERANGE:
12665 goto reply;
12666 default:
12667 goto reply;
12668 break;
12669 }
12670 } else {
12671 ss << "pool '" << poolstr << "' created";
12672 }
12673 getline(ss, rs);
12674 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12675 get_last_committed() + 1));
12676 return true;
12677
12678 } else if (prefix == "osd pool delete" ||
12679 prefix == "osd pool rm") {
12680 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12681 string poolstr, poolstr2, sure;
12682 cmd_getval(cmdmap, "pool", poolstr);
12683 cmd_getval(cmdmap, "pool2", poolstr2);
12684 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12685 if (pool < 0) {
12686 ss << "pool '" << poolstr << "' does not exist";
12687 err = 0;
12688 goto reply;
12689 }
12690
12691 bool force_no_fake = false;
12692 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12693 bool force = false;
12694 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12695 if (poolstr2 != poolstr ||
12696 (!force && !force_no_fake)) {
12697 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12698 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12699 << "followed by --yes-i-really-really-mean-it.";
12700 err = -EPERM;
12701 goto reply;
12702 }
12703 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12704 if (err == -EAGAIN) {
12705 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12706 return true;
12707 }
12708 if (err < 0)
12709 goto reply;
12710 goto update;
12711 } else if (prefix == "osd pool rename") {
12712 string srcpoolstr, destpoolstr;
12713 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12714 cmd_getval(cmdmap, "destpool", destpoolstr);
12715 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12716 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12717
12718 if (pool_src < 0) {
12719 if (pool_dst >= 0) {
12720 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12721 // of operations, assume this rename succeeded, as it is not changing
12722 // the current state. Make sure we output something understandable
12723 // for whoever is issuing the command, if they are paying attention,
12724 // in case it was not intentional; or to avoid a "wtf?" and a bug
12725 // report in case it was intentional, while expecting a failure.
12726 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12727 << destpoolstr << "' does -- assuming successful rename";
12728 err = 0;
12729 } else {
12730 ss << "unrecognized pool '" << srcpoolstr << "'";
12731 err = -ENOENT;
12732 }
12733 goto reply;
12734 } else if (pool_dst >= 0) {
12735 // source pool exists and so does the destination pool
12736 ss << "pool '" << destpoolstr << "' already exists";
12737 err = -EEXIST;
12738 goto reply;
12739 }
12740
12741 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12742 if (ret == 0) {
12743 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12744 } else {
12745 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12746 << cpp_strerror(ret);
12747 }
12748 getline(ss, rs);
12749 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12750 get_last_committed() + 1));
12751 return true;
12752
12753 } else if (prefix == "osd pool set") {
12754 err = prepare_command_pool_set(cmdmap, ss);
12755 if (err == -EAGAIN)
12756 goto wait;
12757 if (err < 0)
12758 goto reply;
12759
12760 getline(ss, rs);
12761 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12762 get_last_committed() + 1));
12763 return true;
12764 } else if (prefix == "osd tier add") {
12765 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12766 if (err == -EAGAIN)
12767 goto wait;
12768 if (err)
12769 goto reply;
12770 string poolstr;
12771 cmd_getval(cmdmap, "pool", poolstr);
12772 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12773 if (pool_id < 0) {
12774 ss << "unrecognized pool '" << poolstr << "'";
12775 err = -ENOENT;
12776 goto reply;
12777 }
12778 string tierpoolstr;
12779 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12780 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12781 if (tierpool_id < 0) {
12782 ss << "unrecognized pool '" << tierpoolstr << "'";
12783 err = -ENOENT;
12784 goto reply;
12785 }
12786 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12787 ceph_assert(p);
12788 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12789 ceph_assert(tp);
12790
12791 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12792 goto reply;
12793 }
12794
12795 // make sure new tier is empty
12796 string force_nonempty;
12797 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12798 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12799 if (pstats && pstats->stats.sum.num_objects != 0 &&
12800 force_nonempty != "--force-nonempty") {
12801 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12802 err = -ENOTEMPTY;
12803 goto reply;
12804 }
12805 if (tp->is_erasure()) {
12806 ss << "tier pool '" << tierpoolstr
12807 << "' is an ec pool, which cannot be a tier";
12808 err = -ENOTSUP;
12809 goto reply;
12810 }
12811 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12812 ((force_nonempty != "--force-nonempty") ||
12813 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12814 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12815 err = -ENOTEMPTY;
12816 goto reply;
12817 }
12818 // go
12819 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12820 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12821 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12822 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12823 return true;
12824 }
12825 np->tiers.insert(tierpool_id);
12826 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12827 ntp->tier_of = pool_id;
12828 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12829 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12830 get_last_committed() + 1));
12831 return true;
12832 } else if (prefix == "osd tier remove" ||
12833 prefix == "osd tier rm") {
12834 string poolstr;
12835 cmd_getval(cmdmap, "pool", poolstr);
12836 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12837 if (pool_id < 0) {
12838 ss << "unrecognized pool '" << poolstr << "'";
12839 err = -ENOENT;
12840 goto reply;
12841 }
12842 string tierpoolstr;
12843 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12844 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12845 if (tierpool_id < 0) {
12846 ss << "unrecognized pool '" << tierpoolstr << "'";
12847 err = -ENOENT;
12848 goto reply;
12849 }
12850 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12851 ceph_assert(p);
12852 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12853 ceph_assert(tp);
12854
12855 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12856 goto reply;
12857 }
12858
12859 if (p->tiers.count(tierpool_id) == 0) {
12860 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12861 err = 0;
12862 goto reply;
12863 }
12864 if (tp->tier_of != pool_id) {
12865 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12866 << osdmap.get_pool_name(tp->tier_of) << "': "
12867 // be scary about it; this is an inconsistency and bells must go off
12868 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12869 err = -EINVAL;
12870 goto reply;
12871 }
12872 if (p->read_tier == tierpool_id) {
12873 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12874 err = -EBUSY;
12875 goto reply;
12876 }
12877 // go
12878 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12879 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12880 if (np->tiers.count(tierpool_id) == 0 ||
12881 ntp->tier_of != pool_id ||
12882 np->read_tier == tierpool_id) {
12883 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12884 return true;
12885 }
12886 np->tiers.erase(tierpool_id);
12887 ntp->clear_tier();
12888 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12889 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12890 get_last_committed() + 1));
12891 return true;
12892 } else if (prefix == "osd tier set-overlay") {
12893 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12894 if (err == -EAGAIN)
12895 goto wait;
12896 if (err)
12897 goto reply;
12898 string poolstr;
12899 cmd_getval(cmdmap, "pool", poolstr);
12900 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12901 if (pool_id < 0) {
12902 ss << "unrecognized pool '" << poolstr << "'";
12903 err = -ENOENT;
12904 goto reply;
12905 }
12906 string overlaypoolstr;
12907 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12908 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12909 if (overlaypool_id < 0) {
12910 ss << "unrecognized pool '" << overlaypoolstr << "'";
12911 err = -ENOENT;
12912 goto reply;
12913 }
12914 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12915 ceph_assert(p);
12916 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12917 ceph_assert(overlay_p);
12918 if (p->tiers.count(overlaypool_id) == 0) {
12919 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12920 err = -EINVAL;
12921 goto reply;
12922 }
12923 if (p->read_tier == overlaypool_id) {
12924 err = 0;
12925 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12926 goto reply;
12927 }
12928 if (p->has_read_tier()) {
12929 ss << "pool '" << poolstr << "' has overlay '"
12930 << osdmap.get_pool_name(p->read_tier)
12931 << "'; please remove-overlay first";
12932 err = -EINVAL;
12933 goto reply;
12934 }
12935
12936 // go
12937 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12938 np->read_tier = overlaypool_id;
12939 np->write_tier = overlaypool_id;
12940 np->set_last_force_op_resend(pending_inc.epoch);
12941 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12942 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12943 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12944 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12945 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12946 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12947 get_last_committed() + 1));
12948 return true;
12949 } else if (prefix == "osd tier remove-overlay" ||
12950 prefix == "osd tier rm-overlay") {
12951 string poolstr;
12952 cmd_getval(cmdmap, "pool", poolstr);
12953 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12954 if (pool_id < 0) {
12955 ss << "unrecognized pool '" << poolstr << "'";
12956 err = -ENOENT;
12957 goto reply;
12958 }
12959 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12960 ceph_assert(p);
12961 if (!p->has_read_tier()) {
12962 err = 0;
12963 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12964 goto reply;
12965 }
12966
12967 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12968 goto reply;
12969 }
12970
12971 // go
12972 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12973 if (np->has_read_tier()) {
12974 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12975 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12976 nop->set_last_force_op_resend(pending_inc.epoch);
12977 }
12978 if (np->has_write_tier()) {
12979 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12980 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12981 nop->set_last_force_op_resend(pending_inc.epoch);
12982 }
12983 np->clear_read_tier();
12984 np->clear_write_tier();
12985 np->set_last_force_op_resend(pending_inc.epoch);
12986 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12987 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12988 get_last_committed() + 1));
12989 return true;
12990 } else if (prefix == "osd tier cache-mode") {
12991 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12992 if (err == -EAGAIN)
12993 goto wait;
12994 if (err)
12995 goto reply;
12996 string poolstr;
12997 cmd_getval(cmdmap, "pool", poolstr);
12998 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12999 if (pool_id < 0) {
13000 ss << "unrecognized pool '" << poolstr << "'";
13001 err = -ENOENT;
13002 goto reply;
13003 }
13004 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13005 ceph_assert(p);
13006 if (!p->is_tier()) {
13007 ss << "pool '" << poolstr << "' is not a tier";
13008 err = -EINVAL;
13009 goto reply;
13010 }
13011 string modestr;
13012 cmd_getval(cmdmap, "mode", modestr);
13013 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13014 if (int(mode) < 0) {
13015 ss << "'" << modestr << "' is not a valid cache mode";
13016 err = -EINVAL;
13017 goto reply;
13018 }
13019
13020 bool sure = false;
13021 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13022
13023 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13024 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13025 ss << "'" << modestr << "' is no longer a supported cache mode";
13026 err = -EPERM;
13027 goto reply;
13028 }
13029 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13030 mode != pg_pool_t::CACHEMODE_NONE &&
13031 mode != pg_pool_t::CACHEMODE_PROXY &&
13032 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13033 !sure) {
13034 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13035 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13036 err = -EPERM;
13037 goto reply;
13038 }
13039
13040 // pool already has this cache-mode set and there are no pending changes
13041 if (p->cache_mode == mode &&
13042 (pending_inc.new_pools.count(pool_id) == 0 ||
13043 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13044 ss << "set cache-mode for pool '" << poolstr << "'"
13045 << " to " << pg_pool_t::get_cache_mode_name(mode);
13046 err = 0;
13047 goto reply;
13048 }
13049
13050 /* Mode description:
13051 *
13052 * none: No cache-mode defined
13053 * forward: Forward all reads and writes to base pool [removed]
13054 * writeback: Cache writes, promote reads from base pool
13055 * readonly: Forward writes to base pool
13056 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13057 * proxy: Proxy all reads and writes to base pool
13058 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13059 *
13060 * Hence, these are the allowed transitions:
13061 *
13062 * none -> any
13063 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13064 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13065 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13066 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13067 * writeback -> readproxy || proxy
13068 * readonly -> any
13069 */
13070
13071 // We check if the transition is valid against the current pool mode, as
13072 // it is the only committed state thus far. We will blantly squash
13073 // whatever mode is on the pending state.
13074
13075 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13076 (mode != pg_pool_t::CACHEMODE_PROXY &&
13077 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13078 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13079 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13080 << "' pool; only '"
13081 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13082 << "' allowed.";
13083 err = -EINVAL;
13084 goto reply;
13085 }
13086 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13087 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13088 mode != pg_pool_t::CACHEMODE_PROXY &&
13089 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13090
13091 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13092 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13093 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13094
13095 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13096 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13097 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13098
13099 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13100 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13101 mode != pg_pool_t::CACHEMODE_PROXY &&
13102 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13103
13104 const pool_stat_t* pstats =
13105 mon->mgrstatmon()->get_pool_stat(pool_id);
13106
13107 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13108 ss << "unable to set cache-mode '"
13109 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13110 << "': dirty objects found";
13111 err = -EBUSY;
13112 goto reply;
13113 }
13114 }
13115 // go
13116 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13117 np->cache_mode = mode;
13118 // set this both when moving to and from cache_mode NONE. this is to
13119 // capture legacy pools that were set up before this flag existed.
13120 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13121 ss << "set cache-mode for pool '" << poolstr
13122 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13123 if (mode == pg_pool_t::CACHEMODE_NONE) {
13124 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13125 ceph_assert(base_pool);
13126 if (base_pool->read_tier == pool_id ||
13127 base_pool->write_tier == pool_id)
13128 ss <<" (WARNING: pool is still configured as read or write tier)";
13129 }
13130 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13131 get_last_committed() + 1));
13132 return true;
13133 } else if (prefix == "osd tier add-cache") {
13134 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13135 if (err == -EAGAIN)
13136 goto wait;
13137 if (err)
13138 goto reply;
13139 string poolstr;
13140 cmd_getval(cmdmap, "pool", poolstr);
13141 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13142 if (pool_id < 0) {
13143 ss << "unrecognized pool '" << poolstr << "'";
13144 err = -ENOENT;
13145 goto reply;
13146 }
13147 string tierpoolstr;
13148 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13149 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13150 if (tierpool_id < 0) {
13151 ss << "unrecognized pool '" << tierpoolstr << "'";
13152 err = -ENOENT;
13153 goto reply;
13154 }
13155 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13156 ceph_assert(p);
13157 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13158 ceph_assert(tp);
13159
13160 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13161 goto reply;
13162 }
13163
13164 int64_t size = 0;
13165 if (!cmd_getval(cmdmap, "size", size)) {
13166 ss << "unable to parse 'size' value '"
13167 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13168 err = -EINVAL;
13169 goto reply;
13170 }
13171 // make sure new tier is empty
13172 const pool_stat_t *pstats =
13173 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13174 if (pstats && pstats->stats.sum.num_objects != 0) {
13175 ss << "tier pool '" << tierpoolstr << "' is not empty";
13176 err = -ENOTEMPTY;
13177 goto reply;
13178 }
13179 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13180 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13181 if (int(mode) < 0) {
13182 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13183 err = -EINVAL;
13184 goto reply;
13185 }
13186 HitSet::Params hsp;
13187 auto& cache_hit_set_type =
13188 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13189 if (cache_hit_set_type == "bloom") {
13190 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13191 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13192 hsp = HitSet::Params(bsp);
13193 } else if (cache_hit_set_type == "explicit_hash") {
13194 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13195 } else if (cache_hit_set_type == "explicit_object") {
13196 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13197 } else {
13198 ss << "osd tier cache default hit set type '"
13199 << cache_hit_set_type << "' is not a known type";
13200 err = -EINVAL;
13201 goto reply;
13202 }
13203 // go
13204 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13205 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13206 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13207 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13208 return true;
13209 }
13210 np->tiers.insert(tierpool_id);
13211 np->read_tier = np->write_tier = tierpool_id;
13212 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13213 np->set_last_force_op_resend(pending_inc.epoch);
13214 ntp->set_last_force_op_resend(pending_inc.epoch);
13215 ntp->tier_of = pool_id;
13216 ntp->cache_mode = mode;
13217 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13218 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13219 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13220 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13221 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13222 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13223 ntp->hit_set_params = hsp;
13224 ntp->target_max_bytes = size;
13225 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13226 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13227 get_last_committed() + 1));
13228 return true;
13229 } else if (prefix == "osd pool set-quota") {
13230 string poolstr;
13231 cmd_getval(cmdmap, "pool", poolstr);
13232 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13233 if (pool_id < 0) {
13234 ss << "unrecognized pool '" << poolstr << "'";
13235 err = -ENOENT;
13236 goto reply;
13237 }
13238
13239 string field;
13240 cmd_getval(cmdmap, "field", field);
13241 if (field != "max_objects" && field != "max_bytes") {
13242 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13243 err = -EINVAL;
13244 goto reply;
13245 }
13246
13247 // val could contain unit designations, so we treat as a string
13248 string val;
13249 cmd_getval(cmdmap, "val", val);
13250 string tss;
13251 int64_t value;
13252 if (field == "max_objects") {
13253 value = strict_sistrtoll(val.c_str(), &tss);
13254 } else if (field == "max_bytes") {
13255 value = strict_iecstrtoll(val.c_str(), &tss);
13256 } else {
13257 ceph_abort_msg("unrecognized option");
13258 }
13259 if (!tss.empty()) {
13260 ss << "error parsing value '" << val << "': " << tss;
13261 err = -EINVAL;
13262 goto reply;
13263 }
13264
13265 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13266 if (field == "max_objects") {
13267 pi->quota_max_objects = value;
13268 } else if (field == "max_bytes") {
13269 pi->quota_max_bytes = value;
13270 } else {
13271 ceph_abort_msg("unrecognized option");
13272 }
13273 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13274 rs = ss.str();
13275 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13276 get_last_committed() + 1));
13277 return true;
13278 } else if (prefix == "osd pool application enable" ||
13279 prefix == "osd pool application disable" ||
13280 prefix == "osd pool application set" ||
13281 prefix == "osd pool application rm") {
13282 err = prepare_command_pool_application(prefix, cmdmap, ss);
13283 if (err == -EAGAIN) {
13284 goto wait;
13285 } else if (err < 0) {
13286 goto reply;
13287 } else {
13288 goto update;
13289 }
13290 } else if (prefix == "osd force-create-pg") {
13291 pg_t pgid;
13292 string pgidstr;
13293 cmd_getval(cmdmap, "pgid", pgidstr);
13294 if (!pgid.parse(pgidstr.c_str())) {
13295 ss << "invalid pgid '" << pgidstr << "'";
13296 err = -EINVAL;
13297 goto reply;
13298 }
13299 if (!osdmap.pg_exists(pgid)) {
13300 ss << "pg " << pgid << " should not exist";
13301 err = -ENOENT;
13302 goto reply;
13303 }
13304 bool sure = false;
13305 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13306 if (!sure) {
13307 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13308 << "that the cluster will give up ever trying to recover the lost data. Do this "
13309 << "only if you are certain that all copies of the PG are in fact lost and you are "
13310 << "willing to accept that the data is permanently destroyed. Pass "
13311 << "--yes-i-really-mean-it to proceed.";
13312 err = -EPERM;
13313 goto reply;
13314 }
13315 bool creating_now;
13316 {
13317 std::lock_guard<std::mutex> l(creating_pgs_lock);
13318 auto emplaced = creating_pgs.pgs.emplace(
13319 pgid,
13320 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13321 ceph_clock_now()));
13322 creating_now = emplaced.second;
13323 }
13324 if (creating_now) {
13325 ss << "pg " << pgidstr << " now creating, ok";
13326 // set the pool's CREATING flag so that (1) the osd won't ignore our
13327 // create message and (2) we won't propose any future pg_num changes
13328 // until after the PG has been instantiated.
13329 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13330 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13331 }
13332 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13333 err = 0;
13334 goto update;
13335 } else {
13336 ss << "pg " << pgid << " already creating";
13337 err = 0;
13338 goto reply;
13339 }
13340 } else {
13341 err = -EINVAL;
13342 }
13343
13344 reply:
13345 getline(ss, rs);
13346 if (err < 0 && rs.length() == 0)
13347 rs = cpp_strerror(err);
13348 mon->reply_command(op, err, rs, rdata, get_last_committed());
13349 return ret;
13350
13351 update:
13352 getline(ss, rs);
13353 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13354 get_last_committed() + 1));
13355 return true;
13356
13357 wait:
13358 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13359 return true;
13360 }
13361
13362 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13363 {
13364 op->mark_osdmon_event(__func__);
13365
13366 auto m = op->get_req<MPoolOp>();
13367 MonSession *session = op->get_session();
13368 if (!session) {
13369 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13370 return true;
13371 }
13372
13373 switch (m->op) {
13374 case POOL_OP_CREATE_UNMANAGED_SNAP:
13375 case POOL_OP_DELETE_UNMANAGED_SNAP:
13376 {
13377 const std::string* pool_name = nullptr;
13378 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13379 if (pg_pool != nullptr) {
13380 pool_name = &osdmap.get_pool_name(m->pool);
13381 }
13382
13383 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13384 session->entity_name, session->caps,
13385 session->get_peer_socket_addr(),
13386 pool_name)) {
13387 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13388 << "privileges. message: " << *m << std::endl
13389 << "caps: " << session->caps << dendl;
13390 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13391 return true;
13392 }
13393 }
13394 break;
13395 default:
13396 if (!session->is_capable("osd", MON_CAP_W)) {
13397 dout(0) << "got pool op from entity with insufficient privileges. "
13398 << "message: " << *m << std::endl
13399 << "caps: " << session->caps << dendl;
13400 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13401 return true;
13402 }
13403 break;
13404 }
13405
13406 return false;
13407 }
13408
// Read-only fast path for pool ops: answer requests that need no map
// change (cap failures, wrong fsid, already-satisfied/idempotent ops).
// Returns true when the op was fully handled (a reply was sent); false
// when it must proceed to prepare_pool_op() for a map update.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // reject senders without the required caps (reply already sent inside)
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop (with -EINVAL) messages that belong to a different cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  // pool creation has its own idempotency check (by name, not id)
  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    // deleting an already-absent pool is a success (idempotent);
    // anything else on a missing pool is -ENOENT
    if (m->op == POOL_OP_DELETE) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  // For each op: answer immediately if it is invalid for this pool's
  // snap mode or already satisfied; otherwise return false so
  // prepare_pool_op() can propose the change.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are incompatible with unmanaged-snaps mode and tiers
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already there: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // the two snap modes are mutually exclusive
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // already removed/purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // if the named pool still exists it needs a real delete; the
    // "already gone" case was handled above when lookup failed by id.
    // NOTE(review): this replies success when *some* pool with this
    // name exists -- presumably matching historical semantics; confirm.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // handled (rejected) in prepare_pool_op
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13496
13497 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13498 {
13499 if (!osdmap.have_pg_pool(pool)) {
13500 dout(10) << __func__ << " pool " << pool << " snap " << snap
13501 << " - pool dne" << dendl;
13502 return true;
13503 }
13504 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13505 dout(10) << __func__ << " pool " << pool << " snap " << snap
13506 << " - in osdmap removed_snaps_queue" << dendl;
13507 return true;
13508 }
13509 snapid_t begin, end;
13510 int r = lookup_purged_snap(pool, snap, &begin, &end);
13511 if (r == 0) {
13512 dout(10) << __func__ << " pool " << pool << " snap " << snap
13513 << " - purged, [" << begin << "," << end << ")" << dendl;
13514 return true;
13515 }
13516 return false;
13517 }
13518
13519 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13520 {
13521 if (pending_inc.old_pools.count(pool)) {
13522 dout(10) << __func__ << " pool " << pool << " snap " << snap
13523 << " - pool pending deletion" << dendl;
13524 return true;
13525 }
13526 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13527 dout(10) << __func__ << " pool " << pool << " snap " << snap
13528 << " - in pending new_removed_snaps" << dendl;
13529 return true;
13530 }
13531 return false;
13532 }
13533
13534 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13535 {
13536 op->mark_osdmon_event(__func__);
13537 auto m = op->get_req<MPoolOp>();
13538 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13539 if (pool >= 0) {
13540 _pool_op_reply(op, 0, osdmap.get_epoch());
13541 return true;
13542 }
13543
13544 return false;
13545 }
13546
// Apply a pool op that changes the osdmap: stage the change in
// pending_inc and defer the client reply until the proposal commits.
// Returns true when a proposal is pending (reply deferred), false when
// the op was answered immediately without a map change.
//
// Validation happens twice on purpose: first against the *committed*
// pool state, then against the *projected* state (committed state plus
// any changes already staged in pending_inc for this pool).
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  // pool create/delete have dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // Pass 1: validate against the committed pool state. Note the
  // intentional fall-throughs between related cases below.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps may not be created on a cache tier
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is an
      // idempotent success; otherwise fall out and do the work
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool-snap op on an unmanaged-snaps pool: modes are exclusive
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // Pass 2: re-validate the snap-mode exclusivity against the
  // projected state, in case a pending change already flipped the mode.
  // pool snaps vs unmanaged snaps are mutually exclusive
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Perform the actual mutation on the projected pool copy; `changed`
  // tracks whether anything needs to be staged in pending_inc.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus releases need the legacy removed_snaps bookkeeping
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is returned to the client in the reply payload
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // skip if the snap is already removed (committed) or pending removal
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's snap_seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always rejected
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the modified pool in the pending map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with ret and any payload) once the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13701
13702 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13703 {
13704 op->mark_osdmon_event(__func__);
13705 int err = prepare_new_pool(op);
13706 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13707 return true;
13708 }
13709
13710 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13711 ostream *ss)
13712 {
13713 const string& poolstr = osdmap.get_pool_name(pool_id);
13714
13715 // If the Pool is in use by CephFS, refuse to delete it
13716 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13717 if (pending_fsmap.pool_in_use(pool_id)) {
13718 *ss << "pool '" << poolstr << "' is in use by CephFS";
13719 return -EBUSY;
13720 }
13721
13722 if (pool.tier_of >= 0) {
13723 *ss << "pool '" << poolstr << "' is a tier of '"
13724 << osdmap.get_pool_name(pool.tier_of) << "'";
13725 return -EBUSY;
13726 }
13727 if (!pool.tiers.empty()) {
13728 *ss << "pool '" << poolstr << "' has tiers";
13729 for(auto tier : pool.tiers) {
13730 *ss << " " << osdmap.get_pool_name(tier);
13731 }
13732 return -EBUSY;
13733 }
13734
13735 if (!g_conf()->mon_allow_pool_delete) {
13736 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13737 return -EPERM;
13738 }
13739
13740 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13741 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13742 return -EPERM;
13743 }
13744
13745 *ss << "pool '" << poolstr << "' removed";
13746 return 0;
13747 }
13748
13749 /**
13750 * Check if it is safe to add a tier to a base pool
13751 *
13752 * @return
13753 * True if the operation should proceed, false if we should abort here
13754 * (abort doesn't necessarily mean error, could be idempotency)
13755 */
13756 bool OSDMonitor::_check_become_tier(
13757 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13758 const int64_t base_pool_id, const pg_pool_t *base_pool,
13759 int *err,
13760 ostream *ss) const
13761 {
13762 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13763 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13764
13765 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13766 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13767 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13768 *err = -EBUSY;
13769 return false;
13770 }
13771
13772 if (base_pool->tiers.count(tier_pool_id)) {
13773 ceph_assert(tier_pool->tier_of == base_pool_id);
13774 *err = 0;
13775 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13776 << base_pool_name << "'";
13777 return false;
13778 }
13779
13780 if (base_pool->is_tier()) {
13781 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13782 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13783 << "multiple tiers are not yet supported.";
13784 *err = -EINVAL;
13785 return false;
13786 }
13787
13788 if (tier_pool->has_tiers()) {
13789 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13790 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13791 it != tier_pool->tiers.end(); ++it)
13792 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13793 *ss << " multiple tiers are not yet supported.";
13794 *err = -EINVAL;
13795 return false;
13796 }
13797
13798 if (tier_pool->is_tier()) {
13799 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13800 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13801 *err = -EINVAL;
13802 return false;
13803 }
13804
13805 *err = 0;
13806 return true;
13807 }
13808
13809
13810 /**
13811 * Check if it is safe to remove a tier from this base pool
13812 *
13813 * @return
13814 * True if the operation should proceed, false if we should abort here
13815 * (abort doesn't necessarily mean error, could be idempotency)
13816 */
13817 bool OSDMonitor::_check_remove_tier(
13818 const int64_t base_pool_id, const pg_pool_t *base_pool,
13819 const pg_pool_t *tier_pool,
13820 int *err, ostream *ss) const
13821 {
13822 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13823
13824 // Apply CephFS-specific checks
13825 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13826 if (pending_fsmap.pool_in_use(base_pool_id)) {
13827 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13828 // If the underlying pool is erasure coded and does not allow EC
13829 // overwrites, we can't permit the removal of the replicated tier that
13830 // CephFS relies on to access it
13831 *ss << "pool '" << base_pool_name <<
13832 "' does not allow EC overwrites and is in use by CephFS"
13833 " via its tier";
13834 *err = -EBUSY;
13835 return false;
13836 }
13837
13838 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13839 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13840 "tier is still in use as a writeback cache. Change the cache "
13841 "mode and flush the cache before removing it";
13842 *err = -EBUSY;
13843 return false;
13844 }
13845 }
13846
13847 *err = 0;
13848 return true;
13849 }
13850
13851 int OSDMonitor::_prepare_remove_pool(
13852 int64_t pool, ostream *ss, bool no_fake)
13853 {
13854 dout(10) << __func__ << " " << pool << dendl;
13855 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13856 int r = _check_remove_pool(pool, *p, ss);
13857 if (r < 0)
13858 return r;
13859
13860 auto new_pool = pending_inc.new_pools.find(pool);
13861 if (new_pool != pending_inc.new_pools.end()) {
13862 // if there is a problem with the pending info, wait and retry
13863 // this op.
13864 const auto& p = new_pool->second;
13865 int r = _check_remove_pool(pool, p, ss);
13866 if (r < 0)
13867 return -EAGAIN;
13868 }
13869
13870 if (pending_inc.old_pools.count(pool)) {
13871 dout(10) << __func__ << " " << pool << " already pending removal"
13872 << dendl;
13873 return 0;
13874 }
13875
13876 if (g_conf()->mon_fake_pool_delete && !no_fake) {
13877 string old_name = osdmap.get_pool_name(pool);
13878 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13879 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13880 << old_name << " -> " << new_name << dendl;
13881 pending_inc.new_pool_names[pool] = new_name;
13882 return 0;
13883 }
13884
13885 // remove
13886 pending_inc.old_pools.insert(pool);
13887
13888 // remove any pg_temp mappings for this pool
13889 for (auto p = osdmap.pg_temp->begin();
13890 p != osdmap.pg_temp->end();
13891 ++p) {
13892 if (p->first.pool() == pool) {
13893 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
13894 << p->first << dendl;
13895 pending_inc.new_pg_temp[p->first].clear();
13896 }
13897 }
13898 // remove any primary_temp mappings for this pool
13899 for (auto p = osdmap.primary_temp->begin();
13900 p != osdmap.primary_temp->end();
13901 ++p) {
13902 if (p->first.pool() == pool) {
13903 dout(10) << __func__ << " " << pool
13904 << " removing obsolete primary_temp" << p->first << dendl;
13905 pending_inc.new_primary_temp[p->first] = -1;
13906 }
13907 }
13908 // remove any pg_upmap mappings for this pool
13909 for (auto& p : osdmap.pg_upmap) {
13910 if (p.first.pool() == pool) {
13911 dout(10) << __func__ << " " << pool
13912 << " removing obsolete pg_upmap "
13913 << p.first << dendl;
13914 pending_inc.old_pg_upmap.insert(p.first);
13915 }
13916 }
13917 // remove any pending pg_upmap mappings for this pool
13918 {
13919 auto it = pending_inc.new_pg_upmap.begin();
13920 while (it != pending_inc.new_pg_upmap.end()) {
13921 if (it->first.pool() == pool) {
13922 dout(10) << __func__ << " " << pool
13923 << " removing pending pg_upmap "
13924 << it->first << dendl;
13925 it = pending_inc.new_pg_upmap.erase(it);
13926 } else {
13927 it++;
13928 }
13929 }
13930 }
13931 // remove any pg_upmap_items mappings for this pool
13932 for (auto& p : osdmap.pg_upmap_items) {
13933 if (p.first.pool() == pool) {
13934 dout(10) << __func__ << " " << pool
13935 << " removing obsolete pg_upmap_items " << p.first
13936 << dendl;
13937 pending_inc.old_pg_upmap_items.insert(p.first);
13938 }
13939 }
13940 // remove any pending pg_upmap mappings for this pool
13941 {
13942 auto it = pending_inc.new_pg_upmap_items.begin();
13943 while (it != pending_inc.new_pg_upmap_items.end()) {
13944 if (it->first.pool() == pool) {
13945 dout(10) << __func__ << " " << pool
13946 << " removing pending pg_upmap_items "
13947 << it->first << dendl;
13948 it = pending_inc.new_pg_upmap_items.erase(it);
13949 } else {
13950 it++;
13951 }
13952 }
13953 }
13954
13955 // remove any choose_args for this pool
13956 CrushWrapper newcrush;
13957 _get_pending_crush(newcrush);
13958 if (newcrush.have_choose_args(pool)) {
13959 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13960 newcrush.rm_choose_args(pool);
13961 pending_inc.crush.clear();
13962 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13963 }
13964 return 0;
13965 }
13966
13967 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13968 {
13969 dout(10) << "_prepare_rename_pool " << pool << dendl;
13970 if (pending_inc.old_pools.count(pool)) {
13971 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13972 return -ENOENT;
13973 }
13974 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13975 p != pending_inc.new_pool_names.end();
13976 ++p) {
13977 if (p->second == newname && p->first != pool) {
13978 return -EEXIST;
13979 }
13980 }
13981
13982 pending_inc.new_pool_names[pool] = newname;
13983 return 0;
13984 }
13985
13986 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13987 {
13988 op->mark_osdmon_event(__func__);
13989 auto m = op->get_req<MPoolOp>();
13990 ostringstream ss;
13991 int ret = _prepare_remove_pool(m->pool, &ss, false);
13992 if (ret == -EAGAIN) {
13993 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13994 return true;
13995 }
13996 if (ret < 0)
13997 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13998 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13999 pending_inc.epoch));
14000 return true;
14001 }
14002
14003 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14004 int ret, epoch_t epoch, bufferlist *blp)
14005 {
14006 op->mark_osdmon_event(__func__);
14007 auto m = op->get_req<MPoolOp>();
14008 dout(20) << "_pool_op_reply " << ret << dendl;
14009 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14010 ret, epoch, get_last_committed(), blp);
14011 mon->send_reply(op, reply);
14012 }
14013
14014 void OSDMonitor::convert_pool_priorities(void)
14015 {
14016 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14017 int64_t max_prio = 0;
14018 int64_t min_prio = 0;
14019 for (const auto &i : osdmap.get_pools()) {
14020 const auto &pool = i.second;
14021
14022 if (pool.opts.is_set(key)) {
14023 int64_t prio = 0;
14024 pool.opts.get(key, &prio);
14025 if (prio > max_prio)
14026 max_prio = prio;
14027 if (prio < min_prio)
14028 min_prio = prio;
14029 }
14030 }
14031 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14032 dout(20) << __func__ << " nothing to fix" << dendl;
14033 return;
14034 }
14035 // Current pool priorities exceeds new maximum
14036 for (const auto &i : osdmap.get_pools()) {
14037 const auto pool_id = i.first;
14038 pg_pool_t pool = i.second;
14039
14040 int64_t prio = 0;
14041 pool.opts.get(key, &prio);
14042 int64_t n;
14043
14044 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14045 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14046 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14047 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14048 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14049 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14050 } else {
14051 continue;
14052 }
14053 if (n == 0) {
14054 pool.opts.unset(key);
14055 } else {
14056 pool.opts.set(key, static_cast<int64_t>(n));
14057 }
14058 dout(10) << __func__ << " pool " << pool_id
14059 << " recovery_priority adjusted "
14060 << prio << " to " << n << dendl;
14061 pool.last_change = pending_inc.epoch;
14062 pending_inc.new_pools[pool_id] = pool;
14063 }
14064 }