]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
892580c3ff6e75e3d357d98a30ec5c5aaa4b9cff
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
#define dout_subsys ceph_subsys_mon
// Key/value store prefixes owned by this service (see get_store_prefixes()).
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating"); // in-progress pg creations
static const string OSD_METADATA_PREFIX("osd_metadata");       // per-osd metadata blobs
static const string OSD_SNAP_PREFIX("osd_snap");               // snap removal/purge records
98
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
118   - note that the {removed,purged}_snap put the last snap in the key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
// Adapter exposing an OSDMonitor-owned osdmap cache to the PriorityCache
// manager (pcm).  Derived classes (IncCache/FullCache) report the bytes
// actually used by a concrete underlying cache.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;  // back-pointer to the owning monitor (non-owning)
  // bytes assigned per priority level by the balancer
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // size granted by the last commit_cache_size()
  double cache_ratio = 0;       // our share of the total cache budget

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes currently consumed by the underlying cache.
  virtual uint64_t _get_used_bytes() const = 0;

  // Ask for more bytes at priority 'pri': the shortfall between what we
  // actually use and what has already been assigned at that priority.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    // other priorities are not used by the osdmon caches
    return -EOPNOTSUPP;
  }

  // Bytes assigned at a single priority level.
  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of bytes assigned across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the assigned total to a chunk of 'total_cache' and remember it.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Human-readable name used in pcm log/debug output.
  virtual string get_cache_name() const = 0;
};
196
// Priority-cache shim over the monitor's incremental-osdmap LRU cache.
struct IncCache : public OSDMemCache {
  IncCache(OSDMonitor *m) : OSDMemCache(m) {};

  virtual uint64_t _get_used_bytes() const {
    return osdmon->inc_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Inc Cache";
  }

  // Number of cached incremental maps (not part of the PriCache interface).
  uint64_t _get_num_osdmaps() const {
    return osdmon->inc_osd_cache.get_size();
  }
};
212
// Priority-cache shim over the monitor's full-osdmap LRU cache.
struct FullCache : public OSDMemCache {
  FullCache(OSDMonitor *m) : OSDMemCache(m) {};

  virtual uint64_t _get_used_bytes() const {
    return osdmon->full_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Full Cache";
  }

  // Number of cached full maps (not part of the PriCache interface).
  uint64_t _get_num_osdmaps() const {
    return osdmon->full_osd_cache.get_size();
  }
};
228
// Shared handles registered with the PriorityCache manager; created in
// the OSDMonitor constructor.
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits for per-pool application metadata ('osd pool application ...').
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
// Check whether 'entity_name' may issue unmanaged-snapshot pool ops.
// Permitted when either (a) its mon caps allow the
// "osd pool op unmanaged-snap" command (pool-restricted when 'pool_name'
// is non-null), or (b) its OSD caps from the auth db grant write access
// to the pool (or to all pools).
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // First try the mon-cap command check.
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // Otherwise fall back to the entity's OSD caps stored in the auth db.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand to a list of basic grants; check each one
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
318
319 } // anonymous namespace
320
// Record that PG 'ps' of this pool reported 'last_epoch_clean', and keep
// the derived values consistent:
//  - floor: minimum last-epoch-clean across all PGs reported so far
//  - next_missing: first PG id that has never reported clean
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    epoch_by_pg.resize(ps + 1, 0);  // 0 == "never reported"
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this pg may have been the one holding the floor down, so
      // recompute the minimum over all pgs
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the run of pgs that have now reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Drop all last-epoch-clean state for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
357 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
358 {
359 auto& lec = report_by_pool[pg.pool()];
360 return lec.report(pg.ps(), last_epoch_clean);
361 }
362
363 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
364 {
365 auto floor = latest.get_epoch();
366 for (auto& pool : latest.get_pools()) {
367 auto reported = report_by_pool.find(pool.first);
368 if (reported == report_by_pool.end()) {
369 return 0;
370 }
371 if (reported->second.next_missing < pool.second.get_pg_num()) {
372 return 0;
373 }
374 if (reported->second.floor < floor) {
375 floor = reported->second.floor;
376 }
377 }
378 return floor;
379 }
380
381 void LastEpochClean::dump(Formatter *f) const
382 {
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393 }
394
// Completion context for an OSDMap mapping job: when the parallel
// mapping finishes successfully, refresh the creating-pgs state and
// wake any subscribers waiting on pg creations.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;  // owning monitor (non-owning pointer)
  utime_t start;       // when the mapping job was kicked off (for timing)
  epoch_t epoch;       // epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted/canceled; do nothing in that case
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
412
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Debug-log prefix: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD paxos service: size the inc/full osdmap LRU caches,
// create the priority-cache shims, and subscribe to config changes.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // non-fatal: fall back to fixed-size caches without pcm autotuning
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445 const char **OSDMonitor::get_tracked_conf_keys() const
446 {
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454 }
455
456 void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
457 const std::set<std::string> &changed)
458 {
459 dout(10) << __func__ << " " << changed << dendl;
460
461 if (changed.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
463 }
464 if (changed.count("mon_memory_target") ||
465 changed.count("rocksdb_cache_size")) {
466 int r = _update_mon_cache_settings();
467 if (r < 0) {
468 derr << __func__ << " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
472 << ". Unable to update cache size."
473 << dendl;
474 }
475 }
476 }
477
// Bring priority-cache autotuning in line with the current value of
// mon_memory_autotune: drop the pcm when disabled, (re)register the
// caches when enabled.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      // keep running without autotuning on registration failure
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
499
// Apply runtime changes to mon_memory_target / rocksdb_cache_size:
// recompute the cache ratios and, when autotuning is active, re-tune the
// priority cache manager.  Returns 0 on success, -EINVAL on bad sizes or
// when neither pcm nor the rocksdb binned cache is in use.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember the old values so we can roll back if the ratios fail
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation when computing the max
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    // roll back to the previous sizes
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
559
// Read the initial cache-related config values and size the inc/full
// osdmap LRU caches.  Returns -EINVAL when the configured targets are
// unusable (the constructor then falls back to fixed-size caches).
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the OSD-side base/fragmentation estimates for the mon caches
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
584
585 bool OSDMonitor::_have_pending_crush()
586 {
587 return pending_inc.crush.length() > 0;
588 }
589
// CRUSH map of the committed osdmap (ignores any pending changes).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
595 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
596 {
597 bufferlist bl;
598 if (pending_inc.crush.length())
599 bl = pending_inc.crush;
600 else
601 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
602
603 auto p = bl.cbegin();
604 newcrush.decode(p);
605 }
606
// Build the very first OSDMap (epoch 1) for a fresh cluster — either
// from an mkfs-provided seed map or a simple generated one — and stash
// its full encoding in the pending incremental.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  // an osdmap may have been primed into the store during mkfs
  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
666 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
667 {
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
670 s.insert(OSD_METADATA_PREFIX);
671 s.insert(OSD_SNAP_PREFIX);
672 }
673
// Bring the in-memory osdmap up to date with the state committed via
// paxos: possibly fast-forward to the newest stored full map, then
// replay each remaining incremental, persisting a full map per epoch,
// and finally refresh everything derived from the map (subs, logger,
// msgr features, down->out tracking, mapping job).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a still-running mapping job is computing against a stale epoch;
  // cancel it before advancing
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for a stored full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // jump straight to the newest stored full map if it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // reload the persisted creating-pgs state (get() returns 0 when found)
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // NOTE: everything between a dout() and its dendl lives inside
        // the block opened by the dout macro, so each JSONFormatter
        // below is in its own scope.
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is no longer needed once epoch 1 is applied
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a single transaction doesn't grow unbounded
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out pending map with the new osdmap state
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
// Register the rocksdb kv cache and the inc/full osdmap caches with a
// new PriorityCache manager so their sizes are balanced automatically.
// Returns 0 on success, -EINVAL on bad config or when rocksdb does not
// expose a priority cache.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
961
// Split the memory target between the kv (rocksdb) cache and the two
// osdmap caches: kv gets rocksdb_cache_size/mon_memory_target, and the
// inc/full caches share the remainder equally.  Returns -EINVAL when the
// kv share would consume the whole target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;  // roll back the invalid ratio
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
986
987 void OSDMonitor::start_mapping()
988 {
989 // initiate mapping job
990 if (mapping_job) {
991 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
992 << dendl;
993 mapping_job->abort();
994 }
995 if (!osdmap.get_pools().empty()) {
996 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
997 mapping_job = mapping.start_update(osdmap, mapper,
998 g_conf()->mon_osd_mapping_pgs_per_chunk);
999 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1000 << " at " << fin->start << dendl;
1001 mapping_job->set_finish_event(fin);
1002 } else {
1003 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1004 mapping_job = nullptr;
1005 }
1006 }
1007
1008 void OSDMonitor::update_msgr_features()
1009 {
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025 }
1026
1027 void OSDMonitor::on_active()
1028 {
1029 update_logger();
1030
1031 if (mon->is_leader()) {
1032 mon->clog->debug() << "osdmap " << osdmap;
1033 if (!priority_convert) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert = true;
1037 }
1038 } else {
1039 list<MonOpRequestRef> ls;
1040 take_all_failures(ls);
1041 while (!ls.empty()) {
1042 MonOpRequestRef op = ls.front();
1043 op->mark_osdmon_event(__func__);
1044 dispatch(op);
1045 ls.pop_front();
1046 }
1047 }
1048 start_mapping();
1049 }
1050
void OSDMonitor::on_restart()
{
  // Forget per-osd report times; they repopulate as osds report in again.
  last_osd_report.clear();
}
1055
1056 void OSDMonitor::on_shutdown()
1057 {
1058 dout(10) << __func__ << dendl;
1059 if (mapping_job) {
1060 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1061 << dendl;
1062 mapping_job->abort();
1063 }
1064
1065 // discard failure info, waiters
1066 list<MonOpRequestRef> ls;
1067 take_all_failures(ls);
1068 ls.clear();
1069 }
1070
1071 void OSDMonitor::update_logger()
1072 {
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079 }
1080
void OSDMonitor::create_pending()
{
  // Begin a fresh pending incremental for the next osdmap epoch and reset
  // all per-proposal scratch state.
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // If any fullness ratio is unset/invalid in the current map, seed it from
    // config.  Values > 1.0 are taken to be percentages and scaled to [0,1].
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        // load the pool's current state before modifying it
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // replace the pending crush blob with the renumbered map
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
1150
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  // Compute the creating-pg state to persist with the pending epoch:
  //  - scan for pools created/deleted by this increment
  //  - drop pgs already reported created, or that no longer exist in nextmap
  //  - admit queued pgs up to mon_osd_max_creating_pgs
  //  - (octopus+ quorum) advance each creating pg's history/past_intervals
  // Works on a copy of creating_pgs and returns it; the shared state itself
  // is not modified here.
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // snapshot the shared state under its lock
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools added by this increment
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  // admit pgs from the per-pool queue until we hit mon_osd_max_creating_pgs
  // in-flight creations (floored at 1) or the queue drains
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // take as many placement seeds as fit under the cap
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      // mapping of this pg under the next map
      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        // existing entry: detect whether osdmap -> nextmap opens a new
        // peering interval, and if so record it in the pg's history
        std::stringstream debug;
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
void OSDMonitor::maybe_prime_pg_temp()
{
  // Pre-populate pg_temp entries in pending_inc so acting sets stay stable
  // across the pending map change.  Either prime every pg ("all") or only the
  // pgs mapped to osds affected by this increment, whichever is estimated to
  // be cheaper; both modes are bounded by a wall-clock budget.
  bool all = false;
  if (pending_inc.crush.length()) {
    // crush change can move anything; consider every pg
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // an UP state-bit change for an osd that is currently up, i.e. it is
    // being marked down in this increment
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the cost of the targeted path (pgs on first osd * osd count);
    // if that would touch a large fraction of all pgs, just do everything
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // build the map as it will look with the pending increment applied
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg via the parallel mapper, within the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // prime only the pgs mapped to the affected osds, checking the time
    // budget every `chunk` pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          // already primed via another affected osd
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1440
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // Record the pg's *current* acting set as a pg_temp entry in pending_inc
  // when the pending map would change its mapping, so clients keep a viable
  // acting set while the new mapping settles.  Bails out whenever priming
  // could not make things better than doing nothing.
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    // still being created; nothing to preserve
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping, taken from the background mapper's results
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the next (pending) map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // prime an empty acting set instead — per the log message this clears
    // the pg_temp mapping rather than pinning one
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (map::emplace never overwrites an existing new_pg_temp entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489 /**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494 {
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
1506 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
1520 if (g_conf()->mon_osd_prime_pg_temp) {
1521 maybe_prime_pg_temp();
1522 }
1523 }
1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
  // ensure we don't have blank new_state updates. these are interpreted as
1531 // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
1541 ++p;
1542 }
1543 }
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
1548 if (i.first >= osdmap.max_osd) {
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
1552 break;
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
1557 break;
1558 }
1559 }
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1616 }
1617 }
1618
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
1628 &nearfull_pool_ids);
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
1669 }
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1704 }
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1713 // make sure FLAG_FULL is truly set, so we are safe not
      // to set an extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1748 }
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1756 // make sure FLAG_FULL is truly set, so we are safe not
      // to set an extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1793 }
1794 }
1795
1796 // min_compat_client?
1797 if (!tmp.require_min_compat_client) {
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
1800 << "required " << mv << dendl;
1801 mon->clog->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv;
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
1816 }
1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1818 }
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
1826 }
1827 }
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
1911 }
1912
1913 // tell me about it
1914 for (auto i = pending_inc.new_state.begin();
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
1918 if (s & CEPH_OSD_UP)
1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
1920 if (s & CEPH_OSD_EXISTS)
1921 dout(2) << " osd." << i->first << " DNE" << dendl;
1922 }
1923 for (auto i = pending_inc.new_up_client.begin();
1924 i != pending_inc.new_up_client.end();
1925 ++i) {
1926 //FIXME: insert cluster addresses too
1927 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1928 }
1929 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1930 i != pending_inc.new_weight.end();
1931 ++i) {
1932 if (i->second == CEPH_OSD_OUT) {
1933 dout(2) << " osd." << i->first << " OUT" << dendl;
1934 } else if (i->second == CEPH_OSD_IN) {
1935 dout(2) << " osd." << i->first << " IN" << dendl;
1936 } else {
1937 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1938 }
1939 }
1940
1941 // features for osdmap and its incremental
1942 uint64_t features;
1943
1944 // encode full map and determine its crc
1945 OSDMap tmp;
1946 {
1947 tmp.deepish_copy_from(osdmap);
1948 tmp.apply_incremental(pending_inc);
1949
1950 // determine appropriate features
1951 features = tmp.get_encoding_features();
1952 dout(10) << __func__ << " encoding full map with "
1953 << tmp.require_osd_release
1954 << " features " << features << dendl;
1955
1956 // the features should be a subset of the mon quorum's features!
1957 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1958
1959 bufferlist fullbl;
1960 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1961 pending_inc.full_crc = tmp.get_crc();
1962
1963 // include full map in the txn. note that old monitors will
1964 // overwrite this. new ones will now skip the local full map
1965 // encode and reload from this.
1966 put_version_full(t, pending_inc.epoch, fullbl);
1967 }
1968
1969 // encode
1970 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1971 bufferlist bl;
1972 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1973
1974 dout(20) << " full_crc " << tmp.get_crc()
1975 << " inc_crc " << pending_inc.inc_crc << dendl;
1976
1977 /* put everything in the transaction */
1978 put_version(t, pending_inc.epoch, bl);
1979 put_last_committed(t, pending_inc.epoch);
1980
1981 // metadata, too!
1982 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1983 p != pending_metadata.end();
1984 ++p)
1985 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1986 for (set<int>::iterator p = pending_metadata_rm.begin();
1987 p != pending_metadata_rm.end();
1988 ++p)
1989 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1990 pending_metadata.clear();
1991 pending_metadata_rm.clear();
1992
1993 // purged_snaps
1994 if (tmp.require_osd_release >= ceph_release_t::octopus &&
1995 !pending_inc.new_purged_snaps.empty()) {
1996 // all snaps purged this epoch (across all pools)
1997 string k = make_purged_snap_epoch_key(pending_inc.epoch);
1998 bufferlist v;
1999 encode(pending_inc.new_purged_snaps, v);
2000 t->put(OSD_SNAP_PREFIX, k, v);
2001 }
2002 for (auto& i : pending_inc.new_purged_snaps) {
2003 for (auto q = i.second.begin();
2004 q != i.second.end();
2005 ++q) {
2006 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2007 pending_inc.epoch,
2008 t);
2009 }
2010 }
2011 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2012 for (auto snap : snaps) {
2013 insert_purged_snap_update(pool, snap, snap + 1,
2014 pending_inc.epoch,
2015 t);
2016 }
2017 }
2018
2019 // health
2020 health_check_map_t next;
2021 tmp.check_health(cct, &next);
2022 encode_health(next, t);
2023 }
2024
2025 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2026 {
2027 bufferlist bl;
2028 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2029 if (r < 0)
2030 return r;
2031 try {
2032 auto p = bl.cbegin();
2033 decode(m, p);
2034 }
2035 catch (buffer::error& e) {
2036 if (err)
2037 *err << "osd." << osd << " metadata is corrupt";
2038 return -EIO;
2039 }
2040 return 0;
2041 }
2042
2043 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2044 {
2045 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2046 if (osdmap.is_up(osd)) {
2047 map<string,string> meta;
2048 load_metadata(osd, meta, nullptr);
2049 auto p = meta.find(field);
2050 if (p == meta.end()) {
2051 (*out)["unknown"]++;
2052 } else {
2053 (*out)[p->second]++;
2054 }
2055 }
2056 }
2057 }
2058
2059 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2060 {
2061 map<string,int> by_val;
2062 count_metadata(field, &by_val);
2063 f->open_object_section(field.c_str());
2064 for (auto& p : by_val) {
2065 f->dump_int(p.first.c_str(), p.second);
2066 }
2067 f->close_section();
2068 }
2069
2070 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2071 {
2072 map<string, string> metadata;
2073 int r = load_metadata(osd, metadata, nullptr);
2074 if (r < 0)
2075 return r;
2076
2077 auto it = metadata.find("osd_objectstore");
2078 if (it == metadata.end())
2079 return -ENOENT;
2080 *type = it->second;
2081 return 0;
2082 }
2083
2084 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2085 const pg_pool_t &pool,
2086 ostream *err)
2087 {
2088 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2089 // since filestore osds could always join the pool later
2090 set<int> checked_osds;
2091 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2092 vector<int> up, acting;
2093 pg_t pgid(ps, pool_id);
2094 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2095 for (int osd : up) {
2096 if (checked_osds.find(osd) != checked_osds.end())
2097 continue;
2098 string objectstore_type;
2099 int r = get_osd_objectstore_type(osd, &objectstore_type);
2100 // allow with missing metadata, e.g. due to an osd never booting yet
2101 if (r < 0 || objectstore_type == "bluestore") {
2102 checked_osds.insert(osd);
2103 continue;
2104 }
2105 *err << "osd." << osd << " uses " << objectstore_type;
2106 return false;
2107 }
2108 }
2109 return true;
2110 }
2111
2112 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2113 {
2114 map<string,string> m;
2115 if (int r = load_metadata(osd, m, err))
2116 return r;
2117 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2118 f->dump_string(p->first.c_str(), p->second);
2119 return 0;
2120 }
2121
2122 void OSDMonitor::print_nodes(Formatter *f)
2123 {
2124 // group OSDs by their hosts
2125 map<string, list<int> > osds; // hostname => osd
2126 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2127 map<string, string> m;
2128 if (load_metadata(osd, m, NULL)) {
2129 continue;
2130 }
2131 map<string, string>::iterator hostname = m.find("hostname");
2132 if (hostname == m.end()) {
2133 // not likely though
2134 continue;
2135 }
2136 osds[hostname->second].push_back(osd);
2137 }
2138
2139 dump_services(f, osds, "osd");
2140 }
2141
2142 void OSDMonitor::share_map_with_random_osd()
2143 {
2144 if (osdmap.get_num_up_osds() == 0) {
2145 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2146 return;
2147 }
2148
2149 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
2150 if (!s) {
2151 dout(10) << __func__ << " no up osd on our session map" << dendl;
2152 return;
2153 }
2154
2155 dout(10) << "committed, telling random " << s->name
2156 << " all about it" << dendl;
2157
2158 // get feature of the peer
2159 // use quorum_con_features, if it's an anonymous connection.
2160 uint64_t features = s->con_features ? s->con_features :
2161 mon->get_quorum_con_features();
2162 // whatev, they'll request more if they need it
2163 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2164 s->con->send_message(m);
2165 // NOTE: do *not* record osd has up to this epoch (as we do
2166 // elsewhere) as they may still need to request older values.
2167 }
2168
version_t OSDMonitor::get_trim_to() const
{
  // Return the highest osdmap version we may trim up to, or 0 to indicate
  // that no trimming should happen right now.
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // do not trim while PGs are still being created; the creating osds may
    // still need the older maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    // debugging hook: operator explicitly disabled osdmap trimming
    dout(0) << __func__
	    << " blocking osdmap trim"
	       " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << dendl;
    return 0;
  }

  {
    // never trim past the min last-epoch-clean across osds...
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // ...unless an explicit trim point was forced via config
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs maps: pull the floor
    // down if it would leave fewer than that in the store
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;  // fewer than 'min' maps exist at all; nothing to trim
    }
    // only report a trim point if it is past what is already trimmed
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2211
2212 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2213 {
2214 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2215 // also scan osd epochs
2216 // don't trim past the oldest reported osd epoch
2217 for (auto& osd_epoch : osd_epochs) {
2218 if (osd_epoch.second < floor &&
2219 osdmap.is_out(osd_epoch.first)) {
2220 floor = osd_epoch.second;
2221 }
2222 }
2223 return floor;
2224 }
2225
2226 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2227 version_t first)
2228 {
2229 dout(10) << __func__ << " including full map for e " << first << dendl;
2230 bufferlist bl;
2231 get_version_full(first, bl);
2232 put_version_full(tx, first, bl);
2233
2234 if (has_osdmap_manifest &&
2235 first > osdmap_manifest.get_first_pinned()) {
2236 _prune_update_trimmed(tx, first);
2237 }
2238 }
2239
2240
2241 /* full osdmap prune
2242 *
2243 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2244 */
2245
2246 void OSDMonitor::load_osdmap_manifest()
2247 {
2248 bool store_has_manifest =
2249 mon->store->exists(get_service_name(), "osdmap_manifest");
2250
2251 if (!store_has_manifest) {
2252 if (!has_osdmap_manifest) {
2253 return;
2254 }
2255
2256 dout(20) << __func__
2257 << " dropping osdmap manifest from memory." << dendl;
2258 osdmap_manifest = osdmap_manifest_t();
2259 has_osdmap_manifest = false;
2260 return;
2261 }
2262
2263 dout(20) << __func__
2264 << " osdmap manifest detected in store; reload." << dendl;
2265
2266 bufferlist manifest_bl;
2267 int r = get_value("osdmap_manifest", manifest_bl);
2268 if (r < 0) {
2269 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2270 ceph_abort_msg("error reading manifest");
2271 }
2272 osdmap_manifest.decode(manifest_bl);
2273 has_osdmap_manifest = true;
2274
2275 dout(10) << __func__ << " store osdmap manifest pinned ("
2276 << osdmap_manifest.get_first_pinned()
2277 << " .. "
2278 << osdmap_manifest.get_last_pinned()
2279 << ")"
2280 << dendl;
2281 }
2282
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass should run now, based on how
  // many committed epochs exist versus the configured minimums.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  // number of most-recent epochs that must always be kept
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  // minimum number of prunable epochs before pruning is worthwhile
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  // distance between consecutive pinned full maps
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous pass already pinned everything up to the pinnable limit
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits before the pinnable limit
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2342
2343 void OSDMonitor::_prune_update_trimmed(
2344 MonitorDBStore::TransactionRef tx,
2345 version_t first)
2346 {
2347 dout(10) << __func__
2348 << " first " << first
2349 << " last_pinned " << osdmap_manifest.get_last_pinned()
2350 << " last_pinned " << osdmap_manifest.get_last_pinned()
2351 << dendl;
2352
2353 osdmap_manifest_t manifest = osdmap_manifest;
2354
2355 if (!manifest.is_pinned(first)) {
2356 manifest.pin(first);
2357 }
2358
2359 set<version_t>::iterator p_end = manifest.pinned.find(first);
2360 set<version_t>::iterator p = manifest.pinned.begin();
2361 manifest.pinned.erase(p, p_end);
2362 ceph_assert(manifest.get_first_pinned() == first);
2363
2364 if (manifest.get_last_pinned() == first+1 ||
2365 manifest.pinned.size() == 1) {
2366 // we reached the end of the line, as pinned maps go; clean up our
2367 // manifest, and let `should_prune()` decide whether we should prune
2368 // again.
2369 tx->erase(get_service_name(), "osdmap_manifest");
2370 return;
2371 }
2372
2373 bufferlist bl;
2374 manifest.encode(bl);
2375 tx->put(get_service_name(), "osdmap_manifest", bl);
2376 }
2377
2378 void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
2379 {
2380 dout(1) << __func__ << dendl;
2381
2382 version_t pin_first;
2383
2384 // verify constrainsts on stable in-memory state
2385 if (!has_osdmap_manifest) {
2386 // we must have never pruned, OR if we pruned the state must no longer
2387 // be relevant (i.e., the state must have been removed alongside with
2388 // the trim that *must* have removed past the last pinned map in a
2389 // previous prune).
2390 ceph_assert(osdmap_manifest.pinned.empty());
2391 ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
2392 pin_first = get_first_committed();
2393
2394 } else {
2395 // we must have pruned in the past AND its state is still relevant
2396 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2397 // and thus we still hold a manifest in the store).
2398 ceph_assert(!osdmap_manifest.pinned.empty());
2399 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
2400 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
2401
2402 dout(10) << __func__
2403 << " first_pinned " << osdmap_manifest.get_first_pinned()
2404 << " last_pinned " << osdmap_manifest.get_last_pinned()
2405 << dendl;
2406
2407 pin_first = osdmap_manifest.get_last_pinned();
2408 }
2409
2410 manifest.pin(pin_first);
2411 }
2412
2413 bool OSDMonitor::_prune_sanitize_options() const
2414 {
2415 uint64_t prune_interval =
2416 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2417 uint64_t prune_min =
2418 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2419 uint64_t txsize =
2420 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2421
2422 bool r = true;
2423
2424 if (prune_interval == 0) {
2425 derr << __func__
2426 << " prune is enabled BUT prune interval is zero; abort."
2427 << dendl;
2428 r = false;
2429 } else if (prune_interval == 1) {
2430 derr << __func__
2431 << " prune interval is equal to one, which essentially means"
2432 " no pruning; abort."
2433 << dendl;
2434 r = false;
2435 }
2436 if (prune_min == 0) {
2437 derr << __func__
2438 << " prune is enabled BUT prune min is zero; abort."
2439 << dendl;
2440 r = false;
2441 }
2442 if (prune_interval > prune_min) {
2443 derr << __func__
2444 << " impossible to ascertain proper prune interval because"
2445 << " it is greater than the minimum prune epochs"
2446 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2447 << dendl;
2448 r = false;
2449 }
2450
2451 if (txsize < prune_interval - 1) {
2452 derr << __func__
2453 << "'mon_osdmap_full_prune_txsize' (" << txsize
2454 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2455 << "); abort." << dendl;
2456 r = false;
2457 }
2458 return r;
2459 }
2460
2461 bool OSDMonitor::is_prune_enabled() const {
2462 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2463 }
2464
2465 bool OSDMonitor::is_prune_supported() const {
2466 return mon->get_required_mon_features().contains_any(
2467 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2468 }
2469
2470 /** do_prune
2471 *
2472 * @returns true if has side-effects; false otherwise.
2473 */
2474 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2475 {
2476 bool enabled = is_prune_enabled();
2477
2478 dout(1) << __func__ << " osdmap full prune "
2479 << ( enabled ? "enabled" : "disabled")
2480 << dendl;
2481
2482 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2483 return false;
2484 }
2485
2486 // we are beyond the minimum prune versions, we need to remove maps because
2487 // otherwise the store will grow unbounded and we may end up having issues
2488 // with available disk space or store hangs.
2489
2490 // we will not pin all versions. We will leave a buffer number of versions.
2491 // this allows us the monitor to trim maps without caring too much about
2492 // pinned maps, and then allow us to use another ceph-mon without these
2493 // capabilities, without having to repair the store.
2494
2495 osdmap_manifest_t manifest = osdmap_manifest;
2496
2497 version_t first = get_first_committed();
2498 version_t last = get_last_committed();
2499
2500 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2501 version_t last_pinned = manifest.get_last_pinned();
2502 uint64_t prune_interval =
2503 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2504 uint64_t txsize =
2505 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2506
2507 prune_init(manifest);
2508
2509 // we need to get rid of some osdmaps
2510
2511 dout(5) << __func__
2512 << " lc (" << first << " .. " << last << ")"
2513 << " last_pinned " << last_pinned
2514 << " interval " << prune_interval
2515 << " last_to_pin " << last_to_pin
2516 << dendl;
2517
2518 // We will be erasing maps as we go.
2519 //
2520 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2521 //
2522 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2523 // we stop pruning. We could prune the maps between `next_to_pin` and
2524 // `last_to_pin`, but by not doing it we end up with neater pruned
2525 // intervals, aligned with `prune_interval`. Besides, this should not be a
2526 // problem as long as `prune_interval` is set to a sane value, instead of
2527 // hundreds or thousands of maps.
2528
2529 auto map_exists = [this](version_t v) {
2530 string k = mon->store->combine_strings("full", v);
2531 return mon->store->exists(get_service_name(), k);
2532 };
2533
2534 // 'interval' represents the number of maps from the last pinned
2535 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2536 // version 11 next; all intermediate versions will be removed.
2537 //
2538 // 'txsize' represents the maximum number of versions we'll be removing in
2539 // this iteration. If 'txsize' is large enough to perform multiple passes
2540 // pinning and removing maps, we will do so; if not, we'll do at least one
2541 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2542 // ensure that we never go *over* the maximum.
2543
2544 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2545 uint64_t removal_interval = prune_interval - 1;
2546
2547 if (txsize < removal_interval) {
2548 dout(5) << __func__
2549 << " setting txsize to removal interval size ("
2550 << removal_interval << " versions"
2551 << dendl;
2552 txsize = removal_interval;
2553 }
2554 ceph_assert(removal_interval > 0);
2555
2556 uint64_t num_pruned = 0;
2557 while (num_pruned + removal_interval <= txsize) {
2558 last_pinned = manifest.get_last_pinned();
2559
2560 if (last_pinned + prune_interval > last_to_pin) {
2561 break;
2562 }
2563 ceph_assert(last_pinned < last_to_pin);
2564
2565 version_t next_pinned = last_pinned + prune_interval;
2566 ceph_assert(next_pinned <= last_to_pin);
2567 manifest.pin(next_pinned);
2568
2569 dout(20) << __func__
2570 << " last_pinned " << last_pinned
2571 << " next_pinned " << next_pinned
2572 << " num_pruned " << num_pruned
2573 << " removal interval (" << (last_pinned+1)
2574 << ".." << (next_pinned-1) << ")"
2575 << " txsize " << txsize << dendl;
2576
2577 ceph_assert(map_exists(last_pinned));
2578 ceph_assert(map_exists(next_pinned));
2579
2580 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2581 ceph_assert(!manifest.is_pinned(v));
2582
2583 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2584 string full_key = mon->store->combine_strings("full", v);
2585 tx->erase(get_service_name(), full_key);
2586 ++num_pruned;
2587 }
2588 }
2589
2590 ceph_assert(num_pruned > 0);
2591
2592 bufferlist bl;
2593 manifest.encode(bl);
2594 tx->put(get_service_name(), "osdmap_manifest", bl);
2595
2596 return true;
2597 }
2598
2599
2600 // -------------
2601
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // First-stage dispatch: answer reads directly and filter out update
  // requests that need no map change.  Returns true if the op was fully
  // handled here; false means it requires a map update and will be routed
  // to prepare_update().
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // the dispatcher should never hand us any other message type
    ceph_abort();
    return true;
  }
}
2657
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Second-stage dispatch for ops that preprocess_query() decided require a
  // map update.  Returns true if the op should trigger a proposal.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // preprocess_query() should never let any other type through
    ceph_abort();
  }

  return false;
}
2709
2710 bool OSDMonitor::should_propose(double& delay)
2711 {
2712 dout(10) << "should_propose" << dendl;
2713
2714 // if full map, propose immediately! any subsequent changes will be clobbered.
2715 if (pending_inc.fullmap.length())
2716 return true;
2717
2718 // adjust osd weights?
2719 if (!osd_weight.empty() &&
2720 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2721 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2722 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2723 delay = 0.0;
2724 osd_weight.clear();
2725 return true;
2726 }
2727
2728 return PaxosService::should_propose(delay);
2729 }
2730
2731
2732
2733 // ---------------------------
2734 // READs
2735
2736 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
2737 {
2738 op->mark_osdmon_event(__func__);
2739 auto m = op->get_req<MMonGetOSDMap>();
2740
2741 uint64_t features = mon->get_quorum_con_features();
2742 if (op->get_session() && op->get_session()->con_features)
2743 features = op->get_session()->con_features;
2744
2745 dout(10) << __func__ << " " << *m << dendl;
2746 MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
2747 epoch_t first = get_first_committed();
2748 epoch_t last = osdmap.get_epoch();
2749 int max = g_conf()->osd_map_message_max;
2750 ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
2751 for (epoch_t e = std::max(first, m->get_full_first());
2752 e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
2753 ++e, --max) {
2754 bufferlist& bl = reply->maps[e];
2755 int r = get_version_full(e, features, bl);
2756 ceph_assert(r >= 0);
2757 max_bytes -= bl.length();
2758 }
2759 for (epoch_t e = std::max(first, m->get_inc_first());
2760 e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
2761 ++e, --max) {
2762 bufferlist& bl = reply->incremental_maps[e];
2763 int r = get_version(e, features, bl);
2764 ceph_assert(r >= 0);
2765 max_bytes -= bl.length();
2766 }
2767 reply->oldest_map = first;
2768 reply->newest_map = last;
2769 mon->send_reply(op, reply);
2770 return true;
2771 }
2772
2773
2774 // ---------------------------
2775 // UPDATEs
2776
2777 // failure --
2778
2779 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2780 // check permissions
2781 MonSession *session = op->get_session();
2782 if (!session)
2783 return true;
2784 if (!session->is_capable("osd", MON_CAP_X)) {
2785 dout(0) << "got MOSDFailure from entity with insufficient caps "
2786 << session->caps << dendl;
2787 return true;
2788 }
2789 if (fsid != mon->monmap->fsid) {
2790 dout(0) << "check_source: on fsid " << fsid
2791 << " != " << mon->monmap->fsid << dendl;
2792 return true;
2793 }
2794 return false;
2795 }
2796
2797
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Vet an MOSDFailure report.  Returns true when the message was fully
  // handled here (bad source, stale, duplicate, or blocked); false lets it
  // continue to prepare_failure() to actually record the failure.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, re-addressed, or itself down: refresh its map
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    // target already marked down; just help the reporter catch up
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to an older instance of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down(badboy) half of this condition appears
  // unreachable — it was already handled above; verify before simplifying.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // nodown flag or down-ratio limit forbids marking this osd down
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // a genuinely new failure report; let prepare_failure() handle it
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2869
// Completion for MOSDMarkMeDown handling: once the request has been
// processed, send the requesting osd an acknowledgement (an MOSDMarkMeDown
// echo with request_ack cleared), or re-dispatch the op on -EAGAIN.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: echo the request back to the osd as the ACK
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false)); // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // retryable (e.g. no longer leader); feed the op back into dispatch
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2898
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  // Vet an osd's request to be marked down.  Returns true when handled
  // here (rejected, possibly with an immediate ack); false lets the op
  // continue to prepare_mark_me_down().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    // requester is unknown, already down, or a stale instance; just send
    // it newer maps
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even when refusing, send the ack if one was requested so the osd does
  // not block waiting for it
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2937
2938 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2939 {
2940 op->mark_osdmon_event(__func__);
2941 auto m = op->get_req<MOSDMarkMeDown>();
2942 int target_osd = m->target_osd;
2943
2944 ceph_assert(osdmap.is_up(target_osd));
2945 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2946
2947 mon->clog->info() << "osd." << target_osd << " marked itself down";
2948 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2949 if (m->request_ack)
2950 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2951 return true;
2952 }
2953
2954 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
2955 {
2956 op->mark_osdmon_event(__func__);
2957 auto m = op->get_req<MOSDMarkMeDead>();
2958 int from = m->target_osd;
2959
2960 // check permissions
2961 if (check_source(op, m->fsid)) {
2962 mon->no_reply(op);
2963 return true;
2964 }
2965
2966 // first, verify the reporting host is valid
2967 if (!m->get_orig_source().is_osd()) {
2968 mon->no_reply(op);
2969 return true;
2970 }
2971
2972 if (!osdmap.exists(from) ||
2973 !osdmap.is_down(from)) {
2974 dout(5) << __func__ << " from nonexistent or up osd." << from
2975 << ", ignoring" << dendl;
2976 send_incremental(op, m->get_epoch()+1);
2977 mon->no_reply(op);
2978 return true;
2979 }
2980
2981 return false;
2982 }
2983
2984 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
2985 {
2986 op->mark_osdmon_event(__func__);
2987 auto m = op->get_req<MOSDMarkMeDead>();
2988 int target_osd = m->target_osd;
2989
2990 ceph_assert(osdmap.is_down(target_osd));
2991
2992 mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
2993 << m->get_epoch();
2994 if (!pending_inc.new_xinfo.count(target_osd)) {
2995 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
2996 }
2997 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
2998 wait_for_finished_proposal(
2999 op,
3000 new LambdaContext(
3001 [op, this] (int r) {
3002 if (r >= 0) {
3003 mon->no_reply(op); // ignore on success
3004 }
3005 }
3006 ));
3007 return true;
3008 }
3009
3010 bool OSDMonitor::can_mark_down(int i)
3011 {
3012 if (osdmap.is_nodown(i)) {
3013 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3014 << "will not mark it down" << dendl;
3015 return false;
3016 }
3017
3018 int num_osds = osdmap.get_num_osds();
3019 if (num_osds == 0) {
3020 dout(5) << __func__ << " no osds" << dendl;
3021 return false;
3022 }
3023 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3024 float up_ratio = (float)up / (float)num_osds;
3025 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3026 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3027 << g_conf()->mon_osd_min_up_ratio
3028 << ", will not mark osd." << i << " down" << dendl;
3029 return false;
3030 }
3031 return true;
3032 }
3033
3034 bool OSDMonitor::can_mark_up(int i)
3035 {
3036 if (osdmap.is_noup(i)) {
3037 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3038 << "will not mark it up" << dendl;
3039 return false;
3040 }
3041
3042 return true;
3043 }
3044
3045 /**
3046 * @note the parameter @p i apparently only exists here so we can output the
3047 * osd's id on messages.
3048 */
3049 bool OSDMonitor::can_mark_out(int i)
3050 {
3051 if (osdmap.is_noout(i)) {
3052 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3053 << "will not mark it out" << dendl;
3054 return false;
3055 }
3056
3057 int num_osds = osdmap.get_num_osds();
3058 if (num_osds == 0) {
3059 dout(5) << __func__ << " no osds" << dendl;
3060 return false;
3061 }
3062 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3063 float in_ratio = (float)in / (float)num_osds;
3064 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3065 if (i >= 0)
3066 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3067 << g_conf()->mon_osd_min_in_ratio
3068 << ", will not mark osd." << i << " out" << dendl;
3069 else
3070 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3071 << g_conf()->mon_osd_min_in_ratio
3072 << ", will not mark osds out" << dendl;
3073 return false;
3074 }
3075
3076 return true;
3077 }
3078
3079 bool OSDMonitor::can_mark_in(int i)
3080 {
3081 if (osdmap.is_noin(i)) {
3082 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3083 << "will not mark it in" << dendl;
3084 return false;
3085 }
3086
3087 return true;
3088 }
3089
3090 bool OSDMonitor::check_failures(utime_t now)
3091 {
3092 bool found_failure = false;
3093 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3094 p != failure_info.end();
3095 ++p) {
3096 if (can_mark_down(p->first)) {
3097 found_failure |= check_failure(now, p->first, p->second);
3098 }
3099 }
3100 return found_failure;
3101 }
3102
3103 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3104 {
3105 // already pending failure?
3106 if (pending_inc.new_state.count(target_osd) &&
3107 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3108 dout(10) << " already pending failure" << dendl;
3109 return true;
3110 }
3111
3112 set<string> reporters_by_subtree;
3113 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3114 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3115 utime_t max_failed_since = fi.get_failed_since();
3116 utime_t failed_for = now - max_failed_since;
3117
3118 utime_t grace = orig_grace;
3119 double my_grace = 0, peer_grace = 0;
3120 double decay_k = 0;
3121 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3122 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3123 decay_k = ::log(.5) / halflife;
3124
3125 // scale grace period based on historical probability of 'lagginess'
3126 // (false positive failures due to slowness).
3127 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3128 double decay = exp((double)failed_for * decay_k);
3129 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3130 << " failed_for " << failed_for << " decay " << decay << dendl;
3131 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3132 grace += my_grace;
3133 }
3134
3135 // consider the peers reporting a failure a proxy for a potential
3136 // 'subcluster' over the overall cluster that is similarly
3137 // laggy. this is clearly not true in all cases, but will sometimes
3138 // help us localize the grace correction to a subset of the system
3139 // (say, a rack with a bad switch) that is unhappy.
3140 ceph_assert(fi.reporters.size());
3141 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3142 // get the parent bucket whose type matches with "reporter_subtree_level".
3143 // fall back to OSD if the level doesn't exist.
3144 if (osdmap.exists(p->first)) {
3145 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3146 if (auto iter = reporter_loc.find(reporter_subtree_level);
3147 iter == reporter_loc.end()) {
3148 reporters_by_subtree.insert("osd." + to_string(p->first));
3149 } else {
3150 reporters_by_subtree.insert(iter->second);
3151 }
3152 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3153 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3154 utime_t elapsed = now - xi.down_stamp;
3155 double decay = exp((double)elapsed * decay_k);
3156 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3157 }
3158 ++p;
3159 } else {
3160 fi.cancel_report(p->first);;
3161 p = fi.reporters.erase(p);
3162 }
3163 }
3164
3165 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3166 peer_grace /= (double)fi.reporters.size();
3167 grace += peer_grace;
3168 }
3169
3170 dout(10) << " osd." << target_osd << " has "
3171 << fi.reporters.size() << " reporters, "
3172 << grace << " grace (" << orig_grace << " + " << my_grace
3173 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3174 << dendl;
3175
3176 if (failed_for >= grace &&
3177 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3178 dout(1) << " we have enough reporters to mark osd." << target_osd
3179 << " down" << dendl;
3180 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3181
3182 mon->clog->info() << "osd." << target_osd << " failed ("
3183 << osdmap.crush->get_full_location_ordered_string(
3184 target_osd)
3185 << ") ("
3186 << (int)reporters_by_subtree.size()
3187 << " reporters from different "
3188 << reporter_subtree_level << " after "
3189 << failed_for << " >= grace " << grace << ")";
3190 return true;
3191 }
3192 return false;
3193 }
3194
3195 void OSDMonitor::force_failure(int target_osd, int by)
3196 {
3197 // already pending failure?
3198 if (pending_inc.new_state.count(target_osd) &&
3199 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3200 dout(10) << " already pending failure" << dendl;
3201 return;
3202 }
3203
3204 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3205 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3206 if (!pending_inc.new_xinfo.count(target_osd)) {
3207 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3208 }
3209 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3210
3211 mon->clog->info() << "osd." << target_osd << " failed ("
3212 << osdmap.crush->get_full_location_ordered_string(target_osd)
3213 << ") (connection refused reported by osd." << by << ")";
3214 return;
3215 }
3216
// Handle an MOSDFailure for a currently-up target osd: either record a
// (possibly immediate) failure report, or cancel a previously filed one.
// Returns true when a map change is pending and should be proposed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // presumably validated before dispatch here; the asserts enforce it
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  // the reporter gets no direct reply; it learns the outcome via the map
  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: skip the grace machinery entirely
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a previous report from this reporter (if any) is superseded; drop
    // the op that was attached to it
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    // decide whether we now have enough evidence to mark the osd down
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      // drop the whole entry when the last reporter withdraws
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3283
3284 void OSDMonitor::process_failures()
3285 {
3286 map<int,failure_info_t>::iterator p = failure_info.begin();
3287 while (p != failure_info.end()) {
3288 if (osdmap.is_up(p->first)) {
3289 ++p;
3290 } else {
3291 dout(10) << "process_failures osd." << p->first << dendl;
3292 list<MonOpRequestRef> ls;
3293 p->second.take_report_messages(ls);
3294 failure_info.erase(p++);
3295
3296 while (!ls.empty()) {
3297 MonOpRequestRef o = ls.front();
3298 if (o) {
3299 o->mark_event(__func__);
3300 MOSDFailure *m = o->get_req<MOSDFailure>();
3301 send_latest(o, m->get_epoch());
3302 mon->no_reply(o);
3303 }
3304 ls.pop_front();
3305 }
3306 }
3307 }
3308 }
3309
3310 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3311 {
3312 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3313
3314 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3315 p != failure_info.end();
3316 ++p) {
3317 p->second.take_report_messages(ls);
3318 }
3319 failure_info.clear();
3320 }
3321
3322
3323 // boot --
3324
// Validate an MOSDBoot message.  Returns true when the message has been
// fully handled here (ignored, or answered as a duplicate/stale boot),
// and false when it must go on to prepare_boot() for a map change.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.  treat as a duplicate boot: reply, but don't touch the map
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a different osd previously claimed this id (different fsid)?
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up interval?
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3447
// Effect the map changes for a booting osd.  If a previous instance is
// still marked up, queue it down first and retry this op after the
// proposal; otherwise record the new instance's addresses, weight, uuid,
// metadata, clean interval and laggy statistics in the pending
// incremental.  Always returns true.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective old state = committed state xor any pending state toggles
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot once the down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the exponentially-decayed laggy statistics in xinfo
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // first boot of this instance: decay the laggy estimates
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after having been marked down: fold the observed down
      // interval into the laggy estimates
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval = g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	// restore the pre-out weight if one was saved
	if (xi.old_weight > 0) {
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3602
3603 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3604 {
3605 op->mark_osdmon_event(__func__);
3606 auto m = op->get_req<MOSDBoot>();
3607 dout(7) << "_booted " << m->get_orig_source_inst()
3608 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3609
3610 if (logit) {
3611 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3612 << " boot";
3613 }
3614
3615 send_latest(op, m->sb.current_epoch+1);
3616 }
3617
3618
3619 // -------------
3620 // full
3621
3622 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3623 {
3624 op->mark_osdmon_event(__func__);
3625 auto m = op->get_req<MOSDFull>();
3626 int from = m->get_orig_source().num();
3627 set<string> state;
3628 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3629
3630 // check permissions, ignore if failed
3631 MonSession *session = op->get_session();
3632 if (!session)
3633 goto ignore;
3634 if (!session->is_capable("osd", MON_CAP_X)) {
3635 dout(0) << "MOSDFull from entity with insufficient privileges:"
3636 << session->caps << dendl;
3637 goto ignore;
3638 }
3639
3640 // ignore a full message from the osd instance that already went down
3641 if (!osdmap.exists(from)) {
3642 dout(7) << __func__ << " ignoring full message from nonexistent "
3643 << m->get_orig_source_inst() << dendl;
3644 goto ignore;
3645 }
3646 if ((!osdmap.is_up(from) &&
3647 osdmap.get_most_recent_addrs(from).legacy_equals(
3648 m->get_orig_source_addrs())) ||
3649 (osdmap.is_up(from) &&
3650 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3651 dout(7) << __func__ << " ignoring full message from down "
3652 << m->get_orig_source_inst() << dendl;
3653 goto ignore;
3654 }
3655
3656 OSDMap::calc_state_set(osdmap.get_state(from), state);
3657
3658 if ((osdmap.get_state(from) & mask) == m->state) {
3659 dout(7) << __func__ << " state already " << state << " for osd." << from
3660 << " " << m->get_orig_source_inst() << dendl;
3661 _reply_map(op, m->version);
3662 goto ignore;
3663 }
3664
3665 dout(10) << __func__ << " want state " << state << " for osd." << from
3666 << " " << m->get_orig_source_inst() << dendl;
3667 return false;
3668
3669 ignore:
3670 return true;
3671 }
3672
3673 bool OSDMonitor::prepare_full(MonOpRequestRef op)
3674 {
3675 op->mark_osdmon_event(__func__);
3676 auto m = op->get_req<MOSDFull>();
3677 const int from = m->get_orig_source().num();
3678
3679 const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3680 const unsigned want_state = m->state & mask; // safety first
3681
3682 unsigned cur_state = osdmap.get_state(from);
3683 auto p = pending_inc.new_state.find(from);
3684 if (p != pending_inc.new_state.end()) {
3685 cur_state ^= p->second;
3686 }
3687 cur_state &= mask;
3688
3689 set<string> want_state_set, cur_state_set;
3690 OSDMap::calc_state_set(want_state, want_state_set);
3691 OSDMap::calc_state_set(cur_state, cur_state_set);
3692
3693 if (cur_state != want_state) {
3694 if (p != pending_inc.new_state.end()) {
3695 p->second &= ~mask;
3696 } else {
3697 pending_inc.new_state[from] = 0;
3698 }
3699 pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
3700 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3701 << " -> " << want_state_set << dendl;
3702 } else {
3703 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3704 << " = wanted " << want_state_set << ", just waiting" << dendl;
3705 }
3706
3707 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3708 return true;
3709 }
3710
3711 // -------------
3712 // alive
3713
3714 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3715 {
3716 op->mark_osdmon_event(__func__);
3717 auto m = op->get_req<MOSDAlive>();
3718 int from = m->get_orig_source().num();
3719
3720 // check permissions, ignore if failed
3721 MonSession *session = op->get_session();
3722 if (!session)
3723 goto ignore;
3724 if (!session->is_capable("osd", MON_CAP_X)) {
3725 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3726 << session->caps << dendl;
3727 goto ignore;
3728 }
3729
3730 if (!osdmap.is_up(from) ||
3731 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3732 dout(7) << "preprocess_alive ignoring alive message from down "
3733 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3734 << dendl;
3735 goto ignore;
3736 }
3737
3738 if (osdmap.get_up_thru(from) >= m->want) {
3739 // yup.
3740 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3741 _reply_map(op, m->version);
3742 return true;
3743 }
3744
3745 dout(10) << "preprocess_alive want up_thru " << m->want
3746 << " from " << m->get_orig_source_inst() << dendl;
3747 return false;
3748
3749 ignore:
3750 return true;
3751 }
3752
3753 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3754 {
3755 op->mark_osdmon_event(__func__);
3756 auto m = op->get_req<MOSDAlive>();
3757 int from = m->get_orig_source().num();
3758
3759 if (0) { // we probably don't care much about these
3760 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3761 }
3762
3763 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3764 << " from " << m->get_orig_source_inst() << dendl;
3765
3766 update_up_thru(from, m->version); // set to the latest map the OSD has
3767 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3768 return true;
3769 }
3770
3771 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3772 {
3773 op->mark_osdmon_event(__func__);
3774 dout(7) << "_reply_map " << e
3775 << " from " << op->get_req()->get_orig_source_inst()
3776 << dendl;
3777 send_latest(op, e);
3778 }
3779
3780 // pg_created
3781 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3782 {
3783 op->mark_osdmon_event(__func__);
3784 auto m = op->get_req<MOSDPGCreated>();
3785 dout(10) << __func__ << " " << *m << dendl;
3786 auto session = op->get_session();
3787 mon->no_reply(op);
3788 if (!session) {
3789 dout(10) << __func__ << ": no monitor session!" << dendl;
3790 return true;
3791 }
3792 if (!session->is_capable("osd", MON_CAP_X)) {
3793 derr << __func__ << " received from entity "
3794 << "with insufficient privileges " << session->caps << dendl;
3795 return true;
3796 }
3797 // always forward the "created!" to the leader
3798 return false;
3799 }
3800
3801 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3802 {
3803 op->mark_osdmon_event(__func__);
3804 auto m = op->get_req<MOSDPGCreated>();
3805 dout(10) << __func__ << " " << *m << dendl;
3806 auto src = m->get_orig_source();
3807 auto from = src.num();
3808 if (!src.is_osd() ||
3809 !mon->osdmon()->osdmap.is_up(from) ||
3810 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3811 m->get_orig_source_addrs())) {
3812 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3813 return false;
3814 }
3815 pending_created_pgs.push_back(m->pgid);
3816 return true;
3817 }
3818
3819 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3820 {
3821 op->mark_osdmon_event(__func__);
3822 auto m = op->get_req<MOSDPGReadyToMerge>();
3823 dout(10) << __func__ << " " << *m << dendl;
3824 const pg_pool_t *pi;
3825 auto session = op->get_session();
3826 if (!session) {
3827 dout(10) << __func__ << ": no monitor session!" << dendl;
3828 goto ignore;
3829 }
3830 if (!session->is_capable("osd", MON_CAP_X)) {
3831 derr << __func__ << " received from entity "
3832 << "with insufficient privileges " << session->caps << dendl;
3833 goto ignore;
3834 }
3835 pi = osdmap.get_pg_pool(m->pgid.pool());
3836 if (!pi) {
3837 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3838 goto ignore;
3839 }
3840 if (pi->get_pg_num() <= m->pgid.ps()) {
3841 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3842 goto ignore;
3843 }
3844 if (pi->get_pg_num() != m->pgid.ps() + 1) {
3845 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
3846 goto ignore;
3847 }
3848 if (pi->get_pg_num_pending() > m->pgid.ps()) {
3849 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
3850 goto ignore;
3851 }
3852 return false;
3853
3854 ignore:
3855 mon->no_reply(op);
3856 return true;
3857 }
3858
// Apply an osd's ready-to-merge (or not-ready) notification for the
// highest pg of a pool whose pg_num is being decreased.  Always returns
// true (a change or retry is always queued).
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // start from the pending pool state if it is already being modified
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // complete the merge: actually decrease pg_num
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // test hook: occasionally bounce pg_num back up to exercise the
  // merge-cancel path
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3916
3917
3918 // -------------
3919 // pg_temp changes
3920
// Fast-path handling of MOSDPGTemp: decide whether the message can be
// answered or dropped without mutating pending state.  Returns true when
// fully handled here; false hands the op off to prepare_pgtemp().
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // only accept pg_temp from an osd that is up and whose addresses match
  // the current map; anything else is a stale message.
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // a forced pg_temp is always applied; skip the no-change checks below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         osdmap.pg_temp->get(p->first) != p->second ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing would change; just confirm the epoch to the sender.
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
4013
4014 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4015 {
4016 epoch_t old_up_thru = osdmap.get_up_thru(from);
4017 auto ut = pending_inc.new_up_thru.find(from);
4018 if (ut != pending_inc.new_up_thru.end()) {
4019 old_up_thru = ut->second;
4020 }
4021 if (up_thru > old_up_thru) {
4022 // set up_thru too, so the osd doesn't have to ask again
4023 pending_inc.new_up_thru[from] = up_thru;
4024 }
4025 }
4026
// Apply the pg_temp mappings from an MOSDPGTemp into pending_inc and
// reply with the map once the proposal commits.  Always returns true.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGTemp>();
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // skip pools being deleted in this same proposal...
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool pending removal" << dendl;
      continue;
    }
    // ...and pools that are already gone from the committed map.
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
        pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
4062
4063
4064 // ---
4065
// Fast-path for MRemoveSnaps: if every requested snap is already recorded
// as removed (and not newer than the pool's snap_seq), acknowledge without
// proposing; otherwise return false so prepare_remove_snaps() runs.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        cct,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
        session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
               << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      // anything new (beyond snap_seq, or not yet marked removed) needs
      // a map update -> go to the prepare phase.
      if (*p > pi->get_snap_seq() ||
          !_is_removed_snap(q->first, *p)) {
        return false;
      }
    }
  }

  // octopus+ osds expect an explicit ack even when nothing changed
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon->send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4115
// Queue requested snap removals into the pending incremental.  For each
// snap that is not already removed (in the committed map, the pending
// pool, or the pending removed-snaps queue), update the pool's snap
// bookkeeping and add it to new_removed_snaps.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
               << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in committed or pending state
      if (!_is_removed_snap(pool, s) &&
          (!pending_inc.new_pools.count(pool) ||
           !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
          (!pending_inc.new_removed_snaps.count(pool) ||
           !pending_inc.new_removed_snaps[pool].contains(s))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
        // pre-octopus maps track removed snaps in the pool itself
        if (osdmap.require_osd_release < ceph_release_t::octopus) {
          newpi->removed_snaps.insert(s);
          dout(10) << " pool " << pool << " removed_snaps added " << s
                   << " (now " << newpi->removed_snaps << ")" << dendl;
        }
        newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
        // keep snap_seq at least as large as any removed snap id
        if (s > newpi->get_snap_seq()) {
          dout(10) << " pool " << pool << " snap_seq "
                   << newpi->get_snap_seq() << " -> " << s << dendl;
          newpi->set_snap_seq(s);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
        dout(10) << " added pool " << pool << " snap " << s
                 << " to removed_snaps queue" << dendl;
        pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ osds expect an ack listing the snaps once the map commits
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4164
4165 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4166 {
4167 op->mark_osdmon_event(__func__);
4168 auto m = op->get_req<MMonGetPurgedSnaps>();
4169 dout(7) << __func__ << " " << *m << dendl;
4170
4171 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4172
4173 string k = make_purged_snap_epoch_key(m->start);
4174 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4175 it->upper_bound(k);
4176 unsigned long epoch = m->last;
4177 while (it->valid()) {
4178 if (it->key().find("purged_epoch_") != 0) {
4179 break;
4180 }
4181 string k = it->key();
4182 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4183 if (n != 1) {
4184 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4185 } else if (epoch > m->last) {
4186 break;
4187 } else {
4188 bufferlist bl = it->value();
4189 auto p = bl.cbegin();
4190 auto &v = r[epoch];
4191 try {
4192 ceph::decode(v, p);
4193 } catch (buffer::error& e) {
4194 derr << __func__ << " unable to parse value for key '" << it->key()
4195 << "': \n";
4196 bl.hexdump(*_dout);
4197 *_dout << dendl;
4198 }
4199 n += 4 + v.size() * 16;
4200 }
4201 if (n > 1048576) {
4202 // impose a semi-arbitrary limit to message size
4203 break;
4204 }
4205 it->next();
4206 }
4207
4208 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4209 reply->purged_snaps.swap(r);
4210 mon->send_reply(op, reply.detach());
4211
4212 return true;
4213 }
4214
4215 // osd beacon
4216 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4217 {
4218 op->mark_osdmon_event(__func__);
4219 // check caps
4220 auto session = op->get_session();
4221 mon->no_reply(op);
4222 if (!session) {
4223 dout(10) << __func__ << " no monitor session!" << dendl;
4224 return true;
4225 }
4226 if (!session->is_capable("osd", MON_CAP_X)) {
4227 derr << __func__ << " received from entity "
4228 << "with insufficient privileges " << session->caps << dendl;
4229 return true;
4230 }
4231 // Always forward the beacon to the leader, even if they are the same as
4232 // the old one. The leader will mark as down osds that haven't sent
4233 // beacon for a few minutes.
4234 return false;
4235 }
4236
// Leader-side beacon handling: refresh the osd's liveness bookkeeping and
// last-clean-epoch reports.  Returns true (propose) only when the beacon
// advances last_purged_snaps_scrub in the osd's xinfo.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
           << " from " << src << dendl;
  int from = src.num();

  // drop beacons from senders that are not an up osd with matching addrs
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // in-memory liveness state only; no map change needed for these
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer purged-snaps scrub stamp via the pending incremental
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4277
4278 // ---------------
4279 // map helpers
4280
4281 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4282 {
4283 op->mark_osdmon_event(__func__);
4284 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4285 << " start " << start << dendl;
4286 if (start == 0)
4287 send_full(op);
4288 else
4289 send_incremental(op, start);
4290 }
4291
4292
4293 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4294 {
4295 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4296 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4297 r->oldest_map = get_first_committed();
4298 r->newest_map = osdmap.get_epoch();
4299 return r;
4300 }
4301
// Build an MOSDMap with incremental maps for epochs [from..to], encoded
// for 'features'.  If an incremental is missing for an epoch, fall back
// to embedding the full map for that epoch.  Caller owns the result.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
           << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards so the e > 0 guard also protects against underflow
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental for this epoch; must have the full map instead
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
        dout(20) << "build_incremental   full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4335
4336 void OSDMonitor::send_full(MonOpRequestRef op)
4337 {
4338 op->mark_osdmon_event(__func__);
4339 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4340 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
4341 }
4342
4343 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4344 {
4345 op->mark_osdmon_event(__func__);
4346
4347 MonSession *s = op->get_session();
4348 ceph_assert(s);
4349
4350 if (s->proxy_con) {
4351 // oh, we can tell the other mon to do it
4352 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4353 << first << dendl;
4354 MRoute *r = new MRoute(s->proxy_tid, NULL);
4355 r->send_osdmap_first = first;
4356 s->proxy_con->send_message(r);
4357 op->mark_event("reply: send routed send_osdmap_first reply");
4358 } else {
4359 // do it ourselves
4360 send_incremental(first, s, false, op);
4361 }
4362 }
4363
// Send incremental maps [first..current] to a session, in chunks of
// osd_map_message_max.  If 'req' is set, exactly one reply message is
// sent (the requester will ask again for the rest); if 'onetime', only
// the first chunk is sent.  Updates session->osd_epoch as we go.
void OSDMonitor::send_incremental(epoch_t first,
                                  MonSession *session,
                                  bool onetime,
                                  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // don't resend epochs the session is known to already have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
             << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested epochs have been trimmed; start from a full map at our
    // oldest committed epoch instead.
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // one reply per request; the peer re-requests for more
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
                                     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // req implies a single reply; onetime implies a single chunk
    if (onetime || req)
      break;
  }
}
4426
4427 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4428 {
4429 return get_version(ver, mon->get_quorum_con_features(), bl);
4430 }
4431
4432 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4433 {
4434 OSDMap::Incremental inc;
4435 auto q = bl.cbegin();
4436 inc.decode(q);
4437 // always encode with subset of osdmap's canonical features
4438 uint64_t f = features & inc.encode_features;
4439 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4440 << dendl;
4441 bl.clear();
4442 if (inc.fullmap.length()) {
4443 // embedded full map?
4444 OSDMap m;
4445 m.decode(inc.fullmap);
4446 inc.fullmap.clear();
4447 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4448 }
4449 if (inc.crush.length()) {
4450 // embedded crush map
4451 CrushWrapper c;
4452 auto p = inc.crush.cbegin();
4453 c.decode(p);
4454 inc.crush.clear();
4455 c.encode(inc.crush, f);
4456 }
4457 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4458 }
4459
4460 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4461 {
4462 OSDMap m;
4463 auto q = bl.cbegin();
4464 m.decode(q);
4465 // always encode with subset of osdmap's canonical features
4466 uint64_t f = features & m.get_encoding_features();
4467 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4468 << dendl;
4469 bl.clear();
4470 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4471 }
4472
4473 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4474 {
4475 uint64_t significant_features = OSDMap::get_significant_features(features);
4476 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4477 return 0;
4478 }
4479 int ret = PaxosService::get_version(ver, bl);
4480 if (ret < 0) {
4481 return ret;
4482 }
4483 // NOTE: this check is imprecise; the OSDMap encoding features may
4484 // be a subset of the latest mon quorum features, but worst case we
4485 // reencode once and then cache the (identical) result under both
4486 // feature masks.
4487 if (significant_features !=
4488 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4489 reencode_incremental_map(bl, features);
4490 }
4491 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4492 return 0;
4493 }
4494
4495 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4496 {
4497 bufferlist inc_bl;
4498 int err = get_version(ver, inc_bl);
4499 ceph_assert(err == 0);
4500 ceph_assert(inc_bl.length());
4501
4502 auto p = inc_bl.cbegin();
4503 inc.decode(p);
4504 dout(10) << __func__ << " "
4505 << " epoch " << inc.epoch
4506 << " inc_crc " << inc.inc_crc
4507 << " full_crc " << inc.full_crc
4508 << " encode_features " << inc.encode_features << dendl;
4509 return 0;
4510 }
4511
// Reconstruct the full osdmap for epoch 'ver' from the closest pinned
// (or cached) full map at or below it, replaying incrementals on top.
// Used when the full map itself has been trimmed from the store.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // Prefer a cached full map between the pinned epoch and 'ver' to reduce
  // the number of incrementals we must replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                              &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    // a pinned map must exist in the store
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  // remember the last incremental's encode features so the final
  // re-encode matches the canonical encoding
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4611
4612 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4613 {
4614 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4615 }
4616
// Fetch the full map for 'ver' encoded for 'features'.  Checks the
// per-(epoch, significant-features) cache first; if the store no longer
// has the full map (trimmed), rebuilds it from a pinned map.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
                                 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4643
4644 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4645 {
4646 dout(10) << "blacklist " << av << " until " << until << dendl;
4647 for (auto a : av.v) {
4648 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4649 a.set_type(entity_addr_t::TYPE_ANY);
4650 } else {
4651 a.set_type(entity_addr_t::TYPE_LEGACY);
4652 }
4653 pending_inc.new_blacklist[a] = until;
4654 }
4655 return pending_inc.epoch;
4656 }
4657
4658 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4659 {
4660 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4661 a.set_type(entity_addr_t::TYPE_ANY);
4662 } else {
4663 a.set_type(entity_addr_t::TYPE_LEGACY);
4664 }
4665 dout(10) << "blacklist " << a << " until " << until << dendl;
4666 pending_inc.new_blacklist[a] = until;
4667 return pending_inc.epoch;
4668 }
4669
4670
// Walk all "osdmap" subscriptions and send any maps the subscribers are
// missing.
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    return;
  }
  auto osdmap_subs = mon->session_map.subs.find("osdmap");
  if (osdmap_subs == mon->session_map.subs.end()) {
    return;
  }
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    // advance before the callback: check_osdmap_sub() may remove a
    // onetime sub, which would invalidate the current position
    ++p;
    check_osdmap_sub(sub);
  }
}
4688
4689 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4690 {
4691 dout(10) << __func__ << " " << sub << " next " << sub->next
4692 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4693 if (sub->next <= osdmap.get_epoch()) {
4694 if (sub->next >= 1)
4695 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4696 else
4697 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4698 if (sub->onetime)
4699 mon->session_map.remove_sub(sub);
4700 else
4701 sub->next = osdmap.get_epoch() + 1;
4702 }
4703 }
4704
// Walk all "osd_pg_creates" subscriptions and send pending pg-create
// messages to subscribed (up) osds.
void OSDMonitor::check_pg_creates_subs()
{
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  // all up osds must support stateful subscriptions for this to work
  ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
        return;
      }
      for (auto sub : *pg_creates_subs->second) {
        check_pg_creates_sub(sub);
      }
    });
}
4721
4722 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4723 {
4724 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4725 ceph_assert(sub->type == "osd_pg_creates");
4726 // only send these if the OSD is up. we will check_subs() when they do
4727 // come up so they will get the creates then.
4728 if (sub->session->name.is_osd() &&
4729 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4730 sub->next = send_pg_creates(sub->session->name.num(),
4731 sub->session->con.get(),
4732 sub->next);
4733 }
4734 }
4735
// Enable (or annotate) an application on a pool in the pending
// incremental.  With 'force', an existing app_key value is overwritten;
// otherwise insert() leaves any pre-existing entry untouched.  Must be
// called with paxos plugged and the service writeable.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
                                       const std::string &app_key,
                                       const std::string &app_value,
                                       bool force)
{
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  // application metadata only exists from luminous on
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // copy-on-write: start from the pending pool if this proposal already
  // modified it, else from the committed pool
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // just enable the application, no key/value metadata
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // overwrite any existing value for this key
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() is a no-op if the application already exists
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4769
4770 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4771 pool_opts_t::key_t opt,
4772 pool_opts_t::value_t val)
4773 {
4774 auto p = pending_inc.new_pools.try_emplace(
4775 pool_id, *osdmap.get_pg_pool(pool_id));
4776 p.first->second.opts.set(opt, val);
4777 }
4778
// Scan 'pools' for pools whose pgs still need creating and queue them in
// 'creating_pgs'.  Pools already created, being removed, unchanged since
// the last scan, or with a broken crush rule are skipped.  Returns the
// number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    // pools whose crush rule cannot be resolved can't map pgs; skip them
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
                                         pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // skip pools untouched since our last scan
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
               << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
               << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
             << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
                              created, modified);
    queued++;
  }
  return queued;
}
4818
// Rebuild creating_pgs_by_osd_epoch from creating_pgs using the current
// pg-to-osd mapping: for each creating pg, decide which acting primary
// should receive its create message and at which epoch.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
           << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
               << dendl;
      continue;
    }
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
        if (pgs.second.count(spgid)) {
          if (last_acting_primary == acting_primary) {
            // same target as before: keep the previously recorded epoch
            mapped = pgs.first;
          } else {
            dout(20) << __func__ << " " << pgid << " "
                     << " acting_primary:" << last_acting_primary
                     << " -> " << acting_primary << dendl;
            // note epoch if the target of the create message changed.
            mapped = mapping.get_epoch();
          }
          break;
        } else {
          // newly creating
          // NOTE(review): this runs for every epoch bucket that does NOT
          // contain spgid, repeatedly resetting 'mapped'; presumably only
          // the final outcome matters — confirm intent before touching.
          mapped = mapping.get_epoch();
        }
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
             << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4866
4867 epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
4868 {
4869 dout(30) << __func__ << " osd." << osd << " next=" << next
4870 << " " << creating_pgs_by_osd_epoch << dendl;
4871 std::lock_guard<std::mutex> l(creating_pgs_lock);
4872 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
4873 dout(20) << __func__
4874 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
4875 // the subscribers will be updated when the mapping is completed anyway
4876 return next;
4877 }
4878 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
4879 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
4880 return next;
4881 ceph_assert(!creating_pgs_by_epoch->second.empty());
4882
4883 MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
4884 MOSDPGCreate2 *m = nullptr;
4885
4886 bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
4887
4888 epoch_t last = 0;
4889 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
4890 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
4891 auto epoch = epoch_pgs->first;
4892 auto& pgs = epoch_pgs->second;
4893 dout(20) << __func__ << " osd." << osd << " from " << next
4894 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
4895 last = epoch;
4896 for (auto& pg : pgs) {
4897 // Need the create time from the monitor using its clock to set
4898 // last_scrub_stamp upon pg creation.
4899 auto create = creating_pgs.pgs.find(pg.pgid);
4900 ceph_assert(create != creating_pgs.pgs.end());
4901 if (old) {
4902 if (!oldm) {
4903 oldm = new MOSDPGCreate(creating_pgs_epoch);
4904 }
4905 oldm->mkpg.emplace(pg.pgid,
4906 pg_create_t{create->second.create_epoch, pg.pgid, 0});
4907 oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
4908 } else {
4909 if (!m) {
4910 m = new MOSDPGCreate2(creating_pgs_epoch);
4911 }
4912 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
4913 create->second.create_stamp));
4914 if (create->second.history.epoch_created) {
4915 dout(20) << __func__ << " " << pg << " " << create->second.history
4916 << " " << create->second.past_intervals << dendl;
4917 m->pg_extra.emplace(pg, make_pair(create->second.history,
4918 create->second.past_intervals));
4919 }
4920 }
4921 dout(20) << __func__ << " will create " << pg
4922 << " at " << create->second.create_epoch << dendl;
4923 }
4924 }
4925 if (m) {
4926 con->send_message(m);
4927 } else if (oldm) {
4928 con->send_message(oldm);
4929 } else {
4930 dout(20) << __func__ << " osd." << osd << " from " << next
4931 << " has nothing to send" << dendl;
4932 return next;
4933 }
4934
4935 // sub is current through last + 1
4936 return last + 1;
4937 }
4938
4939 // TICK
4940
4941
// Periodic maintenance, driven by the monitor's tick timer.  The first part
// (manifest reload, priority-cache autotuning) runs on every monitor; the
// rest (osd timeout checks, failure checks, auto-out, blacklist expiry, and
// the resulting map proposal) runs only on the leader.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which only the leader may do
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if their beacons have timed out
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance the iterator now: we may erase(o) at the bottom of the loop
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  // extend the grace period for osds with a history of lagginess
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// destroyed osds use a separate (typically shorter) out interval
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  // still within grace; keep tracking it
	  continue;
      }

      // reached when the osd is no longer down+in+markable, or just after we
      // marked it out above: stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
    propose_pending();
}
5106
5107 void OSDMonitor::_set_new_cache_sizes()
5108 {
5109 uint64_t cache_size = 0;
5110 int64_t inc_alloc = 0;
5111 int64_t full_alloc = 0;
5112 int64_t kv_alloc = 0;
5113
5114 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5115 cache_size = pcm->get_tuned_mem();
5116 inc_alloc = inc_cache->get_committed_size();
5117 full_alloc = full_cache->get_committed_size();
5118 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5119 }
5120
5121 inc_osd_cache.set_bytes(inc_alloc);
5122 full_osd_cache.set_bytes(full_alloc);
5123
5124 dout(1) << __func__ << " cache_size:" << cache_size
5125 << " inc_alloc: " << inc_alloc
5126 << " full_alloc: " << full_alloc
5127 << " kv_alloc: " << kv_alloc
5128 << dendl;
5129 }
5130
5131 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5132 std::map<int,utime_t> &last_osd_report)
5133 {
5134 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5135 if (now - mon->get_leader_since() < timeo) {
5136 // We haven't been the leader for long enough to consider OSD timeouts
5137 return false;
5138 }
5139
5140 int max_osd = osdmap.get_max_osd();
5141 bool new_down = false;
5142
5143 for (int i=0; i < max_osd; ++i) {
5144 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5145 if (!osdmap.exists(i)) {
5146 last_osd_report.erase(i); // if any
5147 continue;
5148 }
5149 if (!osdmap.is_up(i))
5150 continue;
5151 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
5152 if (t == last_osd_report.end()) {
5153 // it wasn't in the map; start the timer.
5154 last_osd_report[i] = now;
5155 } else if (can_mark_down(i)) {
5156 utime_t diff = now - t->second;
5157 if (diff > timeo) {
5158 mon->clog->info() << "osd." << i << " marked down after no beacon for "
5159 << diff << " seconds";
5160 derr << "no beacon from osd." << i << " since " << t->second
5161 << ", " << diff << " seconds ago. marking down" << dendl;
5162 pending_inc.new_state[i] = CEPH_OSD_UP;
5163 new_down = true;
5164 }
5165 }
5166 }
5167 return new_down;
5168 }
5169
5170 static void dump_cpu_list(Formatter *f, const char *name,
5171 const string& strlist)
5172 {
5173 cpu_set_t cpu_set;
5174 size_t cpu_set_size;
5175 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5176 return;
5177 }
5178 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5179 f->open_array_section(name);
5180 for (auto cpu : cpus) {
5181 f->dump_int("cpu", cpu);
5182 }
5183 f->close_section();
5184 }
5185
// Dump the monitor's osdmap-related state into the given formatter: the
// osdmap itself, per-osd metadata, clean-epoch tracking, committed-version
// bounds, the crush map, and (when loaded) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // per-osd metadata for every existing osd id; errors are ignored here
  // (NULL error stream)
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  // per-osd epoch as tracked in osd_epochs
  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // only emitted when the osdmap manifest has been loaded from the store
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5234
namespace {
  // All settable/gettable per-pool properties recognized by
  // "osd pool get"; the string spellings live in the ALL_CHOICES map below.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Set difference: the members of |first| that are absent from |second|.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	remaining.insert(choice);
      }
    }
    return remaining;
  }
}
5268
5269
5270 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5271 {
5272 op->mark_osdmon_event(__func__);
5273 auto m = op->get_req<MMonCommand>();
5274 int r = 0;
5275 bufferlist rdata;
5276 stringstream ss, ds;
5277
5278 cmdmap_t cmdmap;
5279 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5280 string rs = ss.str();
5281 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5282 return true;
5283 }
5284
5285 MonSession *session = op->get_session();
5286 if (!session) {
5287 derr << __func__ << " no session" << dendl;
5288 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5289 return true;
5290 }
5291
5292 string prefix;
5293 cmd_getval(cmdmap, "prefix", prefix);
5294
5295 string format;
5296 cmd_getval(cmdmap, "format", format, string("plain"));
5297 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5298
5299 if (prefix == "osd stat") {
5300 if (f) {
5301 f->open_object_section("osdmap");
5302 osdmap.print_summary(f.get(), ds, "", true);
5303 f->close_section();
5304 f->flush(rdata);
5305 } else {
5306 osdmap.print_summary(nullptr, ds, "", true);
5307 rdata.append(ds);
5308 }
5309 }
5310 else if (prefix == "osd dump" ||
5311 prefix == "osd tree" ||
5312 prefix == "osd tree-from" ||
5313 prefix == "osd ls" ||
5314 prefix == "osd getmap" ||
5315 prefix == "osd getcrushmap" ||
5316 prefix == "osd ls-tree" ||
5317 prefix == "osd info") {
5318 string val;
5319
5320 epoch_t epoch = 0;
5321 int64_t epochnum;
5322 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5323 epoch = epochnum;
5324
5325 bufferlist osdmap_bl;
5326 int err = get_version_full(epoch, osdmap_bl);
5327 if (err == -ENOENT) {
5328 r = -ENOENT;
5329 ss << "there is no map for epoch " << epoch;
5330 goto reply;
5331 }
5332 ceph_assert(err == 0);
5333 ceph_assert(osdmap_bl.length());
5334
5335 OSDMap *p;
5336 if (epoch == osdmap.get_epoch()) {
5337 p = &osdmap;
5338 } else {
5339 p = new OSDMap;
5340 p->decode(osdmap_bl);
5341 }
5342
5343 auto sg = make_scope_guard([&] {
5344 if (p != &osdmap) {
5345 delete p;
5346 }
5347 });
5348
5349 if (prefix == "osd dump") {
5350 stringstream ds;
5351 if (f) {
5352 f->open_object_section("osdmap");
5353 p->dump(f.get());
5354 f->close_section();
5355 f->flush(ds);
5356 } else {
5357 p->print(ds);
5358 }
5359 rdata.append(ds);
5360 if (!f)
5361 ds << " ";
5362 } else if (prefix == "osd ls") {
5363 if (f) {
5364 f->open_array_section("osds");
5365 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5366 if (osdmap.exists(i)) {
5367 f->dump_int("osd", i);
5368 }
5369 }
5370 f->close_section();
5371 f->flush(ds);
5372 } else {
5373 bool first = true;
5374 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5375 if (osdmap.exists(i)) {
5376 if (!first)
5377 ds << "\n";
5378 first = false;
5379 ds << i;
5380 }
5381 }
5382 }
5383 rdata.append(ds);
5384 } else if (prefix == "osd info") {
5385 int64_t osd_id;
5386 bool do_single_osd = true;
5387 if (!cmd_getval(cmdmap, "id", osd_id)) {
5388 do_single_osd = false;
5389 }
5390
5391 if (do_single_osd && !osdmap.exists(osd_id)) {
5392 ss << "osd." << osd_id << " does not exist";
5393 r = -EINVAL;
5394 goto reply;
5395 }
5396
5397 if (f) {
5398 if (do_single_osd) {
5399 osdmap.dump_osd(osd_id, f.get());
5400 } else {
5401 osdmap.dump_osds(f.get());
5402 }
5403 f->flush(ds);
5404 } else {
5405 if (do_single_osd) {
5406 osdmap.print_osd(osd_id, ds);
5407 } else {
5408 osdmap.print_osds(ds);
5409 }
5410 }
5411 rdata.append(ds);
5412 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5413 string bucket;
5414 if (prefix == "osd tree-from") {
5415 cmd_getval(cmdmap, "bucket", bucket);
5416 if (!osdmap.crush->name_exists(bucket)) {
5417 ss << "bucket '" << bucket << "' does not exist";
5418 r = -ENOENT;
5419 goto reply;
5420 }
5421 int id = osdmap.crush->get_item_id(bucket);
5422 if (id >= 0) {
5423 ss << "\"" << bucket << "\" is not a bucket";
5424 r = -EINVAL;
5425 goto reply;
5426 }
5427 }
5428
5429 vector<string> states;
5430 cmd_getval(cmdmap, "states", states);
5431 unsigned filter = 0;
5432 for (auto& s : states) {
5433 if (s == "up") {
5434 filter |= OSDMap::DUMP_UP;
5435 } else if (s == "down") {
5436 filter |= OSDMap::DUMP_DOWN;
5437 } else if (s == "in") {
5438 filter |= OSDMap::DUMP_IN;
5439 } else if (s == "out") {
5440 filter |= OSDMap::DUMP_OUT;
5441 } else if (s == "destroyed") {
5442 filter |= OSDMap::DUMP_DESTROYED;
5443 } else {
5444 ss << "unrecognized state '" << s << "'";
5445 r = -EINVAL;
5446 goto reply;
5447 }
5448 }
5449 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5450 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5451 ss << "cannot specify both 'in' and 'out'";
5452 r = -EINVAL;
5453 goto reply;
5454 }
5455 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5456 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5457 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5458 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5459 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5460 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5461 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5462 r = -EINVAL;
5463 goto reply;
5464 }
5465 if (f) {
5466 f->open_object_section("tree");
5467 p->print_tree(f.get(), NULL, filter, bucket);
5468 f->close_section();
5469 f->flush(ds);
5470 } else {
5471 p->print_tree(NULL, &ds, filter, bucket);
5472 }
5473 rdata.append(ds);
5474 } else if (prefix == "osd getmap") {
5475 rdata.append(osdmap_bl);
5476 ss << "got osdmap epoch " << p->get_epoch();
5477 } else if (prefix == "osd getcrushmap") {
5478 p->crush->encode(rdata, mon->get_quorum_con_features());
5479 ss << p->get_crush_version();
5480 } else if (prefix == "osd ls-tree") {
5481 string bucket_name;
5482 cmd_getval(cmdmap, "name", bucket_name);
5483 set<int> osds;
5484 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5485 if (r == -ENOENT) {
5486 ss << "\"" << bucket_name << "\" does not exist";
5487 goto reply;
5488 } else if (r < 0) {
5489 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5490 goto reply;
5491 }
5492
5493 if (f) {
5494 f->open_array_section("osds");
5495 for (auto &i : osds) {
5496 if (osdmap.exists(i)) {
5497 f->dump_int("osd", i);
5498 }
5499 }
5500 f->close_section();
5501 f->flush(ds);
5502 } else {
5503 bool first = true;
5504 for (auto &i : osds) {
5505 if (osdmap.exists(i)) {
5506 if (!first)
5507 ds << "\n";
5508 first = false;
5509 ds << i;
5510 }
5511 }
5512 }
5513
5514 rdata.append(ds);
5515 }
5516 } else if (prefix == "osd getmaxosd") {
5517 if (f) {
5518 f->open_object_section("getmaxosd");
5519 f->dump_unsigned("epoch", osdmap.get_epoch());
5520 f->dump_int("max_osd", osdmap.get_max_osd());
5521 f->close_section();
5522 f->flush(rdata);
5523 } else {
5524 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5525 rdata.append(ds);
5526 }
5527 } else if (prefix == "osd utilization") {
5528 string out;
5529 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5530 if (f)
5531 f->flush(rdata);
5532 else
5533 rdata.append(out);
5534 r = 0;
5535 goto reply;
5536 } else if (prefix == "osd find") {
5537 int64_t osd;
5538 if (!cmd_getval(cmdmap, "id", osd)) {
5539 ss << "unable to parse osd id value '"
5540 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5541 r = -EINVAL;
5542 goto reply;
5543 }
5544 if (!osdmap.exists(osd)) {
5545 ss << "osd." << osd << " does not exist";
5546 r = -ENOENT;
5547 goto reply;
5548 }
5549 string format;
5550 cmd_getval(cmdmap, "format", format);
5551 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5552 f->open_object_section("osd_location");
5553 f->dump_int("osd", osd);
5554 f->dump_object("addrs", osdmap.get_addrs(osd));
5555 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5556
5557 // try to identify host, pod/container name, etc.
5558 map<string,string> m;
5559 load_metadata(osd, m, nullptr);
5560 if (auto p = m.find("hostname"); p != m.end()) {
5561 f->dump_string("host", p->second);
5562 }
5563 for (auto& k : {
5564 "pod_name", "pod_namespace", // set by rook
5565 "container_name" // set by cephadm, ceph-ansible
5566 }) {
5567 if (auto p = m.find(k); p != m.end()) {
5568 f->dump_string(k, p->second);
5569 }
5570 }
5571
5572 // crush is helpful too
5573 f->open_object_section("crush_location");
5574 map<string,string> loc = osdmap.crush->get_full_location(osd);
5575 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5576 f->dump_string(p->first.c_str(), p->second);
5577 f->close_section();
5578 f->close_section();
5579 f->flush(rdata);
5580 } else if (prefix == "osd metadata") {
5581 int64_t osd = -1;
5582 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5583 !cmd_getval(cmdmap, "id", osd)) {
5584 ss << "unable to parse osd id value '"
5585 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5586 r = -EINVAL;
5587 goto reply;
5588 }
5589 if (osd >= 0 && !osdmap.exists(osd)) {
5590 ss << "osd." << osd << " does not exist";
5591 r = -ENOENT;
5592 goto reply;
5593 }
5594 string format;
5595 cmd_getval(cmdmap, "format", format);
5596 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5597 if (osd >= 0) {
5598 f->open_object_section("osd_metadata");
5599 f->dump_unsigned("id", osd);
5600 r = dump_osd_metadata(osd, f.get(), &ss);
5601 if (r < 0)
5602 goto reply;
5603 f->close_section();
5604 } else {
5605 r = 0;
5606 f->open_array_section("osd_metadata");
5607 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5608 if (osdmap.exists(i)) {
5609 f->open_object_section("osd");
5610 f->dump_unsigned("id", i);
5611 r = dump_osd_metadata(i, f.get(), NULL);
5612 if (r == -EINVAL || r == -ENOENT) {
5613 // Drop error, continue to get other daemons' metadata
5614 dout(4) << "No metadata for osd." << i << dendl;
5615 r = 0;
5616 } else if (r < 0) {
5617 // Unexpected error
5618 goto reply;
5619 }
5620 f->close_section();
5621 }
5622 }
5623 f->close_section();
5624 }
5625 f->flush(rdata);
5626 } else if (prefix == "osd versions") {
5627 if (!f)
5628 f.reset(Formatter::create("json-pretty"));
5629 count_metadata("ceph_version", f.get());
5630 f->flush(rdata);
5631 r = 0;
5632 } else if (prefix == "osd count-metadata") {
5633 if (!f)
5634 f.reset(Formatter::create("json-pretty"));
5635 string field;
5636 cmd_getval(cmdmap, "property", field);
5637 count_metadata(field, f.get());
5638 f->flush(rdata);
5639 r = 0;
5640 } else if (prefix == "osd numa-status") {
5641 TextTable tbl;
5642 if (f) {
5643 f->open_array_section("osds");
5644 } else {
5645 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5646 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5647 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5648 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5649 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5650 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5651 }
5652 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5653 if (osdmap.exists(i)) {
5654 map<string,string> m;
5655 ostringstream err;
5656 if (load_metadata(i, m, &err) < 0) {
5657 continue;
5658 }
5659 string host;
5660 auto p = m.find("hostname");
5661 if (p != m.end()) {
5662 host = p->second;
5663 }
5664 if (f) {
5665 f->open_object_section("osd");
5666 f->dump_int("osd", i);
5667 f->dump_string("host", host);
5668 for (auto n : { "network_numa_node", "objectstore_numa_node",
5669 "numa_node" }) {
5670 p = m.find(n);
5671 if (p != m.end()) {
5672 f->dump_int(n, atoi(p->second.c_str()));
5673 }
5674 }
5675 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5676 p = m.find(n);
5677 if (p != m.end()) {
5678 list<string> ls = get_str_list(p->second, ",");
5679 f->open_array_section(n);
5680 for (auto node : ls) {
5681 f->dump_int("node", atoi(node.c_str()));
5682 }
5683 f->close_section();
5684 }
5685 }
5686 for (auto n : { "numa_node_cpus" }) {
5687 p = m.find(n);
5688 if (p != m.end()) {
5689 dump_cpu_list(f.get(), n, p->second);
5690 }
5691 }
5692 f->close_section();
5693 } else {
5694 tbl << i;
5695 tbl << host;
5696 p = m.find("network_numa_nodes");
5697 if (p != m.end()) {
5698 tbl << p->second;
5699 } else {
5700 tbl << "-";
5701 }
5702 p = m.find("objectstore_numa_nodes");
5703 if (p != m.end()) {
5704 tbl << p->second;
5705 } else {
5706 tbl << "-";
5707 }
5708 p = m.find("numa_node");
5709 auto q = m.find("numa_node_cpus");
5710 if (p != m.end() && q != m.end()) {
5711 tbl << p->second;
5712 tbl << q->second;
5713 } else {
5714 tbl << "-";
5715 tbl << "-";
5716 }
5717 tbl << TextTable::endrow;
5718 }
5719 }
5720 }
5721 if (f) {
5722 f->close_section();
5723 f->flush(rdata);
5724 } else {
5725 rdata.append(stringify(tbl));
5726 }
5727 } else if (prefix == "osd map") {
5728 string poolstr, objstr, namespacestr;
5729 cmd_getval(cmdmap, "pool", poolstr);
5730 cmd_getval(cmdmap, "object", objstr);
5731 cmd_getval(cmdmap, "nspace", namespacestr);
5732
5733 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5734 if (pool < 0) {
5735 ss << "pool " << poolstr << " does not exist";
5736 r = -ENOENT;
5737 goto reply;
5738 }
5739 object_locator_t oloc(pool, namespacestr);
5740 object_t oid(objstr);
5741 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5742 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5743 vector<int> up, acting;
5744 int up_p, acting_p;
5745 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5746
5747 string fullobjname;
5748 if (!namespacestr.empty())
5749 fullobjname = namespacestr + string("/") + oid.name;
5750 else
5751 fullobjname = oid.name;
5752 if (f) {
5753 f->open_object_section("osd_map");
5754 f->dump_unsigned("epoch", osdmap.get_epoch());
5755 f->dump_string("pool", poolstr);
5756 f->dump_int("pool_id", pool);
5757 f->dump_stream("objname") << fullobjname;
5758 f->dump_stream("raw_pgid") << pgid;
5759 f->dump_stream("pgid") << mpgid;
5760 f->open_array_section("up");
5761 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5762 f->dump_int("osd", *p);
5763 f->close_section();
5764 f->dump_int("up_primary", up_p);
5765 f->open_array_section("acting");
5766 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5767 f->dump_int("osd", *p);
5768 f->close_section();
5769 f->dump_int("acting_primary", acting_p);
5770 f->close_section(); // osd_map
5771 f->flush(rdata);
5772 } else {
5773 ds << "osdmap e" << osdmap.get_epoch()
5774 << " pool '" << poolstr << "' (" << pool << ")"
5775 << " object '" << fullobjname << "' ->"
5776 << " pg " << pgid << " (" << mpgid << ")"
5777 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5778 << pg_vector_string(acting) << ", p" << acting_p << ")";
5779 rdata.append(ds);
5780 }
5781
5782 } else if (prefix == "pg map") {
5783 pg_t pgid;
5784 string pgidstr;
5785 cmd_getval(cmdmap, "pgid", pgidstr);
5786 if (!pgid.parse(pgidstr.c_str())) {
5787 ss << "invalid pgid '" << pgidstr << "'";
5788 r = -EINVAL;
5789 goto reply;
5790 }
5791 vector<int> up, acting;
5792 if (!osdmap.have_pg_pool(pgid.pool())) {
5793 ss << "pg '" << pgidstr << "' does not exist";
5794 r = -ENOENT;
5795 goto reply;
5796 }
5797 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5798 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5799 if (f) {
5800 f->open_object_section("pg_map");
5801 f->dump_unsigned("epoch", osdmap.get_epoch());
5802 f->dump_stream("raw_pgid") << pgid;
5803 f->dump_stream("pgid") << mpgid;
5804 f->open_array_section("up");
5805 for (auto osd : up) {
5806 f->dump_int("up_osd", osd);
5807 }
5808 f->close_section();
5809 f->open_array_section("acting");
5810 for (auto osd : acting) {
5811 f->dump_int("acting_osd", osd);
5812 }
5813 f->close_section();
5814 f->close_section();
5815 f->flush(rdata);
5816 } else {
5817 ds << "osdmap e" << osdmap.get_epoch()
5818 << " pg " << pgid << " (" << mpgid << ")"
5819 << " -> up " << up << " acting " << acting;
5820 rdata.append(ds);
5821 }
5822 goto reply;
5823
5824 } else if (prefix == "osd lspools") {
5825 if (f)
5826 f->open_array_section("pools");
5827 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5828 p != osdmap.pools.end();
5829 ++p) {
5830 if (f) {
5831 f->open_object_section("pool");
5832 f->dump_int("poolnum", p->first);
5833 f->dump_string("poolname", osdmap.pool_name[p->first]);
5834 f->close_section();
5835 } else {
5836 ds << p->first << ' ' << osdmap.pool_name[p->first];
5837 if (next(p) != osdmap.pools.end()) {
5838 ds << '\n';
5839 }
5840 }
5841 }
5842 if (f) {
5843 f->close_section();
5844 f->flush(ds);
5845 }
5846 rdata.append(ds);
5847 } else if (prefix == "osd blacklist ls") {
5848 if (f)
5849 f->open_array_section("blacklist");
5850
5851 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5852 p != osdmap.blacklist.end();
5853 ++p) {
5854 if (f) {
5855 f->open_object_section("entry");
5856 f->dump_string("addr", p->first.get_legacy_str());
5857 f->dump_stream("until") << p->second;
5858 f->close_section();
5859 } else {
5860 stringstream ss;
5861 string s;
5862 ss << p->first << " " << p->second;
5863 getline(ss, s);
5864 s += "\n";
5865 rdata.append(s);
5866 }
5867 }
5868 if (f) {
5869 f->close_section();
5870 f->flush(rdata);
5871 }
5872 ss << "listed " << osdmap.blacklist.size() << " entries";
5873
5874 } else if (prefix == "osd pool ls") {
5875 string detail;
5876 cmd_getval(cmdmap, "detail", detail);
5877 if (!f && detail == "detail") {
5878 ostringstream ss;
5879 osdmap.print_pools(ss);
5880 rdata.append(ss.str());
5881 } else {
5882 if (f)
5883 f->open_array_section("pools");
5884 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5885 it != osdmap.get_pools().end();
5886 ++it) {
5887 if (f) {
5888 if (detail == "detail") {
5889 f->open_object_section("pool");
5890 f->dump_int("pool_id", it->first);
5891 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5892 it->second.dump(f.get());
5893 f->close_section();
5894 } else {
5895 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5896 }
5897 } else {
5898 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5899 }
5900 }
5901 if (f) {
5902 f->close_section();
5903 f->flush(rdata);
5904 }
5905 }
5906
5907 } else if (prefix == "osd crush get-tunable") {
5908 string tunable;
5909 cmd_getval(cmdmap, "tunable", tunable);
5910 ostringstream rss;
5911 if (f)
5912 f->open_object_section("tunable");
5913 if (tunable == "straw_calc_version") {
5914 if (f)
5915 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5916 else
5917 rss << osdmap.crush->get_straw_calc_version() << "\n";
5918 } else {
5919 r = -EINVAL;
5920 goto reply;
5921 }
5922 if (f) {
5923 f->close_section();
5924 f->flush(rdata);
5925 } else {
5926 rdata.append(rss.str());
5927 }
5928 r = 0;
5929
5930 } else if (prefix == "osd pool get") {
5931 string poolstr;
5932 cmd_getval(cmdmap, "pool", poolstr);
5933 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5934 if (pool < 0) {
5935 ss << "unrecognized pool '" << poolstr << "'";
5936 r = -ENOENT;
5937 goto reply;
5938 }
5939
5940 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5941 string var;
5942 cmd_getval(cmdmap, "var", var);
5943
5944 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5945 const choices_map_t ALL_CHOICES = {
5946 {"size", SIZE},
5947 {"min_size", MIN_SIZE},
5948 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
5949 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5950 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
5951 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5952 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5953 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5954 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5955 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5956 {"use_gmt_hitset", USE_GMT_HITSET},
5957 {"target_max_objects", TARGET_MAX_OBJECTS},
5958 {"target_max_bytes", TARGET_MAX_BYTES},
5959 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
5960 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
5961 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
5962 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
5963 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
5964 {"erasure_code_profile", ERASURE_CODE_PROFILE},
5965 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
5966 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
5967 {"fast_read", FAST_READ},
5968 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
5969 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
5970 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
5971 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
5972 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
5973 {"recovery_priority", RECOVERY_PRIORITY},
5974 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
5975 {"scrub_priority", SCRUB_PRIORITY},
5976 {"compression_mode", COMPRESSION_MODE},
5977 {"compression_algorithm", COMPRESSION_ALGORITHM},
5978 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
5979 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
5980 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
5981 {"csum_type", CSUM_TYPE},
5982 {"csum_max_block", CSUM_MAX_BLOCK},
5983 {"csum_min_block", CSUM_MIN_BLOCK},
5984 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
5985 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
5986 {"pg_num_min", PG_NUM_MIN},
5987 {"target_size_bytes", TARGET_SIZE_BYTES},
5988 {"target_size_ratio", TARGET_SIZE_RATIO},
5989 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
5990 };
5991
5992 typedef std::set<osd_pool_get_choices> choices_set_t;
5993
5994 const choices_set_t ONLY_TIER_CHOICES = {
5995 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
5996 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
5997 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
5998 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
5999 MIN_READ_RECENCY_FOR_PROMOTE,
6000 MIN_WRITE_RECENCY_FOR_PROMOTE,
6001 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6002 };
6003 const choices_set_t ONLY_ERASURE_CHOICES = {
6004 EC_OVERWRITES, ERASURE_CODE_PROFILE
6005 };
6006
6007 choices_set_t selected_choices;
6008 if (var == "all") {
6009 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6010 it != ALL_CHOICES.end(); ++it) {
6011 selected_choices.insert(it->second);
6012 }
6013
6014 if(!p->is_tier()) {
6015 selected_choices = subtract_second_from_first(selected_choices,
6016 ONLY_TIER_CHOICES);
6017 }
6018
6019 if(!p->is_erasure()) {
6020 selected_choices = subtract_second_from_first(selected_choices,
6021 ONLY_ERASURE_CHOICES);
6022 }
6023 } else /* var != "all" */ {
6024 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6025 osd_pool_get_choices selected = found->second;
6026
6027 if (!p->is_tier() &&
6028 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6029 ss << "pool '" << poolstr
6030 << "' is not a tier pool: variable not applicable";
6031 r = -EACCES;
6032 goto reply;
6033 }
6034
6035 if (!p->is_erasure() &&
6036 ONLY_ERASURE_CHOICES.find(selected)
6037 != ONLY_ERASURE_CHOICES.end()) {
6038 ss << "pool '" << poolstr
6039 << "' is not a erasure pool: variable not applicable";
6040 r = -EACCES;
6041 goto reply;
6042 }
6043
6044 if (pool_opts_t::is_opt_name(var) &&
6045 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6046 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6047 r = -ENOENT;
6048 goto reply;
6049 }
6050
6051 selected_choices.insert(selected);
6052 }
6053
6054 if (f) {
6055 f->open_object_section("pool");
6056 f->dump_string("pool", poolstr);
6057 f->dump_int("pool_id", pool);
6058 for(choices_set_t::const_iterator it = selected_choices.begin();
6059 it != selected_choices.end(); ++it) {
6060 choices_map_t::const_iterator i;
6061 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6062 if (i->second == *it) {
6063 break;
6064 }
6065 }
6066 ceph_assert(i != ALL_CHOICES.end());
6067 switch(*it) {
6068 case PG_NUM:
6069 f->dump_int("pg_num", p->get_pg_num());
6070 break;
6071 case PGP_NUM:
6072 f->dump_int("pgp_num", p->get_pgp_num());
6073 break;
6074 case SIZE:
6075 f->dump_int("size", p->get_size());
6076 break;
6077 case MIN_SIZE:
6078 f->dump_int("min_size", p->get_min_size());
6079 break;
6080 case CRUSH_RULE:
6081 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6082 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6083 p->get_crush_rule()));
6084 } else {
6085 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6086 }
6087 break;
6088 case EC_OVERWRITES:
6089 f->dump_bool("allow_ec_overwrites",
6090 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6091 break;
6092 case PG_AUTOSCALE_MODE:
6093 f->dump_string("pg_autoscale_mode",
6094 pg_pool_t::get_pg_autoscale_mode_name(
6095 p->pg_autoscale_mode));
6096 break;
6097 case HASHPSPOOL:
6098 case NODELETE:
6099 case NOPGCHANGE:
6100 case NOSIZECHANGE:
6101 case WRITE_FADVISE_DONTNEED:
6102 case NOSCRUB:
6103 case NODEEP_SCRUB:
6104 f->dump_bool(i->first.c_str(),
6105 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6106 break;
6107 case HIT_SET_PERIOD:
6108 f->dump_int("hit_set_period", p->hit_set_period);
6109 break;
6110 case HIT_SET_COUNT:
6111 f->dump_int("hit_set_count", p->hit_set_count);
6112 break;
6113 case HIT_SET_TYPE:
6114 f->dump_string("hit_set_type",
6115 HitSet::get_type_name(p->hit_set_params.get_type()));
6116 break;
6117 case HIT_SET_FPP:
6118 {
6119 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6120 BloomHitSet::Params *bloomp =
6121 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6122 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6123 } else if(var != "all") {
6124 f->close_section();
6125 ss << "hit set is not of type Bloom; " <<
6126 "invalid to get a false positive rate!";
6127 r = -EINVAL;
6128 goto reply;
6129 }
6130 }
6131 break;
6132 case USE_GMT_HITSET:
6133 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6134 break;
6135 case TARGET_MAX_OBJECTS:
6136 f->dump_unsigned("target_max_objects", p->target_max_objects);
6137 break;
6138 case TARGET_MAX_BYTES:
6139 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6140 break;
6141 case CACHE_TARGET_DIRTY_RATIO:
6142 f->dump_unsigned("cache_target_dirty_ratio_micro",
6143 p->cache_target_dirty_ratio_micro);
6144 f->dump_float("cache_target_dirty_ratio",
6145 ((float)p->cache_target_dirty_ratio_micro/1000000));
6146 break;
6147 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6148 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6149 p->cache_target_dirty_high_ratio_micro);
6150 f->dump_float("cache_target_dirty_high_ratio",
6151 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6152 break;
6153 case CACHE_TARGET_FULL_RATIO:
6154 f->dump_unsigned("cache_target_full_ratio_micro",
6155 p->cache_target_full_ratio_micro);
6156 f->dump_float("cache_target_full_ratio",
6157 ((float)p->cache_target_full_ratio_micro/1000000));
6158 break;
6159 case CACHE_MIN_FLUSH_AGE:
6160 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6161 break;
6162 case CACHE_MIN_EVICT_AGE:
6163 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6164 break;
6165 case ERASURE_CODE_PROFILE:
6166 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6167 break;
6168 case MIN_READ_RECENCY_FOR_PROMOTE:
6169 f->dump_int("min_read_recency_for_promote",
6170 p->min_read_recency_for_promote);
6171 break;
6172 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6173 f->dump_int("min_write_recency_for_promote",
6174 p->min_write_recency_for_promote);
6175 break;
6176 case FAST_READ:
6177 f->dump_int("fast_read", p->fast_read);
6178 break;
6179 case HIT_SET_GRADE_DECAY_RATE:
6180 f->dump_int("hit_set_grade_decay_rate",
6181 p->hit_set_grade_decay_rate);
6182 break;
6183 case HIT_SET_SEARCH_LAST_N:
6184 f->dump_int("hit_set_search_last_n",
6185 p->hit_set_search_last_n);
6186 break;
6187 case SCRUB_MIN_INTERVAL:
6188 case SCRUB_MAX_INTERVAL:
6189 case DEEP_SCRUB_INTERVAL:
6190 case RECOVERY_PRIORITY:
6191 case RECOVERY_OP_PRIORITY:
6192 case SCRUB_PRIORITY:
6193 case COMPRESSION_MODE:
6194 case COMPRESSION_ALGORITHM:
6195 case COMPRESSION_REQUIRED_RATIO:
6196 case COMPRESSION_MAX_BLOB_SIZE:
6197 case COMPRESSION_MIN_BLOB_SIZE:
6198 case CSUM_TYPE:
6199 case CSUM_MAX_BLOCK:
6200 case CSUM_MIN_BLOCK:
6201 case FINGERPRINT_ALGORITHM:
6202 case PG_NUM_MIN:
6203 case TARGET_SIZE_BYTES:
6204 case TARGET_SIZE_RATIO:
6205 case PG_AUTOSCALE_BIAS:
6206 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6207 if (p->opts.is_set(key)) {
6208 if(*it == CSUM_TYPE) {
6209 int64_t val;
6210 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6211 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6212 } else {
6213 p->opts.dump(i->first, f.get());
6214 }
6215 }
6216 break;
6217 }
6218 }
6219 f->close_section();
6220 f->flush(rdata);
6221 } else /* !f */ {
6222 for(choices_set_t::const_iterator it = selected_choices.begin();
6223 it != selected_choices.end(); ++it) {
6224 choices_map_t::const_iterator i;
6225 switch(*it) {
6226 case PG_NUM:
6227 ss << "pg_num: " << p->get_pg_num() << "\n";
6228 break;
6229 case PGP_NUM:
6230 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6231 break;
6232 case SIZE:
6233 ss << "size: " << p->get_size() << "\n";
6234 break;
6235 case MIN_SIZE:
6236 ss << "min_size: " << p->get_min_size() << "\n";
6237 break;
6238 case CRUSH_RULE:
6239 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6240 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6241 p->get_crush_rule()) << "\n";
6242 } else {
6243 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6244 }
6245 break;
6246 case PG_AUTOSCALE_MODE:
6247 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6248 p->pg_autoscale_mode) <<"\n";
6249 break;
6250 case HIT_SET_PERIOD:
6251 ss << "hit_set_period: " << p->hit_set_period << "\n";
6252 break;
6253 case HIT_SET_COUNT:
6254 ss << "hit_set_count: " << p->hit_set_count << "\n";
6255 break;
6256 case HIT_SET_TYPE:
6257 ss << "hit_set_type: " <<
6258 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6259 break;
6260 case HIT_SET_FPP:
6261 {
6262 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6263 BloomHitSet::Params *bloomp =
6264 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6265 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6266 } else if(var != "all") {
6267 ss << "hit set is not of type Bloom; " <<
6268 "invalid to get a false positive rate!";
6269 r = -EINVAL;
6270 goto reply;
6271 }
6272 }
6273 break;
6274 case USE_GMT_HITSET:
6275 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6276 break;
6277 case TARGET_MAX_OBJECTS:
6278 ss << "target_max_objects: " << p->target_max_objects << "\n";
6279 break;
6280 case TARGET_MAX_BYTES:
6281 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6282 break;
6283 case CACHE_TARGET_DIRTY_RATIO:
6284 ss << "cache_target_dirty_ratio: "
6285 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6286 break;
6287 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6288 ss << "cache_target_dirty_high_ratio: "
6289 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6290 break;
6291 case CACHE_TARGET_FULL_RATIO:
6292 ss << "cache_target_full_ratio: "
6293 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6294 break;
6295 case CACHE_MIN_FLUSH_AGE:
6296 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6297 break;
6298 case CACHE_MIN_EVICT_AGE:
6299 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6300 break;
6301 case ERASURE_CODE_PROFILE:
6302 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6303 break;
6304 case MIN_READ_RECENCY_FOR_PROMOTE:
6305 ss << "min_read_recency_for_promote: " <<
6306 p->min_read_recency_for_promote << "\n";
6307 break;
6308 case HIT_SET_GRADE_DECAY_RATE:
6309 ss << "hit_set_grade_decay_rate: " <<
6310 p->hit_set_grade_decay_rate << "\n";
6311 break;
6312 case HIT_SET_SEARCH_LAST_N:
6313 ss << "hit_set_search_last_n: " <<
6314 p->hit_set_search_last_n << "\n";
6315 break;
6316 case EC_OVERWRITES:
6317 ss << "allow_ec_overwrites: " <<
6318 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6319 "\n";
6320 break;
6321 case HASHPSPOOL:
6322 case NODELETE:
6323 case NOPGCHANGE:
6324 case NOSIZECHANGE:
6325 case WRITE_FADVISE_DONTNEED:
6326 case NOSCRUB:
6327 case NODEEP_SCRUB:
6328 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6329 if (i->second == *it)
6330 break;
6331 }
6332 ceph_assert(i != ALL_CHOICES.end());
6333 ss << i->first << ": " <<
6334 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6335 "true" : "false") << "\n";
6336 break;
6337 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6338 ss << "min_write_recency_for_promote: " <<
6339 p->min_write_recency_for_promote << "\n";
6340 break;
6341 case FAST_READ:
6342 ss << "fast_read: " << p->fast_read << "\n";
6343 break;
6344 case SCRUB_MIN_INTERVAL:
6345 case SCRUB_MAX_INTERVAL:
6346 case DEEP_SCRUB_INTERVAL:
6347 case RECOVERY_PRIORITY:
6348 case RECOVERY_OP_PRIORITY:
6349 case SCRUB_PRIORITY:
6350 case COMPRESSION_MODE:
6351 case COMPRESSION_ALGORITHM:
6352 case COMPRESSION_REQUIRED_RATIO:
6353 case COMPRESSION_MAX_BLOB_SIZE:
6354 case COMPRESSION_MIN_BLOB_SIZE:
6355 case CSUM_TYPE:
6356 case CSUM_MAX_BLOCK:
6357 case CSUM_MIN_BLOCK:
6358 case FINGERPRINT_ALGORITHM:
6359 case PG_NUM_MIN:
6360 case TARGET_SIZE_BYTES:
6361 case TARGET_SIZE_RATIO:
6362 case PG_AUTOSCALE_BIAS:
6363 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6364 if (i->second == *it)
6365 break;
6366 }
6367 ceph_assert(i != ALL_CHOICES.end());
6368 {
6369 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6370 if (p->opts.is_set(key)) {
6371 if(key == pool_opts_t::CSUM_TYPE) {
6372 int64_t val;
6373 p->opts.get(key, &val);
6374 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6375 } else {
6376 ss << i->first << ": " << p->opts.get(key) << "\n";
6377 }
6378 }
6379 }
6380 break;
6381 }
6382 rdata.append(ss.str());
6383 ss.str("");
6384 }
6385 }
6386 r = 0;
6387 } else if (prefix == "osd pool get-quota") {
6388 string pool_name;
6389 cmd_getval(cmdmap, "pool", pool_name);
6390
6391 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6392 if (poolid < 0) {
6393 ceph_assert(poolid == -ENOENT);
6394 ss << "unrecognized pool '" << pool_name << "'";
6395 r = -ENOENT;
6396 goto reply;
6397 }
6398 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6399 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6400 const object_stat_sum_t& sum = pstat->stats.sum;
6401 if (f) {
6402 f->open_object_section("pool_quotas");
6403 f->dump_string("pool_name", pool_name);
6404 f->dump_unsigned("pool_id", poolid);
6405 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6406 f->dump_int("current_num_objects", sum.num_objects);
6407 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6408 f->dump_int("current_num_bytes", sum.num_bytes);
6409 f->close_section();
6410 f->flush(rdata);
6411 } else {
6412 stringstream rs;
6413 rs << "quotas for pool '" << pool_name << "':\n"
6414 << " max objects: ";
6415 if (p->quota_max_objects == 0)
6416 rs << "N/A";
6417 else {
6418 rs << si_u_t(p->quota_max_objects) << " objects";
6419 rs << " (current num objects: " << sum.num_objects << " objects)";
6420 }
6421 rs << "\n"
6422 << " max bytes : ";
6423 if (p->quota_max_bytes == 0)
6424 rs << "N/A";
6425 else {
6426 rs << byte_u_t(p->quota_max_bytes);
6427 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6428 }
6429 rdata.append(rs.str());
6430 }
6431 rdata.append("\n");
6432 r = 0;
6433 } else if (prefix == "osd crush rule list" ||
6434 prefix == "osd crush rule ls") {
6435 if (f) {
6436 f->open_array_section("rules");
6437 osdmap.crush->list_rules(f.get());
6438 f->close_section();
6439 f->flush(rdata);
6440 } else {
6441 ostringstream ss;
6442 osdmap.crush->list_rules(&ss);
6443 rdata.append(ss.str());
6444 }
6445 } else if (prefix == "osd crush rule ls-by-class") {
6446 string class_name;
6447 cmd_getval(cmdmap, "class", class_name);
6448 if (class_name.empty()) {
6449 ss << "no class specified";
6450 r = -EINVAL;
6451 goto reply;
6452 }
6453 set<int> rules;
6454 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6455 if (r < 0) {
6456 ss << "failed to get rules by class '" << class_name << "'";
6457 goto reply;
6458 }
6459 if (f) {
6460 f->open_array_section("rules");
6461 for (auto &rule: rules) {
6462 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6463 }
6464 f->close_section();
6465 f->flush(rdata);
6466 } else {
6467 ostringstream rs;
6468 for (auto &rule: rules) {
6469 rs << osdmap.crush->get_rule_name(rule) << "\n";
6470 }
6471 rdata.append(rs.str());
6472 }
6473 } else if (prefix == "osd crush rule dump") {
6474 string name;
6475 cmd_getval(cmdmap, "name", name);
6476 string format;
6477 cmd_getval(cmdmap, "format", format);
6478 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6479 if (name == "") {
6480 f->open_array_section("rules");
6481 osdmap.crush->dump_rules(f.get());
6482 f->close_section();
6483 } else {
6484 int ruleno = osdmap.crush->get_rule_id(name);
6485 if (ruleno < 0) {
6486 ss << "unknown crush rule '" << name << "'";
6487 r = ruleno;
6488 goto reply;
6489 }
6490 osdmap.crush->dump_rule(ruleno, f.get());
6491 }
6492 ostringstream rs;
6493 f->flush(rs);
6494 rs << "\n";
6495 rdata.append(rs.str());
6496 } else if (prefix == "osd crush dump") {
6497 string format;
6498 cmd_getval(cmdmap, "format", format);
6499 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6500 f->open_object_section("crush_map");
6501 osdmap.crush->dump(f.get());
6502 f->close_section();
6503 ostringstream rs;
6504 f->flush(rs);
6505 rs << "\n";
6506 rdata.append(rs.str());
6507 } else if (prefix == "osd crush show-tunables") {
6508 string format;
6509 cmd_getval(cmdmap, "format", format);
6510 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6511 f->open_object_section("crush_map_tunables");
6512 osdmap.crush->dump_tunables(f.get());
6513 f->close_section();
6514 ostringstream rs;
6515 f->flush(rs);
6516 rs << "\n";
6517 rdata.append(rs.str());
6518 } else if (prefix == "osd crush tree") {
6519 string shadow;
6520 cmd_getval(cmdmap, "shadow", shadow);
6521 bool show_shadow = shadow == "--show-shadow";
6522 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6523 if (f) {
6524 f->open_object_section("crush_tree");
6525 osdmap.crush->dump_tree(nullptr,
6526 f.get(),
6527 osdmap.get_pool_names(),
6528 show_shadow);
6529 f->close_section();
6530 f->flush(rdata);
6531 } else {
6532 ostringstream ss;
6533 osdmap.crush->dump_tree(&ss,
6534 nullptr,
6535 osdmap.get_pool_names(),
6536 show_shadow);
6537 rdata.append(ss.str());
6538 }
6539 } else if (prefix == "osd crush ls") {
6540 string name;
6541 if (!cmd_getval(cmdmap, "node", name)) {
6542 ss << "no node specified";
6543 r = -EINVAL;
6544 goto reply;
6545 }
6546 if (!osdmap.crush->name_exists(name)) {
6547 ss << "node '" << name << "' does not exist";
6548 r = -ENOENT;
6549 goto reply;
6550 }
6551 int id = osdmap.crush->get_item_id(name);
6552 list<int> result;
6553 if (id >= 0) {
6554 result.push_back(id);
6555 } else {
6556 int num = osdmap.crush->get_bucket_size(id);
6557 for (int i = 0; i < num; ++i) {
6558 result.push_back(osdmap.crush->get_bucket_item(id, i));
6559 }
6560 }
6561 if (f) {
6562 f->open_array_section("items");
6563 for (auto i : result) {
6564 f->dump_string("item", osdmap.crush->get_item_name(i));
6565 }
6566 f->close_section();
6567 f->flush(rdata);
6568 } else {
6569 ostringstream ss;
6570 for (auto i : result) {
6571 ss << osdmap.crush->get_item_name(i) << "\n";
6572 }
6573 rdata.append(ss.str());
6574 }
6575 r = 0;
6576 } else if (prefix == "osd crush class ls") {
6577 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6578 f->open_array_section("crush_classes");
6579 for (auto i : osdmap.crush->class_name)
6580 f->dump_string("class", i.second);
6581 f->close_section();
6582 f->flush(rdata);
6583 } else if (prefix == "osd crush class ls-osd") {
6584 string name;
6585 cmd_getval(cmdmap, "class", name);
6586 set<int> osds;
6587 osdmap.crush->get_devices_by_class(name, &osds);
6588 if (f) {
6589 f->open_array_section("osds");
6590 for (auto &osd: osds)
6591 f->dump_int("osd", osd);
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 bool first = true;
6596 for (auto &osd : osds) {
6597 if (!first)
6598 ds << "\n";
6599 first = false;
6600 ds << osd;
6601 }
6602 rdata.append(ds);
6603 }
6604 } else if (prefix == "osd crush get-device-class") {
6605 vector<string> idvec;
6606 cmd_getval(cmdmap, "ids", idvec);
6607 map<int, string> class_by_osd;
6608 for (auto& id : idvec) {
6609 ostringstream ts;
6610 long osd = parse_osd_id(id.c_str(), &ts);
6611 if (osd < 0) {
6612 ss << "unable to parse osd id:'" << id << "'";
6613 r = -EINVAL;
6614 goto reply;
6615 }
6616 auto device_class = osdmap.crush->get_item_class(osd);
6617 if (device_class)
6618 class_by_osd[osd] = device_class;
6619 else
6620 class_by_osd[osd] = ""; // no class
6621 }
6622 if (f) {
6623 f->open_array_section("osd_device_classes");
6624 for (auto& i : class_by_osd) {
6625 f->open_object_section("osd_device_class");
6626 f->dump_int("osd", i.first);
6627 f->dump_string("device_class", i.second);
6628 f->close_section();
6629 }
6630 f->close_section();
6631 f->flush(rdata);
6632 } else {
6633 if (class_by_osd.size() == 1) {
6634 // for single input, make a clean output
6635 ds << class_by_osd.begin()->second;
6636 } else {
6637 // note that we do not group osds by class here
6638 for (auto it = class_by_osd.begin();
6639 it != class_by_osd.end();
6640 it++) {
6641 ds << "osd." << it->first << ' ' << it->second;
6642 if (next(it) != class_by_osd.end())
6643 ds << '\n';
6644 }
6645 }
6646 rdata.append(ds);
6647 }
6648 } else if (prefix == "osd erasure-code-profile ls") {
6649 const auto &profiles = osdmap.get_erasure_code_profiles();
6650 if (f)
6651 f->open_array_section("erasure-code-profiles");
6652 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6653 if (f)
6654 f->dump_string("profile", i->first.c_str());
6655 else
6656 rdata.append(i->first + "\n");
6657 }
6658 if (f) {
6659 f->close_section();
6660 ostringstream rs;
6661 f->flush(rs);
6662 rs << "\n";
6663 rdata.append(rs.str());
6664 }
6665 } else if (prefix == "osd crush weight-set ls") {
6666 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6667 if (f) {
6668 f->open_array_section("weight_sets");
6669 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6670 f->dump_string("pool", "(compat)");
6671 }
6672 for (auto& i : osdmap.crush->choose_args) {
6673 if (i.first >= 0) {
6674 f->dump_string("pool", osdmap.get_pool_name(i.first));
6675 }
6676 }
6677 f->close_section();
6678 f->flush(rdata);
6679 } else {
6680 ostringstream rs;
6681 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6682 rs << "(compat)\n";
6683 }
6684 for (auto& i : osdmap.crush->choose_args) {
6685 if (i.first >= 0) {
6686 rs << osdmap.get_pool_name(i.first) << "\n";
6687 }
6688 }
6689 rdata.append(rs.str());
6690 }
6691 } else if (prefix == "osd crush weight-set dump") {
6692 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6693 "json-pretty"));
6694 osdmap.crush->dump_choose_args(f.get());
6695 f->flush(rdata);
6696 } else if (prefix == "osd erasure-code-profile get") {
6697 string name;
6698 cmd_getval(cmdmap, "name", name);
6699 if (!osdmap.has_erasure_code_profile(name)) {
6700 ss << "unknown erasure code profile '" << name << "'";
6701 r = -ENOENT;
6702 goto reply;
6703 }
6704 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6705 if (f)
6706 f->open_object_section("profile");
6707 for (map<string,string>::const_iterator i = profile.begin();
6708 i != profile.end();
6709 ++i) {
6710 if (f)
6711 f->dump_string(i->first.c_str(), i->second.c_str());
6712 else
6713 rdata.append(i->first + "=" + i->second + "\n");
6714 }
6715 if (f) {
6716 f->close_section();
6717 ostringstream rs;
6718 f->flush(rs);
6719 rs << "\n";
6720 rdata.append(rs.str());
6721 }
6722 } else if (prefix == "osd pool application get") {
6723 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6724 "json-pretty"));
6725 string pool_name;
6726 cmd_getval(cmdmap, "pool", pool_name);
6727 string app;
6728 cmd_getval(cmdmap, "app", app);
6729 string key;
6730 cmd_getval(cmdmap, "key", key);
6731
6732 if (pool_name.empty()) {
6733 // all
6734 f->open_object_section("pools");
6735 for (const auto &pool : osdmap.pools) {
6736 std::string name("<unknown>");
6737 const auto &pni = osdmap.pool_name.find(pool.first);
6738 if (pni != osdmap.pool_name.end())
6739 name = pni->second;
6740 f->open_object_section(name.c_str());
6741 for (auto &app_pair : pool.second.application_metadata) {
6742 f->open_object_section(app_pair.first.c_str());
6743 for (auto &kv_pair : app_pair.second) {
6744 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6745 }
6746 f->close_section();
6747 }
6748 f->close_section(); // name
6749 }
6750 f->close_section(); // pools
6751 f->flush(rdata);
6752 } else {
6753 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6754 if (pool < 0) {
6755 ss << "unrecognized pool '" << pool_name << "'";
6756 r = -ENOENT;
6757 goto reply;
6758 }
6759 auto p = osdmap.get_pg_pool(pool);
6760 // filter by pool
6761 if (app.empty()) {
6762 f->open_object_section(pool_name.c_str());
6763 for (auto &app_pair : p->application_metadata) {
6764 f->open_object_section(app_pair.first.c_str());
6765 for (auto &kv_pair : app_pair.second) {
6766 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6767 }
6768 f->close_section(); // application
6769 }
6770 f->close_section(); // pool_name
6771 f->flush(rdata);
6772 goto reply;
6773 }
6774
6775 auto app_it = p->application_metadata.find(app);
6776 if (app_it == p->application_metadata.end()) {
6777 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6778 r = -ENOENT;
6779 goto reply;
6780 }
6781 // filter by pool + app
6782 if (key.empty()) {
6783 f->open_object_section(app_it->first.c_str());
6784 for (auto &kv_pair : app_it->second) {
6785 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6786 }
6787 f->close_section(); // application
6788 f->flush(rdata);
6789 goto reply;
6790 }
6791 // filter by pool + app + key
6792 auto key_it = app_it->second.find(key);
6793 if (key_it == app_it->second.end()) {
6794 ss << "application '" << app << "' on pool '" << pool_name
6795 << "' does not have key '" << key << "'";
6796 r = -ENOENT;
6797 goto reply;
6798 }
6799 ss << key_it->second << "\n";
6800 rdata.append(ss.str());
6801 ss.str("");
6802 }
6803 } else if (prefix == "osd get-require-min-compat-client") {
6804 ss << osdmap.require_min_compat_client << std::endl;
6805 rdata.append(ss.str());
6806 ss.str("");
6807 goto reply;
6808 } else if (prefix == "osd pool application enable" ||
6809 prefix == "osd pool application disable" ||
6810 prefix == "osd pool application set" ||
6811 prefix == "osd pool application rm") {
6812 bool changed = false;
6813 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6814 if (r != 0) {
6815 // Error, reply.
6816 goto reply;
6817 } else if (changed) {
6818 // Valid mutation, proceed to prepare phase
6819 return false;
6820 } else {
6821 // Idempotent case, reply
6822 goto reply;
6823 }
6824 } else {
6825 // try prepare update
6826 return false;
6827 }
6828
6829 reply:
6830 string rs;
6831 getline(ss, rs);
6832 mon->reply_command(op, r, rs, rdata, get_last_committed());
6833 return true;
6834 }
6835
6836 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6837 {
6838 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6839 osdmap.get_pg_pool(pool_id));
6840 ceph_assert(pool);
6841 pool->set_flag(flags);
6842 }
6843
6844 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6845 {
6846 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6847 osdmap.get_pg_pool(pool_id));
6848 ceph_assert(pool);
6849 pool->unset_flag(flags);
6850 }
6851
6852 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6853 {
6854 char k[80];
6855 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6856 return k;
6857 }
6858
6859 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6860 {
6861 char k[80];
6862 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6863 (unsigned long long)pool, (unsigned long long)snap);
6864 return k;
6865 }
6866
6867 string OSDMonitor::make_purged_snap_key_value(
6868 int64_t pool, snapid_t snap, snapid_t num,
6869 epoch_t epoch, bufferlist *v)
6870 {
6871 // encode the *last* epoch in the key so that we can use forward
6872 // iteration only to search for an epoch in an interval.
6873 encode(snap, *v);
6874 encode(snap + num, *v);
6875 encode(epoch, *v);
6876 return make_purged_snap_key(pool, snap + num - 1);
6877 }
6878
6879
// Look up the purged-snap interval (if any) containing 'snap' in 'pool'.
//
// Interval records are keyed by the *last* snapid of the interval (see
// make_purged_snap_key_value), so a lower_bound on the key for 'snap'
// lands on the only record whose interval could contain it.
//
// On success returns 0 and fills *begin/*end with the half-open
// interval [begin, end) that covers 'snap'.  Returns -ENOENT when no
// stored interval covers it.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // no key at or after 'k': nothing can cover this snap
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // lower_bound ran past the purged_snap_* keys into another key family
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // landed on a record belonging to a different pool
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value holds the encoded [begin, end) bounds (epoch follows, unused here)
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // record exists but its interval does not overlap 'snap'
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
6929
// Record that snaps [start, end) in 'pool' were purged, merging the new
// interval with any adjacent purged-snap records already in the store so
// that exactly one record covers each maximal contiguous run.
//
// NOTE(review): the 'epoch' parameter is never referenced below; every
// branch encodes pending_inc.epoch instead — confirm whether callers
// always pass pending_inc.epoch or whether 'epoch' should be used.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // Is there an existing record ending exactly at 'start' (covers start-1)?
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // Is there an existing record beginning exactly at 'end' (covers end)?
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // New interval bridges two existing records: merge all three into one.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // Extends an earlier record forward: replace it with the union.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // Extends a later record backward.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    // (keys embed the run's last snap, so the merged record lands on the
    // same key as the old 'after' record and put() replaces it in place)
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // No adjacent records: write a fresh one.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
6985
// Move snaps that the OSDs (via the mgr digest) report as purged out of
// the removed_snaps_queue, recording them as pruned in pending_inc.
// Returns true if anything was actually pruned this round.
bool OSDMonitor::try_prune_purged_snaps()
{
  // Need a fresh mgr digest to know what the OSDs have purged.
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  // Cap per-epoch work; 0 in config means "effectively unlimited".
  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    // Track the projected total so we can stop mid-pool at the cap.
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7064
7065 bool OSDMonitor::update_pools_status()
7066 {
7067 if (!mon->mgrstatmon()->is_readable())
7068 return false;
7069
7070 bool ret = false;
7071
7072 auto& pools = osdmap.get_pools();
7073 for (auto it = pools.begin(); it != pools.end(); ++it) {
7074 const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
7075 if (!pstat)
7076 continue;
7077 const object_stat_sum_t& sum = pstat->stats.sum;
7078 const pg_pool_t &pool = it->second;
7079 const string& pool_name = osdmap.get_pool_name(it->first);
7080
7081 bool pool_is_full =
7082 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
7083 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
7084
7085 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7086 if (pool_is_full)
7087 continue;
7088
7089 mon->clog->info() << "pool '" << pool_name
7090 << "' no longer out of quota; removing NO_QUOTA flag";
7091 // below we cancel FLAG_FULL too, we'll set it again in
7092 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7093 clear_pool_flags(it->first,
7094 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7095 ret = true;
7096 } else {
7097 if (!pool_is_full)
7098 continue;
7099
7100 if (pool.quota_max_bytes > 0 &&
7101 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
7102 mon->clog->warn() << "pool '" << pool_name << "' is full"
7103 << " (reached quota's max_bytes: "
7104 << byte_u_t(pool.quota_max_bytes) << ")";
7105 }
7106 if (pool.quota_max_objects > 0 &&
7107 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
7108 mon->clog->warn() << "pool '" << pool_name << "' is full"
7109 << " (reached quota's max_objects: "
7110 << pool.quota_max_objects << ")";
7111 }
7112 // set both FLAG_FULL_QUOTA and FLAG_FULL
7113 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7114 // since FLAG_FULL should always take precedence
7115 set_pool_flags(it->first,
7116 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7117 clear_pool_flags(it->first,
7118 pg_pool_t::FLAG_NEARFULL |
7119 pg_pool_t::FLAG_BACKFILLFULL);
7120 ret = true;
7121 }
7122 }
7123 return ret;
7124 }
7125
7126 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7127 {
7128 op->mark_osdmon_event(__func__);
7129 auto m = op->get_req<MPoolOp>();
7130 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7131 MonSession *session = op->get_session();
7132 if (!session)
7133 return -EPERM;
7134 string erasure_code_profile;
7135 stringstream ss;
7136 string rule_name;
7137 int ret = 0;
7138 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7139 0, 0, 0, 0, 0, 0.0,
7140 erasure_code_profile,
7141 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7142 &ss);
7143
7144 if (ret < 0) {
7145 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7146 }
7147 return ret;
7148 }
7149
7150 int OSDMonitor::crush_rename_bucket(const string& srcname,
7151 const string& dstname,
7152 ostream *ss)
7153 {
7154 int ret;
7155 //
7156 // Avoid creating a pending crush if it does not already exists and
7157 // the rename would fail.
7158 //
7159 if (!_have_pending_crush()) {
7160 ret = _get_stable_crush().can_rename_bucket(srcname,
7161 dstname,
7162 ss);
7163 if (ret)
7164 return ret;
7165 }
7166
7167 CrushWrapper newcrush;
7168 _get_pending_crush(newcrush);
7169
7170 ret = newcrush.rename_bucket(srcname,
7171 dstname,
7172 ss);
7173 if (ret)
7174 return ret;
7175
7176 pending_inc.crush.clear();
7177 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7178 *ss << "renamed bucket " << srcname << " into " << dstname;
7179 return 0;
7180 }
7181
7182 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7183 {
7184 string replacement = "";
7185
7186 if (plugin == "jerasure_generic" ||
7187 plugin == "jerasure_sse3" ||
7188 plugin == "jerasure_sse4" ||
7189 plugin == "jerasure_neon") {
7190 replacement = "jerasure";
7191 } else if (plugin == "shec_generic" ||
7192 plugin == "shec_sse3" ||
7193 plugin == "shec_sse4" ||
7194 plugin == "shec_neon") {
7195 replacement = "shec";
7196 }
7197
7198 if (replacement != "") {
7199 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7200 << plugin << " that has been deprecated. Please use "
7201 << replacement << " instead." << dendl;
7202 }
7203 }
7204
7205 int OSDMonitor::normalize_profile(const string& profilename,
7206 ErasureCodeProfile &profile,
7207 bool force,
7208 ostream *ss)
7209 {
7210 ErasureCodeInterfaceRef erasure_code;
7211 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7212 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7213 check_legacy_ec_plugin(plugin->second, profilename);
7214 int err = instance.factory(plugin->second,
7215 g_conf().get_val<std::string>("erasure_code_dir"),
7216 profile, &erasure_code, ss);
7217 if (err) {
7218 return err;
7219 }
7220
7221 err = erasure_code->init(profile, ss);
7222 if (err) {
7223 return err;
7224 }
7225
7226 auto it = profile.find("stripe_unit");
7227 if (it != profile.end()) {
7228 string err_str;
7229 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7230 if (!err_str.empty()) {
7231 *ss << "could not parse stripe_unit '" << it->second
7232 << "': " << err_str << std::endl;
7233 return -EINVAL;
7234 }
7235 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7236 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7237 if (chunk_size != stripe_unit) {
7238 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7239 << "alignment. Would be padded to " << chunk_size
7240 << std::endl;
7241 return -EINVAL;
7242 }
7243 if ((stripe_unit % 4096) != 0 && !force) {
7244 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7245 << "use --force to override this check" << std::endl;
7246 return -EINVAL;
7247 }
7248 }
7249 return 0;
7250 }
7251
7252 int OSDMonitor::crush_rule_create_erasure(const string &name,
7253 const string &profile,
7254 int *rule,
7255 ostream *ss)
7256 {
7257 int ruleid = osdmap.crush->get_rule_id(name);
7258 if (ruleid != -ENOENT) {
7259 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7260 return -EEXIST;
7261 }
7262
7263 CrushWrapper newcrush;
7264 _get_pending_crush(newcrush);
7265
7266 ruleid = newcrush.get_rule_id(name);
7267 if (ruleid != -ENOENT) {
7268 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7269 return -EALREADY;
7270 } else {
7271 ErasureCodeInterfaceRef erasure_code;
7272 int err = get_erasure_code(profile, &erasure_code, ss);
7273 if (err) {
7274 *ss << "failed to load plugin using profile " << profile << std::endl;
7275 return err;
7276 }
7277
7278 err = erasure_code->create_rule(name, newcrush, ss);
7279 erasure_code.reset();
7280 if (err < 0)
7281 return err;
7282 *rule = err;
7283 pending_inc.crush.clear();
7284 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7285 return 0;
7286 }
7287 }
7288
7289 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7290 ErasureCodeInterfaceRef *erasure_code,
7291 ostream *ss) const
7292 {
7293 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7294 return -EAGAIN;
7295 ErasureCodeProfile profile =
7296 osdmap.get_erasure_code_profile(erasure_code_profile);
7297 ErasureCodeProfile::const_iterator plugin =
7298 profile.find("plugin");
7299 if (plugin == profile.end()) {
7300 *ss << "cannot determine the erasure code plugin"
7301 << " because there is no 'plugin' entry in the erasure_code_profile "
7302 << profile << std::endl;
7303 return -EINVAL;
7304 }
7305 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7306 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7307 return instance.factory(plugin->second,
7308 g_conf().get_val<std::string>("erasure_code_dir"),
7309 profile, erasure_code, ss);
7310 }
7311
7312 int OSDMonitor::check_cluster_features(uint64_t features,
7313 stringstream &ss)
7314 {
7315 stringstream unsupported_ss;
7316 int unsupported_count = 0;
7317 if ((mon->get_quorum_con_features() & features) != features) {
7318 unsupported_ss << "the monitor cluster";
7319 ++unsupported_count;
7320 }
7321
7322 set<int32_t> up_osds;
7323 osdmap.get_up_osds(up_osds);
7324 for (set<int32_t>::iterator it = up_osds.begin();
7325 it != up_osds.end(); ++it) {
7326 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7327 if ((xi.features & features) != features) {
7328 if (unsupported_count > 0)
7329 unsupported_ss << ", ";
7330 unsupported_ss << "osd." << *it;
7331 unsupported_count ++;
7332 }
7333 }
7334
7335 if (unsupported_count > 0) {
7336 ss << "features " << features << " unsupported by: "
7337 << unsupported_ss.str();
7338 return -ENOTSUP;
7339 }
7340
7341 // check pending osd state, too!
7342 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7343 pending_inc.new_xinfo.begin();
7344 p != pending_inc.new_xinfo.end(); ++p) {
7345 const osd_xinfo_t &xi = p->second;
7346 if ((xi.features & features) != features) {
7347 dout(10) << __func__ << " pending osd." << p->first
7348 << " features are insufficient; retry" << dendl;
7349 return -EAGAIN;
7350 }
7351 }
7352
7353 return 0;
7354 }
7355
7356 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7357 stringstream& ss)
7358 {
7359 OSDMap::Incremental new_pending = pending_inc;
7360 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7361 OSDMap newmap;
7362 newmap.deepish_copy_from(osdmap);
7363 newmap.apply_incremental(new_pending);
7364
7365 // client compat
7366 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7367 auto mv = newmap.get_min_compat_client();
7368 if (mv > newmap.require_min_compat_client) {
7369 ss << "new crush map requires client version " << mv
7370 << " but require_min_compat_client is "
7371 << newmap.require_min_compat_client;
7372 return false;
7373 }
7374 }
7375
7376 // osd compat
7377 uint64_t features =
7378 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7379 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7380 stringstream features_ss;
7381 int r = check_cluster_features(features, features_ss);
7382 if (r) {
7383 ss << "Could not change CRUSH: " << features_ss.str();
7384 return false;
7385 }
7386
7387 return true;
7388 }
7389
7390 bool OSDMonitor::erasure_code_profile_in_use(
7391 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7392 const string &profile,
7393 ostream *ss)
7394 {
7395 bool found = false;
7396 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7397 p != pools.end();
7398 ++p) {
7399 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7400 *ss << osdmap.pool_name[p->first] << " ";
7401 found = true;
7402 }
7403 }
7404 if (found) {
7405 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7406 }
7407 return found;
7408 }
7409
7410 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7411 map<string,string> *erasure_code_profile_map,
7412 ostream *ss)
7413 {
7414 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7415 get_json_str_map,
7416 *ss,
7417 erasure_code_profile_map,
7418 true);
7419 if (r)
7420 return r;
7421 ceph_assert((*erasure_code_profile_map).count("plugin"));
7422 string default_plugin = (*erasure_code_profile_map)["plugin"];
7423 map<string,string> user_map;
7424 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7425 i != erasure_code_profile.end();
7426 ++i) {
7427 size_t equal = i->find('=');
7428 if (equal == string::npos) {
7429 user_map[*i] = string();
7430 (*erasure_code_profile_map)[*i] = string();
7431 } else {
7432 const string key = i->substr(0, equal);
7433 equal++;
7434 const string value = i->substr(equal);
7435 if (key.find("ruleset-") == 0) {
7436 *ss << "property '" << key << "' is no longer supported; try "
7437 << "'crush-" << key.substr(8) << "' instead";
7438 return -EINVAL;
7439 }
7440 user_map[key] = value;
7441 (*erasure_code_profile_map)[key] = value;
7442 }
7443 }
7444
7445 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7446 (*erasure_code_profile_map) = user_map;
7447
7448 return 0;
7449 }
7450
7451 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7452 const string &erasure_code_profile,
7453 uint8_t repl_size,
7454 unsigned *size, unsigned *min_size,
7455 ostream *ss)
7456 {
7457 int err = 0;
7458 switch (pool_type) {
7459 case pg_pool_t::TYPE_REPLICATED:
7460 if (repl_size == 0) {
7461 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7462 }
7463 *size = repl_size;
7464 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7465 break;
7466 case pg_pool_t::TYPE_ERASURE:
7467 {
7468 ErasureCodeInterfaceRef erasure_code;
7469 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7470 if (err == 0) {
7471 *size = erasure_code->get_chunk_count();
7472 *min_size =
7473 erasure_code->get_data_chunk_count() +
7474 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7475 assert(*min_size <= *size);
7476 assert(*min_size >= erasure_code->get_data_chunk_count());
7477 }
7478 }
7479 break;
7480 default:
7481 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7482 err = -EINVAL;
7483 break;
7484 }
7485 return err;
7486 }
7487
7488 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7489 const string &erasure_code_profile,
7490 uint32_t *stripe_width,
7491 ostream *ss)
7492 {
7493 int err = 0;
7494 switch (pool_type) {
7495 case pg_pool_t::TYPE_REPLICATED:
7496 // ignored
7497 break;
7498 case pg_pool_t::TYPE_ERASURE:
7499 {
7500 ErasureCodeProfile profile =
7501 osdmap.get_erasure_code_profile(erasure_code_profile);
7502 ErasureCodeInterfaceRef erasure_code;
7503 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7504 if (err)
7505 break;
7506 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7507 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7508 auto it = profile.find("stripe_unit");
7509 if (it != profile.end()) {
7510 string err_str;
7511 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7512 ceph_assert(err_str.empty());
7513 }
7514 *stripe_width = data_chunks *
7515 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7516 }
7517 break;
7518 default:
7519 *ss << "prepare_pool_stripe_width: "
7520 << pool_type << " is not a known pool type";
7521 err = -EINVAL;
7522 break;
7523 }
7524 return err;
7525 }
7526
// Resolve (or create) the CRUSH rule for a new pool.
// If *crush_rule >= 0 it is taken as an explicit rule id and merely
// validated; otherwise a rule is selected by name (replicated) or
// created from the erasure-code profile (erasure). May return -EAGAIN
// when a newly created/pending rule must be proposed before the pool
// creation can proceed.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  // Rule is pending but not yet committed; same handling as a
	  // freshly created rule (fall through to return -EAGAIN).
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // Rule already committed: usable right now, not an error here.
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // Explicit rule id supplied by the caller: just verify it exists.
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7588
7589 int OSDMonitor::get_crush_rule(const string &rule_name,
7590 int *crush_rule,
7591 ostream *ss)
7592 {
7593 int ret;
7594 ret = osdmap.crush->get_rule_id(rule_name);
7595 if (ret != -ENOENT) {
7596 // found it, use it
7597 *crush_rule = ret;
7598 } else {
7599 CrushWrapper newcrush;
7600 _get_pending_crush(newcrush);
7601
7602 ret = newcrush.get_rule_id(rule_name);
7603 if (ret != -ENOENT) {
7604 // found it, wait for it to be proposed
7605 dout(20) << __func__ << ": rule " << rule_name
7606 << " try again" << dendl;
7607 return -EAGAIN;
7608 } else {
7609 // Cannot find it , return error
7610 *ss << "specified rule " << rule_name << " doesn't exist";
7611 return ret;
7612 }
7613 }
7614 return 0;
7615 }
7616
7617 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7618 {
7619 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7620 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7621 auto max_pgs = max_pgs_per_osd * num_osds;
7622 uint64_t projected = 0;
7623 if (pool < 0) {
7624 projected += pg_num * size;
7625 }
7626 for (const auto& i : osdmap.get_pools()) {
7627 if (i.first == pool) {
7628 projected += pg_num * size;
7629 } else {
7630 projected += i.second.get_pg_num_target() * i.second.get_size();
7631 }
7632 }
7633 if (projected > max_pgs) {
7634 if (pool >= 0) {
7635 *ss << "pool id " << pool;
7636 }
7637 *ss << " pg_num " << pg_num << " size " << size
7638 << " would mean " << projected
7639 << " total pgs, which exceeds max " << max_pgs
7640 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7641 << " * num_in_osds " << num_osds << ")";
7642 return -ERANGE;
7643 }
7644 return 0;
7645 }
7646
/**
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min Minimum pg_num the autoscaler may shrink the pool to
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes Expected pool size hint for the autoscaler, or 0
 * @param target_size_ratio Expected pool capacity ratio hint, or 0.0
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REPLICATED
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode autoscale mode ("on"/"off"/"warn"), or "" for default
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 ostream *ss)
{
  // --- argument validation and defaulting ---
  if (name.length() == 0)
    return -EINVAL;
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;  // default pgp_num of 0 means "track pg_num"
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve/create the CRUSH rule (may return -EAGAIN to retry) ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Optionally smoke-test the rule by mapping a handful of inputs in a
  // forked child (bounded by the mon lease so we can't stall quorum).
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
	     << duration << dendl;
  }

  // --- derive size/min_size and validate the PG budget ---
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // --- resolve the effective fast-read setting ---
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
    case FAST_READ_OFF:
      fread = false;
      break;
    case FAST_READ_ON:
      fread = true;
      break;
    case FAST_READ_DEFAULT:
      fread = g_conf()->osd_pool_default_ec_fast_read;
      break;
    default:
      *ss << "invalid fast_read setting: " << fast_read;
      return -EINVAL;
    }
  }

  // If a pool with this name is already pending creation, treat the
  // request as a no-op success (idempotent retry).
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate the new pool id and fill in the pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;

  // Autoscale mode: configured default, falling back to OFF.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Start with a capped pg_num and let pg_num_target drive growth to
  // the requested value.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  // An explicit per-pool autoscale mode overrides the default chosen above.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    pi->erasure_code_profile = erasure_code_profile;
  } else {
    pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (ratios stored in micro-units).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
7863
7864 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7865 {
7866 op->mark_osdmon_event(__func__);
7867 ostringstream ss;
7868 if (pending_inc.new_flags < 0)
7869 pending_inc.new_flags = osdmap.get_flags();
7870 pending_inc.new_flags |= flag;
7871 ss << OSDMap::get_flag_string(flag) << " is set";
7872 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7873 get_last_committed() + 1));
7874 return true;
7875 }
7876
7877 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7878 {
7879 op->mark_osdmon_event(__func__);
7880 ostringstream ss;
7881 if (pending_inc.new_flags < 0)
7882 pending_inc.new_flags = osdmap.get_flags();
7883 pending_inc.new_flags &= ~flag;
7884 ss << OSDMap::get_flag_string(flag) << " is unset";
7885 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7886 get_last_committed() + 1));
7887 return true;
7888 }
7889
/**
 * Handle "ceph osd pool set <pool> <var> <val>".
 *
 * Validates <val> against the semantics of <var> and stages the resulting
 * pool definition in pending_inc.new_pools.  Returns 0 on success (including
 * idempotent no-ops) or a negative errno; on error, @p ss carries a
 * human-readable explanation for the client.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", and for some
 *               variables "yes_i_really_mean_it")
 * @param ss     output stream for the reply/error text
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // Start from the committed pool, but fold in any change already staged in
  // this proposal cycle so successive "pool set" commands compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0;  // micro-f
  cmd_getval(cmdmap, "val", val);

  // Variables accepting SI (base-10, e.g. "1M") unit suffixes...
  auto si_options = {
    "target_max_objects"
  };
  // ...and those accepting IEC (base-2, e.g. "1Mi") unit suffixes.
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // These variables are only meaningful on a cache tier pool; refuse to set
  // them on a base pool.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // min_size is derived from the new size via the configured policy.
    p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
        ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
        return -EINVAL;
      }
    } else {
      // For EC pools the lower bound is k (the data chunk count), not 1.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
        k = erasure_code->get_data_chunk_count();
      } else {
        ss << __func__ << " get_erasure_code failed: " << tmp.str();
        return err;
      }

      if (n < k || n > p.size) {
        ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
        return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      if (p.get_pg_num() != p.get_pg_num_pending()) {
        // force pre-nautilus clients to resend their ops, since they
        // don't understand pg_num_pending changes form a new interval
        p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // Decreases go through pg_num_pending (PG merge), nautilus+ only.
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
        ss << "nautilus OSDs are required to adjust pg_num_pending";
        return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
        ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
        return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
        ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
           << ") - 1; only single pg decrease is currently supported";
        return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
        return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
        ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
        return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
        ss << "nautilus OSDs are required to decrease pg_num";
        return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
        p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
         << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
             var == "nosizechange" || var == "write_fadvise_dontneed" ||
             var == "noscrub" || var == "nodeep-scrub") {
    // Boolean pool flags that can be toggled without extra confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // Toggling hashpspool remaps every PG, so require explicit confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE? this will remap all placement groups in this pool,"
            " this triggers large data movement,"
            " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      // Non-trivial hit set types require cluster-wide cache pool support.
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
        return err;
      if (val == "bloom") {
        BloomHitSet::Params *bsp = new BloomHitSet::Params;
        bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
        p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
        p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
        p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
        ss << "unrecognized hit_set type '" << val << "'";
        return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // One-way switch: only enabling is supported here.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
        !is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // One-way switch: the flag can be set but never cleared.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // stored in micro units (uf = f * 1e6, computed above)
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic key/value pool options: per-option validation first, then a
    // type-driven set/unset via the shared opt_desc_t switch below.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
        auto cmode = Compressor::get_comp_mode_type(val);
        if (!cmode) {
          ss << "unrecognized compression mode '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
        auto alg = Compressor::get_comp_alg_type(val);
        if (!alg) {
          ss << "unrecognized compression_algorithm '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
        ss << "error parsing float value '" << val << "': " << floaterr;
        return -EINVAL;
      }
      if (f < 0 || f > 1) {
        ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
        return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
        ss << "unrecognized csum_type '" << val << "'";
        return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
               var == "compression_min_blob_size" ||
               var == "csum_max_block" ||
               var == "csum_min_block") {
      if (interr.length()) {
        ss << "error parsing int value '" << val << "': " << interr;
        return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
        auto alg = pg_pool_t::get_fingerprint_from_str(val);
        if (!alg) {
          ss << "unrecognized fingerprint_algorithm '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
        ss << "error parsing unit value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
        ss << "must set require_osd_release to nautilus or "
           << "later before setting target_size_bytes";
        return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
        ss << "error parsing int value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
        ss << "specified pg_num_min " << n
           << " > pg_num " << p.get_pg_num_target();
        return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
        ss << "error parsing int value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
        if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
          ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
             << " and " << OSD_POOL_PRIORITY_MAX;
          return -EINVAL;
        }
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
        ss << "pg_autoscale_bias must be between 0 and 1000";
        return -EINVAL;
      }
    }

    // Apply the (now validated) value according to the option's declared
    // type.  NOTE: a value of 0 (or 0.0) unsets the option rather than
    // storing an explicit zero.
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
        ss << "error parsing integer value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (n == 0) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
        ss << "error parsing floating point value '" << val << "': " << floaterr;
        return -EINVAL;
      }
      if (f == 0) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // Stage the updated pool and bump last_change so consumers notice.
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8535
8536 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8537 const cmdmap_t& cmdmap,
8538 stringstream& ss)
8539 {
8540 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8541 }
8542
8543 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8544 const cmdmap_t& cmdmap,
8545 stringstream& ss,
8546 bool *modified)
8547 {
8548 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8549 }
8550
8551
/**
 * Common logic for preprocess and prepare phases of pool application
 * tag commands.  In preprocess mode we're only detecting invalid
 * commands, and determining whether it was a modification or a no-op.
 * In prepare mode we're actually updating the pending state.
 *
 * @param prefix    full command prefix; the suffix ("enable", "disable",
 *                  "set", "rm") selects the operation
 * @param cmdmap    parsed command arguments ("pool", "app", "key", "value",
 *                  optionally "yes_i_really_mean_it")
 * @param ss        output stream for reply/error text
 * @param modified  if non-null, set to true when the command is a real
 *                  modification (no-op paths return before reaching it)
 * @param preparing true for the prepare phase (stage into pending_inc)
 * @return 0 on success or no-op, negative errno on error
 */
int OSDMonitor::_command_pool_application(const string &prefix,
                                          const cmdmap_t& cmdmap,
                                          stringstream& ss,
                                          bool *modified,
                                          bool preparing)
{
  string pool_name;
  cmd_getval(cmdmap, "pool", pool_name);
  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    return -ENOENT;
  }

  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (preparing) {
    // fold in any change to this pool already staged in this proposal cycle
    if (pending_inc.new_pools.count(pool)) {
      p = pending_inc.new_pools[pool];
    }
  }

  string app;
  cmd_getval(cmdmap, "app", app);
  bool app_exists = (p.application_metadata.count(app) > 0);

  // "all" is reserved (used as a wildcard elsewhere), so reject it as a
  // literal key or value.  key/value are fetched once here and reused by the
  // "set"/"rm" branches below (previously they were redundantly re-fetched
  // into shadowing locals).
  string key;
  cmd_getval(cmdmap, "key", key);
  if (key == "all") {
    ss << "key cannot be 'all'";
    return -EINVAL;
  }

  string value;
  cmd_getval(cmdmap, "value", value);
  if (value == "all") {
    ss << "value cannot be 'all'";
    return -EINVAL;
  }

  if (boost::algorithm::ends_with(prefix, "enable")) {
    if (app.empty()) {
      ss << "application name must be provided";
      return -EINVAL;
    }

    if (p.is_tier()) {
      ss << "application must be enabled on base tier";
      return -EINVAL;
    }

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    // Enabling a second application is usually a mistake; require the force
    // flag when one is already present.
    if (!app_exists && !p.application_metadata.empty() && !force) {
      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
         << "application; pass --yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
      ss << "too many enabled applications on pool '" << pool_name << "'; "
         << "max " << MAX_POOL_APPLICATIONS;
      return -EINVAL;
    }

    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "application name '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (!app_exists) {
      p.application_metadata[app] = {};
    }
    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "disable")) {
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "Are you SURE? Disabling an application within a pool might result "
         << "in loss of application functionality; pass "
         << "--yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return 0; // idempotent
    }

    p.application_metadata.erase(app);
    ss << "disable application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "set")) {
    if (p.is_tier()) {
      ss << "application metadata must be set on base tier";
      return -EINVAL;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    if (key.empty()) {
      ss << "key must be provided";
      return -EINVAL;
    }

    auto &app_keys = p.application_metadata[app];
    if (app_keys.count(key) == 0 &&
        app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
      ss << "too many keys set for application '" << app << "' on pool '"
         << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
      return -EINVAL;
    }

    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
      // bug fix: report the offending key, not the application name
      ss << "key '" << key << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "value '" << value << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    p.application_metadata[app][key] = value;
    ss << "set application '" << app << "' key '" << key << "' to '"
       << value << "' on pool '" << pool_name << "'";
  } else if (boost::algorithm::ends_with(prefix, "rm")) {
    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    auto it = p.application_metadata[app].find(key);
    if (it == p.application_metadata[app].end()) {
      ss << "application '" << app << "' on pool '" << pool_name
         << "' does not have key '" << key << "'";
      return 0; // idempotent
    }

    p.application_metadata[app].erase(it);
    ss << "removed application '" << app << "' key '" << key << "' on pool '"
       << pool_name << "'";
  } else {
    ceph_abort();
  }

  if (preparing) {
    p.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool] = p;
  }

  // Because we fell through this far, we didn't hit no-op cases,
  // so pool was definitely modified
  if (modified != nullptr) {
    *modified = true;
  }

  return 0;
}
8735
8736 int OSDMonitor::_prepare_command_osd_crush_remove(
8737 CrushWrapper &newcrush,
8738 int32_t id,
8739 int32_t ancestor,
8740 bool has_ancestor,
8741 bool unlink_only)
8742 {
8743 int err = 0;
8744
8745 if (has_ancestor) {
8746 err = newcrush.remove_item_under(cct, id, ancestor,
8747 unlink_only);
8748 } else {
8749 err = newcrush.remove_item(cct, id, unlink_only);
8750 }
8751 return err;
8752 }
8753
// Stage an already-modified crush map into the pending incremental: any
// previously staged crush blob is discarded and replaced with the encoding
// of |newcrush| (using the quorum's connection feature bits).
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8759
8760 int OSDMonitor::prepare_command_osd_crush_remove(
8761 CrushWrapper &newcrush,
8762 int32_t id,
8763 int32_t ancestor,
8764 bool has_ancestor,
8765 bool unlink_only)
8766 {
8767 int err = _prepare_command_osd_crush_remove(
8768 newcrush, id, ancestor,
8769 has_ancestor, unlink_only);
8770
8771 if (err < 0)
8772 return err;
8773
8774 ceph_assert(err == 0);
8775 do_osd_crush_remove(newcrush);
8776
8777 return 0;
8778 }
8779
8780 int OSDMonitor::prepare_command_osd_remove(int32_t id)
8781 {
8782 if (osdmap.is_up(id)) {
8783 return -EBUSY;
8784 }
8785
8786 pending_inc.new_state[id] = osdmap.get_state(id);
8787 pending_inc.new_uuid[id] = uuid_d();
8788 pending_metadata_rm.insert(id);
8789 pending_metadata.erase(id);
8790
8791 return 0;
8792 }
8793
8794 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8795 {
8796 ceph_assert(existing_id);
8797 *existing_id = -1;
8798
8799 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8800 if (!osdmap.exists(i) &&
8801 pending_inc.new_up_client.count(i) == 0 &&
8802 (pending_inc.new_state.count(i) == 0 ||
8803 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8804 *existing_id = i;
8805 return -1;
8806 }
8807 }
8808
8809 if (pending_inc.new_max_osd < 0) {
8810 return osdmap.get_max_osd();
8811 }
8812 return pending_inc.new_max_osd;
8813 }
8814
/**
 * Stage creation of an OSD in the pending incremental.
 *
 * Resolves the id to use: an existing id matching @p uuid, the explicitly
 * requested @p id, a reusable free slot, or a brand-new id past max_osd.
 * Optionally assigns @p device_class in the crush map.  The chosen id is
 * returned through @p new_id.
 *
 * We presume validation has been performed prior to calling this
 * function. We assert with prejudice.
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // a non-zero uuid may already be bound to an id (e.g. command replay)
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd (_allocate_osd_id returned -1)
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the requested device class on a scratch copy of the crush map
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8903
/**
 * Validate that an osd may be created with the given id and/or uuid.
 *
 * @param id                requested osd id, or negative for "any"
 * @param uuid              requested uuid; may be zero for legacy `osd create`
 * @param check_osd_exists  if true, an existing (non-destroyed) @p id is an
 *                          error; `osd new` passes false when recreating a
 *                          destroyed osd
 * @param existing_id       out: set to the osd already holding @p uuid when
 *                          this returns positive EEXIST
 *
 * @returns 0 if creation may proceed; positive EEXIST if the osd already
 *          exists and the request is idempotent; -EAGAIN if a pending
 *          (uncommitted) update already covers it; -EEXIST/-EINVAL on
 *          genuine conflicts.
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
	   << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
	 << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
8974
8975 int OSDMonitor::prepare_command_osd_create(
8976 const int32_t id,
8977 const uuid_d& uuid,
8978 int32_t* existing_id,
8979 stringstream& ss)
8980 {
8981 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8982 ceph_assert(existing_id);
8983 if (osdmap.is_destroyed(id)) {
8984 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8985 "instead.";
8986 return -EINVAL;
8987 }
8988
8989 if (uuid.is_zero()) {
8990 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8991 }
8992
8993 return validate_osd_create(id, uuid, true, existing_id, ss);
8994 }
8995
/**
 * Handle the `osd new` command: create a brand new osd or recreate a
 * previously destroyed one, optionally registering cephx/lockbox secrets
 * and a dm-crypt key.
 *
 * @param op      the monitor op carrying the command
 * @param cmdmap  parsed command arguments (`uuid` required, `id` optional)
 * @param params  secrets/extras from `-i`: cephx_secret,
 *                cephx_lockbox_secret, dmcrypt_key, crush_device_class
 * @param ss      human-readable output / error detail
 * @param f       optional formatter for structured output
 *
 * @returns 0 on a staged creation (caller must propose), positive EEXIST
 *          when the request is fully idempotent (nothing staged), or a
 *          negative errno on failure.
 *
 * Requires paxos to be plugged: this may touch osdmon, authmon and the
 * config-key service, and the caller coordinates the single proposal.
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // positive EEXIST from validation means "already there, idempotent";
  // negative errnos are real failures.
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    //       `osd create`, and we must honor it. So this means checking if
    //       the `id` is destroyed, and if so assume the destroy; otherwise,
    //       check if it `exists` - in which case we complain about not being
    //       `destroyed`. In the end, if nothing fails, we must allow the
    //       creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // allocator found a reusable hole instead of a fresh id
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key must be supplied together or not
    // at all.
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      // (new_state is applied as a toggle mask, so setting the UP bit
      // here clears the stale UP flag — see the comment above.)
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9260
9261 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9262 {
9263 op->mark_osdmon_event(__func__);
9264 auto m = op->get_req<MMonCommand>();
9265 stringstream ss;
9266 cmdmap_t cmdmap;
9267 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9268 string rs = ss.str();
9269 mon->reply_command(op, -EINVAL, rs, get_last_committed());
9270 return true;
9271 }
9272
9273 MonSession *session = op->get_session();
9274 if (!session) {
9275 derr << __func__ << " no session" << dendl;
9276 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
9277 return true;
9278 }
9279
9280 return prepare_command_impl(op, cmdmap);
9281 }
9282
9283 static int parse_reweights(CephContext *cct,
9284 const cmdmap_t& cmdmap,
9285 const OSDMap& osdmap,
9286 map<int32_t, uint32_t>* weights)
9287 {
9288 string weights_str;
9289 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9290 return -EINVAL;
9291 }
9292 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9293 json_spirit::mValue json_value;
9294 if (!json_spirit::read(weights_str, json_value)) {
9295 return -EINVAL;
9296 }
9297 if (json_value.type() != json_spirit::obj_type) {
9298 return -EINVAL;
9299 }
9300 const auto obj = json_value.get_obj();
9301 try {
9302 for (auto& osd_weight : obj) {
9303 auto osd_id = std::stoi(osd_weight.first);
9304 if (!osdmap.exists(osd_id)) {
9305 return -ENOENT;
9306 }
9307 if (osd_weight.second.type() != json_spirit::str_type) {
9308 return -EINVAL;
9309 }
9310 auto weight = std::stoul(osd_weight.second.get_str());
9311 weights->insert({osd_id, weight});
9312 }
9313 } catch (const std::logic_error& e) {
9314 return -EINVAL;
9315 }
9316 return 0;
9317 }
9318
/**
 * Stage destruction of an osd: revoke its auth entities and config keys,
 * then mark it DESTROYED (keeping the id) in the pending incremental.
 *
 * @param id  osd to destroy; must exist (else -ENOENT) and not be up
 * @param ss  human-readable output
 *
 * @returns 0 on success or when already destroyed (idempotent);
 *          -ENOENT if the osd does not exist; other negative errnos
 *          from auth validation.
 *
 * Requires paxos to be plugged; the caller is responsible for proposing
 * (see the trailing comment), since `osd purge` also calls this as one
 * step of a larger update.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
	   << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // -ENOENT from either validator means the secrets/keys are already
  // gone, so the corresponding do_* step can be skipped (idempotency).
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9390
/**
 * Stage a full purge of an osd: destroy it (auth + config keys + DESTROYED
 * flag), remove it from the osdmap, and remove it from crush.
 *
 * @param id  osd to purge; must not be up (asserted)
 * @param ss  human-readable output
 *
 * @returns 0 on success; -ENOENT if the osd is already fully gone
 *          (idempotent no-op); other negative errnos from the crush or
 *          destroy steps before any state has been touched.
 *
 * Requires paxos to be plugged; the caller proposes once at the end.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal (do_remove=false); the actual crush
  // update is applied last, once nothing can fail anymore.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  do_osd_crush_remove(newcrush);
  return 0;
}
9459
9460 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9461 const cmdmap_t& cmdmap)
9462 {
9463 op->mark_osdmon_event(__func__);
9464 auto m = op->get_req<MMonCommand>();
9465 bool ret = false;
9466 stringstream ss;
9467 string rs;
9468 bufferlist rdata;
9469 int err = 0;
9470
9471 string format;
9472 cmd_getval(cmdmap, "format", format, string("plain"));
9473 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9474
9475 string prefix;
9476 cmd_getval(cmdmap, "prefix", prefix);
9477
9478 int64_t osdid;
9479 string osd_name;
9480 bool osdid_present = false;
9481 if (prefix != "osd pg-temp" &&
9482 prefix != "osd pg-upmap" &&
9483 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9484 osdid_present = cmd_getval(cmdmap, "id", osdid);
9485 }
9486 if (osdid_present) {
9487 ostringstream oss;
9488 oss << "osd." << osdid;
9489 osd_name = oss.str();
9490 }
9491
9492 // Even if there's a pending state with changes that could affect
9493 // a command, considering that said state isn't yet committed, we
9494 // just don't care about those changes if the command currently being
9495 // handled acts as a no-op against the current committed state.
9496 // In a nutshell, we assume this command happens *before*.
9497 //
9498 // Let me make this clearer:
9499 //
9500 // - If we have only one client, and that client issues some
9501 // operation that would conflict with this operation but is
9502 // still on the pending state, then we would be sure that said
9503 // operation wouldn't have returned yet, so the client wouldn't
9504 // issue this operation (unless the client didn't wait for the
9505 // operation to finish, and that would be the client's own fault).
9506 //
9507 // - If we have more than one client, each client will observe
9508 // whatever is the state at the moment of the commit. So, if we
9509 // have two clients, one issuing an unlink and another issuing a
9510 // link, and if the link happens while the unlink is still on the
9511 // pending state, from the link's point-of-view this is a no-op.
9512 // If different clients are issuing conflicting operations and
9513 // they care about that, then the clients should make sure they
9514 // enforce some kind of concurrency mechanism -- from our
9515 // perspective that's what Douglas Adams would call an SEP.
9516 //
9517 // This should be used as a general guideline for most commands handled
9518 // in this function. Adapt as you see fit, but please bear in mind that
9519 // this is the expected behavior.
9520
9521
9522 if (prefix == "osd setcrushmap" ||
9523 (prefix == "osd crush set" && !osdid_present)) {
9524 if (pending_inc.crush.length()) {
9525 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9526 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9527 return true;
9528 }
9529 dout(10) << "prepare_command setting new crush map" << dendl;
9530 bufferlist data(m->get_data());
9531 CrushWrapper crush;
9532 try {
9533 auto bl = data.cbegin();
9534 crush.decode(bl);
9535 }
9536 catch (const std::exception &e) {
9537 err = -EINVAL;
9538 ss << "Failed to parse crushmap: " << e.what();
9539 goto reply;
9540 }
9541
9542 int64_t prior_version = 0;
9543 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9544 if (prior_version == osdmap.get_crush_version() - 1) {
9545 // see if we are a resend of the last update. this is imperfect
9546 // (multiple racing updaters may not both get reliable success)
9547 // but we expect crush updaters (via this interface) to be rare-ish.
9548 bufferlist current, proposed;
9549 osdmap.crush->encode(current, mon->get_quorum_con_features());
9550 crush.encode(proposed, mon->get_quorum_con_features());
9551 if (current.contents_equal(proposed)) {
9552 dout(10) << __func__
9553 << " proposed matches current and version equals previous"
9554 << dendl;
9555 err = 0;
9556 ss << osdmap.get_crush_version();
9557 goto reply;
9558 }
9559 }
9560 if (prior_version != osdmap.get_crush_version()) {
9561 err = -EPERM;
9562 ss << "prior_version " << prior_version << " != crush version "
9563 << osdmap.get_crush_version();
9564 goto reply;
9565 }
9566 }
9567
9568 if (crush.has_legacy_rule_ids()) {
9569 err = -EINVAL;
9570 ss << "crush maps with ruleset != ruleid are no longer allowed";
9571 goto reply;
9572 }
9573 if (!validate_crush_against_features(&crush, ss)) {
9574 err = -EINVAL;
9575 goto reply;
9576 }
9577
9578 err = osdmap.validate_crush_rules(&crush, &ss);
9579 if (err < 0) {
9580 goto reply;
9581 }
9582
9583 if (g_conf()->mon_osd_crush_smoke_test) {
9584 // sanity check: test some inputs to make sure this map isn't
9585 // totally broken
9586 dout(10) << " testing map" << dendl;
9587 stringstream ess;
9588 CrushTester tester(crush, ess);
9589 tester.set_min_x(0);
9590 tester.set_max_x(50);
9591 auto start = ceph::coarse_mono_clock::now();
9592 int r = tester.test_with_fork(g_conf()->mon_lease);
9593 auto duration = ceph::coarse_mono_clock::now() - start;
9594 if (r < 0) {
9595 dout(10) << " tester.test_with_fork returns " << r
9596 << ": " << ess.str() << dendl;
9597 ss << "crush smoke test failed with " << r << ": " << ess.str();
9598 err = r;
9599 goto reply;
9600 }
9601 dout(10) << __func__ << " crush somke test duration: "
9602 << duration << ", result: " << ess.str() << dendl;
9603 }
9604
9605 pending_inc.crush = data;
9606 ss << osdmap.get_crush_version() + 1;
9607 goto update;
9608
9609 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9610 CrushWrapper newcrush;
9611 _get_pending_crush(newcrush);
9612 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9613 int bid = -1 - b;
9614 if (newcrush.bucket_exists(bid) &&
9615 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9616 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9617 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9618 }
9619 }
9620 if (!validate_crush_against_features(&newcrush, ss)) {
9621 err = -EINVAL;
9622 goto reply;
9623 }
9624 pending_inc.crush.clear();
9625 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9626 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9627 get_last_committed() + 1));
9628 return true;
9629 } else if (prefix == "osd crush set-device-class") {
9630 string device_class;
9631 if (!cmd_getval(cmdmap, "class", device_class)) {
9632 err = -EINVAL; // no value!
9633 goto reply;
9634 }
9635
9636 bool stop = false;
9637 vector<string> idvec;
9638 cmd_getval(cmdmap, "ids", idvec);
9639 CrushWrapper newcrush;
9640 _get_pending_crush(newcrush);
9641 set<int> updated;
9642 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9643 set<int> osds;
9644 // wildcard?
9645 if (j == 0 &&
9646 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9647 osdmap.get_all_osds(osds);
9648 stop = true;
9649 } else {
9650 // try traditional single osd way
9651 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9652 if (osd < 0) {
9653 // ss has reason for failure
9654 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9655 err = -EINVAL;
9656 continue;
9657 }
9658 osds.insert(osd);
9659 }
9660
9661 for (auto &osd : osds) {
9662 if (!osdmap.exists(osd)) {
9663 ss << "osd." << osd << " does not exist. ";
9664 continue;
9665 }
9666
9667 ostringstream oss;
9668 oss << "osd." << osd;
9669 string name = oss.str();
9670
9671 if (newcrush.get_max_devices() < osd + 1) {
9672 newcrush.set_max_devices(osd + 1);
9673 }
9674 string action;
9675 if (newcrush.item_exists(osd)) {
9676 action = "updating";
9677 } else {
9678 action = "creating";
9679 newcrush.set_item_name(osd, name);
9680 }
9681
9682 dout(5) << action << " crush item id " << osd << " name '" << name
9683 << "' device_class '" << device_class << "'"
9684 << dendl;
9685 err = newcrush.update_device_class(osd, device_class, name, &ss);
9686 if (err < 0) {
9687 goto reply;
9688 }
9689 if (err == 0 && !_have_pending_crush()) {
9690 if (!stop) {
9691 // for single osd only, wildcard makes too much noise
9692 ss << "set-device-class item id " << osd << " name '" << name
9693 << "' device_class '" << device_class << "': no change. ";
9694 }
9695 } else {
9696 updated.insert(osd);
9697 }
9698 }
9699 }
9700
9701 if (!updated.empty()) {
9702 pending_inc.crush.clear();
9703 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9704 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9705 getline(ss, rs);
9706 wait_for_finished_proposal(op,
9707 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9708 return true;
9709 }
9710
9711 } else if (prefix == "osd crush rm-device-class") {
9712 bool stop = false;
9713 vector<string> idvec;
9714 cmd_getval(cmdmap, "ids", idvec);
9715 CrushWrapper newcrush;
9716 _get_pending_crush(newcrush);
9717 set<int> updated;
9718
9719 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9720 set<int> osds;
9721
9722 // wildcard?
9723 if (j == 0 &&
9724 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9725 osdmap.get_all_osds(osds);
9726 stop = true;
9727 } else {
9728 // try traditional single osd way
9729 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9730 if (osd < 0) {
9731 // ss has reason for failure
9732 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9733 err = -EINVAL;
9734 goto reply;
9735 }
9736 osds.insert(osd);
9737 }
9738
9739 for (auto &osd : osds) {
9740 if (!osdmap.exists(osd)) {
9741 ss << "osd." << osd << " does not exist. ";
9742 continue;
9743 }
9744
9745 auto class_name = newcrush.get_item_class(osd);
9746 if (!class_name) {
9747 ss << "osd." << osd << " belongs to no class, ";
9748 continue;
9749 }
9750 // note that we do not verify if class_is_in_use here
9751 // in case the device is misclassified and user wants
9752 // to overridely reset...
9753
9754 err = newcrush.remove_device_class(cct, osd, &ss);
9755 if (err < 0) {
9756 // ss has reason for failure
9757 goto reply;
9758 }
9759 updated.insert(osd);
9760 }
9761 }
9762
9763 if (!updated.empty()) {
9764 pending_inc.crush.clear();
9765 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9766 ss << "done removing class of osd(s): " << updated;
9767 getline(ss, rs);
9768 wait_for_finished_proposal(op,
9769 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9770 return true;
9771 }
9772 } else if (prefix == "osd crush class create") {
9773 string device_class;
9774 if (!cmd_getval(cmdmap, "class", device_class)) {
9775 err = -EINVAL; // no value!
9776 goto reply;
9777 }
9778 if (osdmap.require_osd_release < ceph_release_t::luminous) {
      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
         << "luminous' before using crush device classes";
      err = -EPERM;
      goto reply;
    }
    // Fast path: nothing pending and the committed map already has the
    // class -> reply immediately (idempotent no-op).
    if (!_have_pending_crush() &&
        _get_stable_crush().class_exists(device_class)) {
      ss << "class '" << device_class << "' already exists";
      goto reply;
    }
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.class_exists(device_class)) {
      // already created in the pending increment; wait for it to commit
      ss << "class '" << device_class << "' already exists";
      goto update;
    }
    int class_id = newcrush.get_or_create_class_id(device_class);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "created class " << device_class << " with id " << class_id
       << " to crush map";
    goto update;
  } else if (prefix == "osd crush class rm") {
    // Remove a CRUSH device class.  Refused while the class is still
    // referenced by any crush rule or erasure-code profile; otherwise the
    // class is stripped from every member OSD (or the bare class name is
    // removed when the class has no members) and the new map is proposed.
    string device_class;
    if (!cmd_getval(cmdmap, "class", device_class)) {
      err = -EINVAL; // no value!
      goto reply;
    }
    if (osdmap.require_osd_release < ceph_release_t::luminous) {
      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
         << "luminous' before using crush device classes";
      err = -EPERM;
      goto reply;
    }

    if (!osdmap.crush->class_exists(device_class)) {
      // not in the committed map: nothing to do
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(device_class)) {
      // already removed in the pending increment; wait for the proposal to
      // commit so a replayed command still reports success
      err = 0; // make command idempotent
      goto wait;
    }
    int class_id = newcrush.get_class_id(device_class);
    stringstream ts;
    if (newcrush.class_is_in_use(class_id, &ts)) {
      err = -EBUSY;
      ss << "class '" << device_class << "' " << ts.str();
      goto reply;
    }

    // check if class is used by any erasure-code-profiles
    // (merge committed and pending profiles so a profile being created in
    // this same epoch also blocks the removal)
    mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
      osdmap.get_erasure_code_profiles();
    auto ec_profiles = pending_inc.get_erasure_code_profiles();
#ifdef HAVE_STDLIB_MAP_SPLICING
    ec_profiles.merge(old_ec_profiles);
#else
    ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
                       make_move_iterator(end(old_ec_profiles)));
#endif
    list<string> referenced_by;
    for (auto &i: ec_profiles) {
      for (auto &j: i.second) {
        if ("crush-device-class" == j.first && device_class == j.second) {
          referenced_by.push_back(i.first);
        }
      }
    }
    if (!referenced_by.empty()) {
      err = -EBUSY;
      ss << "class '" << device_class
         << "' is still referenced by erasure-code-profile(s): " << referenced_by;
      goto reply;
    }

    set<int> osds;
    newcrush.get_devices_by_class(device_class, &osds);
    for (auto& p: osds) {
      err = newcrush.remove_device_class(g_ceph_context, p, &ss);
      if (err < 0) {
        // ss has reason for failure
        goto reply;
      }
    }

    if (osds.empty()) {
      // empty class, remove directly
      err = newcrush.remove_class_name(device_class);
      if (err < 0) {
        ss << "class '" << device_class << "' cannot be removed '"
           << cpp_strerror(err) << "'";
        goto reply;
      }
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "removed class " << device_class << " with id " << class_id
       << " from crush map";
    goto update;
  } else if (prefix == "osd crush class rename") {
    // Rename a device class.  A replay (source gone, destination present)
    // is treated as success so the command stays idempotent.
    string srcname, dstname;
    if (!cmd_getval(cmdmap, "srcname", srcname)) {
      err = -EINVAL;
      goto reply;
    }
    if (!cmd_getval(cmdmap, "dstname", dstname)) {
      err = -EINVAL;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
      // suppose this is a replay and return success
      // so command is idempotent
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_class(srcname, dstname);
    if (err < 0) {
      ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
         << cpp_strerror(err);
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "rename class '" << srcname << "' to '" << dstname << "'";
    goto update;
  } else if (prefix == "osd crush add-bucket") {
    // osd crush add-bucket <name> <type> [<loc1> [<loc2> ...]]
    // Create a new empty bucket and optionally move it into a location.
    string name, typestr;
    vector<string> argvec;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "type", typestr);
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    if (!argvec.empty()) {
      CrushWrapper::parse_loc_map(argvec, &loc);
      dout(0) << "will create and move bucket '" << name
              << "' to location " << loc << dendl;
    }

    // Fast path: nothing pending and the committed map already has the
    // bucket name -> reply immediately (idempotent no-op).
    if (!_have_pending_crush() &&
        _get_stable_crush().name_exists(name)) {
      ss << "bucket '" << name << "' already exists";
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.name_exists(name)) {
      // already created in the pending increment; wait for it to commit
      ss << "bucket '" << name << "' already exists";
      goto update;
    }
    int type = newcrush.get_type_id(typestr);
    if (type < 0) {
      ss << "type '" << typestr << "' does not exist";
      err = -EINVAL;
      goto reply;
    }
    if (type == 0) {
      // type id 0 is reserved for devices (OSDs)
      ss << "type '" << typestr << "' is for devices, not buckets";
      err = -EINVAL;
      goto reply;
    }
    int bucketno;
    err = newcrush.add_bucket(0, 0,
                              CRUSH_HASH_DEFAULT, type, 0, NULL,
                              NULL, &bucketno);
    if (err < 0) {
      ss << "add_bucket error: '" << cpp_strerror(err) << "'";
      goto reply;
    }
    err = newcrush.set_item_name(bucketno, name);
    if (err < 0) {
      ss << "error setting bucket name to '" << name << "'";
      goto reply;
    }

    if (!loc.empty()) {
      if (!newcrush.check_item_loc(cct, bucketno, loc,
                                   (int *)NULL)) {
        err = newcrush.move_bucket(cct, bucketno, loc);
        if (err < 0) {
          ss << "error moving bucket '" << name << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << bucketno << " name '" << name
           << "' to location " << loc << " in crush map";
      }
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    if (loc.empty()) {
      ss << "added bucket " << name << " type " << typestr
         << " to crush map";
    } else {
      ss << "added bucket " << name << " type " << typestr
         << " to location " << loc;
    }
    goto update;
  } else if (prefix == "osd crush rename-bucket") {
    // osd crush rename-bucket <srcname> <dstname>
    string srcname, dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);

    err = crush_rename_bucket(srcname, dstname, &ss);
    if (err == -EALREADY) // equivalent to success for idempotency
      err = 0;
    if (err)
      goto reply;
    else
      goto update;
  } else if (prefix == "osd crush weight-set create" ||
             prefix == "osd crush weight-set create-compat") {
    // Create a weight-set (crush choose_args), either per-pool ("create")
    // or the single backward-compatible set ("create-compat").
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    int positions;
    if (newcrush.has_non_straw2_buckets()) {
      // weight-sets require all buckets to be straw2
      ss << "crush map contains one or more bucket(s) that are not straw2";
      err = -EPERM;
      goto reply;
    }
    if (prefix == "osd crush weight-set create") {
      if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
          osdmap.require_min_compat_client < ceph_release_t::luminous) {
        ss << "require_min_compat_client "
           << osdmap.require_min_compat_client
           << " < luminous, which is required for per-pool weight-sets. "
           << "Try 'ceph osd set-require-min-compat-client luminous' "
           << "before using the new interface";
        err = -EPERM;
        goto reply;
      }
      string poolname, mode;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      cmd_getval(cmdmap, "mode", mode);
      if (mode != "flat" && mode != "positional") {
        ss << "unrecognized weight-set mode '" << mode << "'";
        err = -EINVAL;
        goto reply;
      }
      // "positional" keeps one weight per replica position (pool size)
      positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
    } else {
      // compat weight-set: a single, pool-independent choose_args set
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      positions = 1;
    }
    if (!newcrush.create_choose_args(pool, positions)) {
      if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
        ss << "compat weight-set already created";
      } else {
        ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
           << "' already created";
      }
      goto reply;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set rm" ||
             prefix == "osd crush weight-set rm-compat") {
    // Remove the per-pool or compat weight-set (no-op if absent).
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set rm") {
      string poolname;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
    }
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set reweight" ||
             prefix == "osd crush weight-set reweight-compat") {
    // Set the weight-set weight(s) for one item; positional sets require
    // exactly one value per position.
    string poolname, item;
    vector<double> weight;
    cmd_getval(cmdmap, "pool", poolname);
    cmd_getval(cmdmap, "item", item);
    cmd_getval(cmdmap, "weight", weight);
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set reweight") {
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      if (!newcrush.have_choose_args(pool)) {
        ss << "no weight-set for pool '" << poolname << "'";
        err = -ENOENT;
        goto reply;
      }
      auto arg_map = newcrush.choose_args_get(pool);
      int positions = newcrush.get_choose_args_positions(arg_map);
      if (weight.size() != (size_t)positions) {
        ss << "must specify exact " << positions << " weight values";
        err = -EINVAL;
        goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      if (!newcrush.have_choose_args(pool)) {
        ss << "no backward-compatible weight-set";
        err = -ENOENT;
        goto reply;
      }
    }
    if (!newcrush.name_exists(item)) {
      ss << "item '" << item << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    err = newcrush.choose_args_adjust_item_weightf(
      cct,
      newcrush.choose_args_get(pool),
      newcrush.get_item_id(item),
      weight,
      &ss);
    if (err < 0) {
      goto reply;
    }
    err = 0;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;
  } else if (osdid_present &&
             (prefix == "osd crush set" || prefix == "osd crush add")) {
    // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
    // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
    // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]

    if (!osdmap.exists(osdid)) {
      err = -ENOENT;
      ss << osd_name
         << " does not exist. Create it before updating the crush map";
      goto reply;
    }

    double weight;
    if (!cmd_getval(cmdmap, "weight", weight)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    string args;
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // "set" only updates an existing crush item; "add" may create one
    if (prefix == "osd crush set"
        && !_get_stable_crush().item_exists(osdid)) {
      err = -ENOENT;
      ss << "unable to set item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc
         << ": does not exist";
      goto reply;
    }

    dout(5) << "adding/updating crush item id " << osdid << " name '"
            << osd_name << "' weight " << weight << " at location "
            << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string action;
    if (prefix == "osd crush set" ||
        newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
      action = "set";
      err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
    } else {
      action = "add";
      err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
      if (err == 0)
        err = 1;  // force err > 0 so the "no change" shortcut below is skipped
    }

    if (err < 0)
      goto reply;

    if (err == 0 && !_have_pending_crush()) {
      // update_item reported no change and nothing else is pending
      ss << action << " item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc << ": no change";
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
       << weight << " at location " << loc << " to crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush create-or-move") {
    do {
      // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
      if (!osdmap.exists(osdid)) {
        err = -ENOENT;
        ss << osd_name
           << " does not exist. create it before updating the crush map";
        goto reply;
      }

      double weight;
      if (!cmd_getval(cmdmap, "weight", weight)) {
        ss << "unable to parse weight value '"
           << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
        err = -EINVAL;
        goto reply;
      }

      string args;
      vector<string> argvec;
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "create-or-move crush item name '" << osd_name
              << "' initial_weight " << weight << " at location " << loc
              << dendl;

      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
                                         g_conf()->osd_crush_update_weight_set);
      if (err == 0) {
        // no change needed; break out and reply without a new proposal
        ss << "create-or-move updated item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        break;
      }
      if (err > 0) {
        pending_inc.crush.clear();
        newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        ss << "create-or-move updating item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                              get_last_committed() + 1));
        return true;
      }
    } while (false);

  } else if (prefix == "osd crush move") {
    do {
      // osd crush move <name> <loc1> [<loc2> ...]
      string name;
      vector<string> argvec;
      cmd_getval(cmdmap, "name", name);
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      if (!newcrush.name_exists(name)) {
        err = -ENOENT;
        ss << "item " << name << " does not exist";
        break;
      }
      int id = newcrush.get_item_id(name);

      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        if (id >= 0) {
          // non-negative ids are devices (OSDs); negative ids are buckets
          err = newcrush.create_or_move_item(
            cct, id, 0, name, loc,
            g_conf()->osd_crush_update_weight_set);
        } else {
          err = newcrush.move_bucket(cct, id, loc);
        }
        if (err >= 0) {
          ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
          getline(ss, rs);
          wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                                get_last_committed() + 1));
          return true;
        }
      } else {
        // already at the requested location
        ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
        err = 0;
      }
    } while (false);
  } else if (prefix == "osd crush swap-bucket") {
    // osd crush swap-bucket <source> <dest> [--yes-i-really-mean-it]
    // Swap two buckets via CrushWrapper::swap_bucket; safety checks below
    // can be overridden with --yes-i-really-mean-it.
    string source, dest;
    cmd_getval(cmdmap, "source", source);
    cmd_getval(cmdmap, "dest", dest);

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.name_exists(source)) {
      ss << "source item " << source << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (!newcrush.name_exists(dest)) {
      ss << "dest item " << dest << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    int sid = newcrush.get_item_id(source);
    int did = newcrush.get_item_id(dest);
    int sparent;
    // require the source to be an orphan (no parent) unless forced
    if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
      ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    // require matching bucket algorithms (e.g. straw2) unless forced
    if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
        !force) {
      ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
         << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
         << "; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    int r = newcrush.swap_bucket(cct, sid, did);
    if (r < 0) {
      ss << "failed to swap bucket contents: " << cpp_strerror(r);
      err = r;
      goto reply;
    }
    ss << "swapped bucket of " << source << " to " << dest;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    wait_for_finished_proposal(op,
                               new Monitor::C_Command(mon, op, err, ss.str(),
                                                      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush link") {
    // osd crush link <name> <loc1> [<loc2> ...]
    // Link an existing item in at an (additional) location.
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // Need an explicit check for name_exists because get_item_id returns
    // 0 on unfound.
    int id = osdmap.crush->get_item_id(name);
    if (!osdmap.crush->name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
    }
    if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
      ss << "no need to move item id " << id << " name '" << name
         << "' to location " << loc << " in crush map";
      err = 0;
      goto reply;
    }

    dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      // re-resolve against the pending map; it may differ from the stable one
      int id = newcrush.get_item_id(name);
      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        err = newcrush.link_bucket(cct, id, loc);
        if (err >= 0) {
          ss << "linked item id " << id << " name '" << name
             << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        } else {
          ss << "cannot link item id " << id << " name '" << name
             << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << id << " name '" << name
           << "' to location " << loc << " in crush map";
        err = 0;
      }
    }
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush rm" ||
             prefix == "osd crush remove" ||
             prefix == "osd crush unlink") {
    do {
      // osd crush rm <id> [ancestor]
      // "unlink" removes only the link under [ancestor]; "rm"/"remove"
      // remove the item itself.
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      string name;
      cmd_getval(cmdmap, "name", name);

      if (!osdmap.crush->name_exists(name)) {
        // not in the committed map: nothing to do
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        break;
      }
      if (!newcrush.name_exists(name)) {
        // already removed in the pending map; wait for the proposal so a
        // replayed command still reports success
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                              get_last_committed() + 1));
        return true;
      }
      int id = newcrush.get_item_id(name);
      int ancestor = 0;

      bool unlink_only = prefix == "osd crush unlink";
      string ancestor_str;
      if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
        if (!newcrush.name_exists(ancestor_str)) {
          err = -ENOENT;
          ss << "ancestor item '" << ancestor_str
             << "' does not appear in the crush map";
          break;
        }
        ancestor = newcrush.get_item_id(ancestor_str);
      }

      // NOTE(review): the bool argument is (ancestor < 0), i.e. whether a
      // valid ancestor bucket id was supplied — confirm against the
      // prepare_command_osd_crush_remove declaration.
      err = prepare_command_osd_crush_remove(
          newcrush,
          id, ancestor,
          (ancestor < 0), unlink_only);

      if (err == -ENOENT) {
        ss << "item " << id << " does not appear in that position";
        err = 0;
        break;
      }
      if (err == 0) {
        if (!unlink_only)
          pending_inc.new_crush_node_flags[id] = 0;  // clear node flags for the removed item
        ss << "removed item id " << id << " name '" << name << "' from crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                              get_last_committed() + 1));
        return true;
      }
    } while (false);

  } else if (prefix == "osd crush reweight-all") {
    // Recompute all bucket weights from the leaves up.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    newcrush.reweight(cct);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted crush hierarchy";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight") {
    // osd crush reweight <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    if (id < 0) {
      // negative ids are buckets; only leaves (devices) can be reweighted here
      ss << "device '" << name << "' is not a leaf in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_item_weightf(cct, id, w,
                                       g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted item id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight-subtree") {
    // osd crush reweight-subtree <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    if (id >= 0) {
      // non-negative ids are devices; this command needs a bucket (subtree)
      ss << "device '" << name << "' is not a subtree in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_subtree_weightf(cct, id, w,
                                          g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush tunables") {
    // Apply a named crush tunables profile.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "legacy" || profile == "argonaut") {
      newcrush.set_tunables_legacy();
    } else if (profile == "bobtail") {
      newcrush.set_tunables_bobtail();
    } else if (profile == "firefly") {
      newcrush.set_tunables_firefly();
    } else if (profile == "hammer") {
      newcrush.set_tunables_hammer();
    } else if (profile == "jewel") {
      newcrush.set_tunables_jewel();
    } else if (profile == "optimal") {
      newcrush.set_tunables_optimal();
    } else if (profile == "default") {
      newcrush.set_tunables_default();
    } else {
      ss << "unrecognized profile '" << profile << "'";
      err = -EINVAL;
      goto reply;
    }

    // reject a map requiring features the cluster does not support
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunables profile to " << profile;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush set-tunable") {
    // Set a single named tunable (currently only straw_calc_version).
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string tunable;
    cmd_getval(cmdmap, "tunable", tunable);

    int64_t value = -1;
    if (!cmd_getval(cmdmap, "value", value)) {
      err = -EINVAL;
      ss << "failed to parse integer value "
         << cmd_vartype_stringify(cmdmap.at("value"));
      goto reply;
    }

    if (tunable == "straw_calc_version") {
      if (value != 0 && value != 1) {
        ss << "value must be 0 or 1; got " << value;
        err = -EINVAL;
        goto reply;
      }
      newcrush.set_straw_calc_version(value);
    } else {
      ss << "unrecognized tunable '" << tunable << "'";
      err = -EINVAL;
      goto reply;
    }

    // reject a map requiring features the cluster does not support
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunable " << tunable << " to " << value;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-simple") {
    // osd crush rule create-simple <name> <root> <type> [<mode>]
    string name, root, type, mode;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "mode", mode);
    if (mode == "")
      mode = "firstn";

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
                                            pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-replicated") {
    // osd crush rule create-replicated <name> <root> <type> [<class>]
    // Like create-simple, but with a device class and fixed firstn mode.
    string name, root, type, device_class;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "class", device_class);

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(
        name, root, type, device_class,
        "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd erasure-code-profile rm") {
    // Remove an erasure-code profile; refused while any pool references it.
    string name;
    cmd_getval(cmdmap, "name", name);

    // a pending pool may reference it; wait for the pending map to commit
    // before deciding
    if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
      goto wait;

    if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
      err = -EBUSY;
      goto reply;
    }

    if (osdmap.has_erasure_code_profile(name) ||
        pending_inc.new_erasure_code_profiles.count(name)) {
      if (osdmap.has_erasure_code_profile(name)) {
        pending_inc.old_erasure_code_profiles.push_back(name);
      } else {
        // profile only exists in the pending increment: cancel its creation
        dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
        pending_inc.new_erasure_code_profiles.erase(name);
      }

      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                            get_last_committed() + 1));
      return true;
    } else {
      ss << "erasure-code-profile " << name << " does not exist";
      err = 0;
      goto reply;
    }

  } else if (prefix == "osd erasure-code-profile set") {
    // Create or overwrite an erasure-code profile; overwriting an existing,
    // different profile requires --force.
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> profile;
    cmd_getval(cmdmap, "profile", profile);

    bool force = false;
    cmd_getval(cmdmap, "force", force);

    map<string,string> profile_map;
    err = parse_erasure_code_profile(profile, &profile_map, &ss);
    if (err)
      goto reply;
    if (profile_map.find("plugin") == profile_map.end()) {
      ss << "erasure-code-profile " << profile_map
         << " must contain a plugin entry" << std::endl;
      err = -EINVAL;
      goto reply;
    }
    string plugin = profile_map["plugin"];

    if (pending_inc.has_erasure_code_profile(name)) {
      // a change to this profile is already pending; retry after it commits
      dout(20) << "erasure code profile " << name << " try again" << dendl;
      goto wait;
    } else {
      err = normalize_profile(name, profile_map, force, &ss);
      if (err)
        goto reply;

      if (osdmap.has_erasure_code_profile(name)) {
        // normalize the existing profile too so that equivalent profiles
        // compare equal and a replayed command stays idempotent
        ErasureCodeProfile existing_profile_map =
          osdmap.get_erasure_code_profile(name);
        err = normalize_profile(name, existing_profile_map, force, &ss);
        if (err)
          goto reply;

        if (existing_profile_map == profile_map) {
          err = 0;
          goto reply;
        }
        if (!force) {
          err = -EPERM;
          ss << "will not override erasure code profile " << name
             << " because the existing profile "
             << existing_profile_map
             << " is different from the proposed profile "
             << profile_map;
          goto reply;
        }
      }

      dout(20) << "erasure code profile set " << name << "="
               << profile_map << dendl;
      pending_inc.set_erasure_code_profile(name, profile_map);
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-erasure") {
    // Create a CRUSH rule suitable for an erasure-coded pool, based on an
    // erasure-code profile (defaulting to "default").
    err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err)
      goto reply;
    // NOTE(review): 'poolstr' is declared but never assigned or used here.
    string name, poolstr;
    cmd_getval(cmdmap, "name", name);
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "")
      profile = "default";
    if (profile == "default") {
      // Lazily materialize the built-in default profile if it is neither in
      // the committed map nor already pending.
      if (!osdmap.has_erasure_code_profile(profile)) {
        if (pending_inc.has_erasure_code_profile(profile)) {
          dout(20) << "erasure code profile " << profile << " already pending" << dendl;
          goto wait;
        }

        map<string,string> profile_map;
        err = osdmap.get_erasure_code_profile_default(cct,
                                                      profile_map,
                                                      &ss);
        if (err)
          goto reply;
        err = normalize_profile(name, profile_map, true, &ss);
        if (err)
          goto reply;
        dout(20) << "erasure code profile set " << profile << "="
                 << profile_map << dendl;
        pending_inc.set_erasure_code_profile(profile, profile_map);
        // Wait for the profile to commit before creating the rule.
        goto wait;
      }
    }

    int rule;
    err = crush_rule_create_erasure(name, profile, &rule, &ss);
    if (err < 0) {
      switch(err) {
      case -EEXIST: // return immediately
        ss << "rule " << name << " already exists";
        err = 0;
        goto reply;
        break;
      case -EALREADY: // wait for pending to be proposed
        ss << "rule " << name << " already exists";
        err = 0;
        break;
      default: // non recoverable error
        goto reply;
        break;
      }
    } else {
      ss << "created rule " << name << " at " << rule;
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rm") {
    // Remove a CRUSH rule by name; refuses if any pool still references it.
    string name;
    cmd_getval(cmdmap, "name", name);

    if (!osdmap.crush->rule_exists(name)) {
      // Idempotent: already gone in the committed map.
      ss << "rule " << name << " does not exist";
      err = 0;
      goto reply;
    }

    // Work on a copy that includes any already-pending crush changes.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.rule_exists(name)) {
      // Removal already pending: report success without re-encoding.
      ss << "rule " << name << " does not exist";
      err = 0;
    } else {
      int ruleno = newcrush.get_rule_id(name);
      ceph_assert(ruleno >= 0);

      // make sure it is not in use.
      // FIXME: this is ok in some situations, but let's not bother with that
      // complexity now.
      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
      if (osdmap.crush_rule_in_use(ruleset)) {
        ss << "crush ruleset " << name << " " << ruleset << " is in use";
        err = -EBUSY;
        goto reply;
      }

      err = newcrush.remove_rule(ruleno);
      if (err < 0) {
        goto reply;
      }

      // Replace any previously-pending crush blob with the updated one.
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rename") {
    // Rename a CRUSH rule; idempotent with respect to command replay.
    string srcname;
    string dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);
    if (srcname.empty() || dstname.empty()) {
      ss << "must specify both source rule name and destination rule name";
      err = -EINVAL;
      goto reply;
    }
    if (srcname == dstname) {
      ss << "destination rule name is equal to source rule name";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
      // srcname does not exist and dstname already exists
      // suppose this is a replay and return success
      // (so this command is idempotent)
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_rule(srcname, dstname, &ss);
    if (err < 0) {
      // ss has reason for failure
      goto reply;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd setmaxosd") {
    // Set the maximum OSD id (array size), not the number of OSDs.
    int64_t newmax;
    if (!cmd_getval(cmdmap, "newmax", newmax)) {
      ss << "unable to parse 'newmax' value '"
         << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
      err = -EINVAL;
      goto reply;
    }

    if (newmax > g_conf()->mon_max_osd) {
      err = -ERANGE;
      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
         << g_conf()->mon_max_osd << ")";
      goto reply;
    }

    // Don't allow shrinking OSD number as this will cause data loss
    // and may cause kernel crashes.
    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
    if (newmax < osdmap.get_max_osd()) {
      // Check if the OSDs exist between current max and new value.
      // If there are any OSDs exist, then don't allow shrinking number
      // of OSDs.
      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
        if (osdmap.exists(i)) {
          err = -EBUSY;
          ss << "cannot shrink max_osd to " << newmax
             << " because osd." << i << " (and possibly others) still in use";
          goto reply;
        }
      }
    }

    pending_inc.new_max_osd = newmax;
    ss << "set new max_osd = " << pending_inc.new_max_osd;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd set-full-ratio" ||
             prefix == "osd set-backfillfull-ratio" ||
             prefix == "osd set-nearfull-ratio") {
    // Update one of the cluster fullness thresholds in the pending map.
    // NOTE(review): no range validation on the ratio is done here.
    double n;
    if (!cmd_getval(cmdmap, "ratio", n)) {
      ss << "unable to parse 'ratio' value '"
         << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
      err = -EINVAL;
      goto reply;
    }
    if (prefix == "osd set-full-ratio")
      pending_inc.new_full_ratio = n;
    else if (prefix == "osd set-backfillfull-ratio")
      pending_inc.new_backfillfull_ratio = n;
    else if (prefix == "osd set-nearfull-ratio")
      pending_inc.new_nearfull_ratio = n;
    ss << prefix << " " << n;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // Raise the minimum client release the osdmap will accept.  Refuses if
    // currently-connected daemons/clients look older than the target,
    // unless --yes-i-really-mean-it is given.
    string v;
    cmd_getval(cmdmap, "version", v);
    ceph_release_t vno = ceph_release_from_name(v);
    if (!vno) {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // Build the would-be map (committed + pending) to compute the floor
    // imposed by features already in use.
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = vno;
    auto mvno = newmap.get_min_compat_client();
    if (vno < mvno) {
      ss << "osdmap current utilizes features that require " << mvno
         << "; cannot set require_min_compat_client below that to " << vno;
      err = -EPERM;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      // Inspect connected sessions by entity type and flag any whose
      // advertised features fall short of the requested release.
      FeatureMap m;
      mon->get_combined_feature_map(&m);
      uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
      bool first = true;
      bool ok = true;
      for (int type : {
            CEPH_ENTITY_TYPE_CLIENT,
            CEPH_ENTITY_TYPE_MDS,
            CEPH_ENTITY_TYPE_MGR }) {
        auto p = m.m.find(type);
        if (p == m.m.end()) {
          continue;
        }
        for (auto& q : p->second) {
          uint64_t missing = ~q.first & features;
          if (missing) {
            if (first) {
              ss << "cannot set require_min_compat_client to " << v << ": ";
            } else {
              ss << "; ";
            }
            first = false;
            ss << q.second << " connected " << ceph_entity_type_name(type)
               << "(s) look like " << ceph_release_name(
                 ceph_release_from_features(q.first))
               << " (missing 0x" << std::hex << missing << std::dec << ")";
            ok = false;
          }
        }
      }
      if (!ok) {
        ss << "; add --yes-i-really-mean-it to do it anyway";
        err = -EPERM;
        goto reply;
      }
    }
    ss << "set require_min_compat_client to " << vno;
    pending_inc.new_require_min_compat_client = vno;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    // Shorthand for setting both read and write pause flags.
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd set") {
    // Set a cluster-wide osdmap flag by name.  prepare_set_flag() queues
    // the change and replies after the proposal commits.
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);

    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else if (key == "pglog_hardlimit") {
      if (!osdmap.get_num_up_osds() && !sure) {
        ss << "Not advisable to continue since no OSDs are up. Pass "
           << "--yes-i-really-mean-it if you really wish to continue.";
        err = -EPERM;
        goto reply;
      }
      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
      // we are reusing a jewel feature bit that was retired in luminous.
      if (osdmap.require_osd_release >= ceph_release_t::luminous &&
          (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
           || sure)) {
        return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
      } else {
        ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd unset") {
    // Clear a cluster-wide osdmap flag by name (mirror of "osd set";
    // pglog_hardlimit intentionally cannot be unset here).
    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd require-osd-release") {
    // Raise require_osd_release.  Each supported release is gated on all
    // mons advertising the matching mon feature and (unless --yes-i-really-
    // mean-it) all up OSDs advertising the matching server feature.
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply;
    }
    ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
    if (!osdmap.get_num_up_osds() && !sure) {
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply;
    }
    if (rel == ceph_release_t::mimic) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_MIMIC)) {
        ss << "not all mons are mimic";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::nautilus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_NAUTILUS)) {
        ss << "not all mons are nautilus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::octopus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_OCTOPUS)) {
        ss << "not all mons are octopus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "not supported for this release yet";
      err = -EPERM;
      goto reply;
    }
    // Downgrades are never allowed, even with --yes-i-really-mean-it.
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
  } else if (prefix == "osd down" ||
             prefix == "osd out" ||
             prefix == "osd in" ||
             prefix == "osd rm" ||
             prefix == "osd stop") {
    // Per-OSD state commands.  Accepts a list of osd ids or a wildcard
    // ("any"/"all"/"*"); replies only if at least one change was queued.

    bool any = false;            // did we queue at least one change?
    bool stop = false;           // wildcard consumed; stop parsing ids
    bool verbose = true;
    bool definitely_dead = false;

    vector<string> idvec;
    cmd_getval(cmdmap, "ids", idvec);
    cmd_getval(cmdmap, "definitely_dead", definitely_dead);
    // NOTE(review): derr on every invocation looks like leftover debug
    // output at error severity; dout would be more appropriate.
    derr << "definitely_dead " << (int)definitely_dead << dendl;
    for (unsigned j = 0; j < idvec.size() && !stop; j++) {
      set<int> osds;

      // wildcard?
      if (j == 0 &&
          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
        if (prefix == "osd in") {
          // touch out osds only
          osdmap.get_out_existing_osds(osds);
        } else {
          osdmap.get_all_osds(osds);
        }
        stop = true;
        verbose = false; // so the output is less noisy.
      } else {
        long osd = parse_osd_id(idvec[j].c_str(), &ss);
        if (osd < 0) {
          // NOTE(review): missing space in message ("invalid osd id-1").
          ss << "invalid osd id" << osd;
          err = -EINVAL;
          continue;
        } else if (!osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist. ";
          continue;
        }

        osds.insert(osd);
      }

      for (auto &osd : osds) {
        if (prefix == "osd down") {
          if (osdmap.is_down(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already down. ";
          } else {
            // presumably pending state bits are XORed into the map, so
            // setting CEPH_OSD_UP here marks an up osd down — TODO confirm
            // against OSDMap::Incremental semantics.
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
            ss << "marked down osd." << osd << ". ";
            any = true;
          }
          if (definitely_dead) {
            if (!pending_inc.new_xinfo.count(osd)) {
              pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
            }
            if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
              any = true;
            }
            pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
          }
        } else if (prefix == "osd out") {
          if (osdmap.is_out(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already out. ";
          } else {
            pending_inc.new_weight[osd] = CEPH_OSD_OUT;
            if (osdmap.osd_weight[osd]) {
              // Remember the previous weight so "osd in" can restore it.
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
            }
            ss << "marked out osd." << osd << ". ";
            std::ostringstream msg;
            msg << "Client " << op->get_session()->entity_name
                << " marked osd." << osd << " out";
            if (osdmap.is_up(osd)) {
              msg << ", while it was still marked up";
            } else {
              auto period = ceph_clock_now() - down_pending_out[osd];
              msg << ", after it was down for " << int(period.sec())
                  << " seconds";
            }

            mon->clog->info() << msg.str();
            any = true;
          }
        } else if (prefix == "osd in") {
          if (osdmap.is_in(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already in. ";
          } else {
            if (osdmap.osd_xinfo[osd].old_weight > 0) {
              // Restore the weight saved by a previous "osd out".
              pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = 0;
            } else {
              pending_inc.new_weight[osd] = CEPH_OSD_IN;
            }
            ss << "marked in osd." << osd << ". ";
            any = true;
          }
        } else if (prefix == "osd rm") {
          err = prepare_command_osd_remove(osd);

          if (err == -EBUSY) {
            if (any)
              ss << ", ";
            ss << "osd." << osd << " is still up; must be down before removal. ";
          } else {
            ceph_assert(err == 0);
            if (any) {
              ss << ", osd." << osd;
            } else {
              ss << "removed osd." << osd;
            }
            any = true;
          }
        } else if (prefix == "osd stop") {
          if (osdmap.is_stop(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already stopped. ";
          } else if (osdmap.is_down(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
            ss << "stop down osd." << osd << ". ";
            any = true;
          } else {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
            ss << "stop osd." << osd << ". ";
            any = true;
          }
        }
      }
    }
    if (any) {
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
                                                            get_last_committed() + 1));
      return true;
    }
11389 } else if (prefix == "osd set-group" ||
11390 prefix == "osd unset-group" ||
11391 prefix == "osd add-noup" ||
11392 prefix == "osd add-nodown" ||
11393 prefix == "osd add-noin" ||
11394 prefix == "osd add-noout" ||
11395 prefix == "osd rm-noup" ||
11396 prefix == "osd rm-nodown" ||
11397 prefix == "osd rm-noin" ||
11398 prefix == "osd rm-noout") {
11399 bool do_set = prefix == "osd set-group" ||
11400 prefix.find("add") != string::npos;
11401 string flag_str;
11402 unsigned flags = 0;
11403 vector<string> who;
11404 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11405 cmd_getval(cmdmap, "flags", flag_str);
11406 cmd_getval(cmdmap, "who", who);
11407 vector<string> raw_flags;
11408 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11409 for (auto& f : raw_flags) {
11410 if (f == "noup")
11411 flags |= CEPH_OSD_NOUP;
11412 else if (f == "nodown")
11413 flags |= CEPH_OSD_NODOWN;
11414 else if (f == "noin")
11415 flags |= CEPH_OSD_NOIN;
11416 else if (f == "noout")
11417 flags |= CEPH_OSD_NOOUT;
11418 else {
11419 ss << "unrecognized flag '" << f << "', must be one of "
11420 << "{noup,nodown,noin,noout}";
11421 err = -EINVAL;
11422 goto reply;
11423 }
11424 }
11425 } else {
11426 cmd_getval(cmdmap, "ids", who);
11427 if (prefix.find("noup") != string::npos)
11428 flags = CEPH_OSD_NOUP;
11429 else if (prefix.find("nodown") != string::npos)
11430 flags = CEPH_OSD_NODOWN;
11431 else if (prefix.find("noin") != string::npos)
11432 flags = CEPH_OSD_NOIN;
11433 else if (prefix.find("noout") != string::npos)
11434 flags = CEPH_OSD_NOOUT;
11435 else
11436 ceph_assert(0 == "Unreachable!");
11437 }
11438 if (flags == 0) {
11439 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11440 err = -EINVAL;
11441 goto reply;
11442 }
11443 if (who.empty()) {
11444 ss << "must specify at least one or more targets to set/unset";
11445 err = -EINVAL;
11446 goto reply;
11447 }
11448 set<int> osds;
11449 set<int> crush_nodes;
11450 set<int> device_classes;
11451 for (auto& w : who) {
11452 if (w == "any" || w == "all" || w == "*") {
11453 osdmap.get_all_osds(osds);
11454 break;
11455 }
11456 std::stringstream ts;
11457 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11458 osds.insert(osd);
11459 } else if (osdmap.crush->name_exists(w)) {
11460 crush_nodes.insert(osdmap.crush->get_item_id(w));
11461 } else if (osdmap.crush->class_exists(w)) {
11462 device_classes.insert(osdmap.crush->get_class_id(w));
11463 } else {
11464 ss << "unable to parse osd id or crush node or device class: "
11465 << "\"" << w << "\". ";
11466 }
11467 }
11468 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11469 // ss has reason for failure
11470 err = -EINVAL;
11471 goto reply;
11472 }
11473 bool any = false;
11474 for (auto osd : osds) {
11475 if (!osdmap.exists(osd)) {
11476 ss << "osd." << osd << " does not exist. ";
11477 continue;
11478 }
11479 if (do_set) {
11480 if (flags & CEPH_OSD_NOUP) {
11481 any |= osdmap.is_noup_by_osd(osd) ?
11482 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11483 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11484 }
11485 if (flags & CEPH_OSD_NODOWN) {
11486 any |= osdmap.is_nodown_by_osd(osd) ?
11487 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11488 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11489 }
11490 if (flags & CEPH_OSD_NOIN) {
11491 any |= osdmap.is_noin_by_osd(osd) ?
11492 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11493 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11494 }
11495 if (flags & CEPH_OSD_NOOUT) {
11496 any |= osdmap.is_noout_by_osd(osd) ?
11497 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11498 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11499 }
11500 } else {
11501 if (flags & CEPH_OSD_NOUP) {
11502 any |= osdmap.is_noup_by_osd(osd) ?
11503 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11504 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11505 }
11506 if (flags & CEPH_OSD_NODOWN) {
11507 any |= osdmap.is_nodown_by_osd(osd) ?
11508 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11509 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11510 }
11511 if (flags & CEPH_OSD_NOIN) {
11512 any |= osdmap.is_noin_by_osd(osd) ?
11513 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11514 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11515 }
11516 if (flags & CEPH_OSD_NOOUT) {
11517 any |= osdmap.is_noout_by_osd(osd) ?
11518 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11519 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11520 }
11521 }
11522 }
11523 for (auto& id : crush_nodes) {
11524 auto old_flags = osdmap.get_crush_node_flags(id);
11525 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11526 pending_flags |= old_flags; // adopt existing flags first!
11527 if (do_set) {
11528 pending_flags |= flags;
11529 } else {
11530 pending_flags &= ~flags;
11531 }
11532 any = true;
11533 }
11534 for (auto& id : device_classes) {
11535 auto old_flags = osdmap.get_device_class_flags(id);
11536 auto& pending_flags = pending_inc.new_device_class_flags[id];
11537 pending_flags |= old_flags;
11538 if (do_set) {
11539 pending_flags |= flags;
11540 } else {
11541 pending_flags &= ~flags;
11542 }
11543 any = true;
11544 }
11545 if (any) {
11546 getline(ss, rs);
11547 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11548 get_last_committed() + 1));
11549 return true;
11550 }
  } else if (prefix == "osd pg-temp") {
    // Manually set (or clear, with an empty id list) the pg_temp mapping
    // for a single PG.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (pending_inc.new_pg_temp.count(pgid)) {
      // A pg_temp change for this PG is already queued; retry after commit.
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty()) {
      // Empty list clears the mapping.
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    // The proposed acting set must respect the pool's min_size/size bounds.
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Manually set the primary_temp for a single PG (-1 clears it).
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 means "clear"; any other id must exist.
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by perturbing its pg_temp mapping.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change. Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
11703 } else if (prefix == "osd pg-upmap" ||
11704 prefix == "osd rm-pg-upmap" ||
11705 prefix == "osd pg-upmap-items" ||
11706 prefix == "osd rm-pg-upmap-items") {
11707 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
11708 ss << "min_compat_client "
11709 << osdmap.require_min_compat_client
11710 << " < luminous, which is required for pg-upmap. "
11711 << "Try 'ceph osd set-require-min-compat-client luminous' "
11712 << "before using the new interface";
11713 err = -EPERM;
11714 goto reply;
11715 }
11716 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11717 if (err == -EAGAIN)
11718 goto wait;
11719 if (err < 0)
11720 goto reply;
11721 string pgidstr;
11722 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11723 ss << "unable to parse 'pgid' value '"
11724 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11725 err = -EINVAL;
11726 goto reply;
11727 }
11728 pg_t pgid;
11729 if (!pgid.parse(pgidstr.c_str())) {
11730 ss << "invalid pgid '" << pgidstr << "'";
11731 err = -EINVAL;
11732 goto reply;
11733 }
11734 if (!osdmap.pg_exists(pgid)) {
11735 ss << "pg " << pgid << " does not exist";
11736 err = -ENOENT;
11737 goto reply;
11738 }
11739 if (pending_inc.old_pools.count(pgid.pool())) {
11740 ss << "pool of " << pgid << " is pending removal";
11741 err = -ENOENT;
11742 getline(ss, rs);
11743 wait_for_finished_proposal(op,
11744 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11745 return true;
11746 }
11747
11748 enum {
11749 OP_PG_UPMAP,
11750 OP_RM_PG_UPMAP,
11751 OP_PG_UPMAP_ITEMS,
11752 OP_RM_PG_UPMAP_ITEMS,
11753 } option;
11754
11755 if (prefix == "osd pg-upmap") {
11756 option = OP_PG_UPMAP;
11757 } else if (prefix == "osd rm-pg-upmap") {
11758 option = OP_RM_PG_UPMAP;
11759 } else if (prefix == "osd pg-upmap-items") {
11760 option = OP_PG_UPMAP_ITEMS;
11761 } else {
11762 option = OP_RM_PG_UPMAP_ITEMS;
11763 }
11764
11765 // check pending upmap changes
11766 switch (option) {
11767 case OP_PG_UPMAP: // fall through
11768 case OP_RM_PG_UPMAP:
11769 if (pending_inc.new_pg_upmap.count(pgid) ||
11770 pending_inc.old_pg_upmap.count(pgid)) {
11771 dout(10) << __func__ << " waiting for pending update on "
11772 << pgid << dendl;
11773 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11774 return true;
11775 }
11776 break;
11777
11778 case OP_PG_UPMAP_ITEMS: // fall through
11779 case OP_RM_PG_UPMAP_ITEMS:
11780 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11781 pending_inc.old_pg_upmap_items.count(pgid)) {
11782 dout(10) << __func__ << " waiting for pending update on "
11783 << pgid << dendl;
11784 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11785 return true;
11786 }
11787 break;
11788
11789 default:
11790 ceph_abort_msg("invalid option");
11791 }
11792
11793 switch (option) {
11794 case OP_PG_UPMAP:
11795 {
11796 vector<int64_t> id_vec;
11797 if (!cmd_getval(cmdmap, "id", id_vec)) {
11798 ss << "unable to parse 'id' value(s) '"
11799 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11800 err = -EINVAL;
11801 goto reply;
11802 }
11803
11804 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11805 if ((int)id_vec.size() < pool_min_size) {
11806 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11807 << pool_min_size << ")";
11808 err = -EINVAL;
11809 goto reply;
11810 }
11811
11812 int pool_size = osdmap.get_pg_pool_size(pgid);
11813 if ((int)id_vec.size() > pool_size) {
11814 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11815 << pool_size << ")";
11816 err = -EINVAL;
11817 goto reply;
11818 }
11819
11820 vector<int32_t> new_pg_upmap;
11821 for (auto osd : id_vec) {
11822 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11823 ss << "osd." << osd << " does not exist";
11824 err = -ENOENT;
11825 goto reply;
11826 }
11827 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11828 if (it != new_pg_upmap.end()) {
11829 ss << "osd." << osd << " already exists, ";
11830 continue;
11831 }
11832 new_pg_upmap.push_back(osd);
11833 }
11834
11835 if (new_pg_upmap.empty()) {
11836 ss << "no valid upmap items(pairs) is specified";
11837 err = -EINVAL;
11838 goto reply;
11839 }
11840
11841 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11842 new_pg_upmap.begin(), new_pg_upmap.end());
11843 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
11844 }
11845 break;
11846
11847 case OP_RM_PG_UPMAP:
11848 {
11849 pending_inc.old_pg_upmap.insert(pgid);
11850 ss << "clear " << pgid << " pg_upmap mapping";
11851 }
11852 break;
11853
11854 case OP_PG_UPMAP_ITEMS:
11855 {
11856 vector<int64_t> id_vec;
11857 if (!cmd_getval(cmdmap, "id", id_vec)) {
11858 ss << "unable to parse 'id' value(s) '"
11859 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11860 err = -EINVAL;
11861 goto reply;
11862 }
11863
11864 if (id_vec.size() % 2) {
11865 ss << "you must specify pairs of osd ids to be remapped";
11866 err = -EINVAL;
11867 goto reply;
11868 }
11869
11870 int pool_size = osdmap.get_pg_pool_size(pgid);
11871 if ((int)(id_vec.size() / 2) > pool_size) {
11872 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11873 << pool_size << ")";
11874 err = -EINVAL;
11875 goto reply;
11876 }
11877
11878 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11879 ostringstream items;
11880 items << "[";
11881 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11882 int from = *p++;
11883 int to = *p;
11884 if (from == to) {
11885 ss << "from osd." << from << " == to osd." << to << ", ";
11886 continue;
11887 }
11888 if (!osdmap.exists(from)) {
11889 ss << "osd." << from << " does not exist";
11890 err = -ENOENT;
11891 goto reply;
11892 }
11893 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11894 ss << "osd." << to << " does not exist";
11895 err = -ENOENT;
11896 goto reply;
11897 }
11898 pair<int32_t,int32_t> entry = make_pair(from, to);
11899 auto it = std::find(new_pg_upmap_items.begin(),
11900 new_pg_upmap_items.end(), entry);
11901 if (it != new_pg_upmap_items.end()) {
11902 ss << "osd." << from << " -> osd." << to << " already exists, ";
11903 continue;
11904 }
11905 new_pg_upmap_items.push_back(entry);
11906 items << from << "->" << to << ",";
11907 }
11908 string out(items.str());
11909 out.resize(out.size() - 1); // drop last ','
11910 out += "]";
11911
11912 if (new_pg_upmap_items.empty()) {
11913 ss << "no valid upmap items(pairs) is specified";
11914 err = -EINVAL;
11915 goto reply;
11916 }
11917
11918 pending_inc.new_pg_upmap_items[pgid] =
11919 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11920 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11921 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11922 }
11923 break;
11924
11925 case OP_RM_PG_UPMAP_ITEMS:
11926 {
11927 pending_inc.old_pg_upmap_items.insert(pgid);
11928 ss << "clear " << pgid << " pg_upmap_items mapping";
11929 }
11930 break;
11931
11932 default:
11933 ceph_abort_msg("invalid option");
11934 }
11935
11936 goto update;
11937 } else if (prefix == "osd primary-affinity") {
11938 int64_t id;
11939 if (!cmd_getval(cmdmap, "id", id)) {
11940 ss << "invalid osd id value '"
11941 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11942 err = -EINVAL;
11943 goto reply;
11944 }
11945 double w;
11946 if (!cmd_getval(cmdmap, "weight", w)) {
11947 ss << "unable to parse 'weight' value '"
11948 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11949 err = -EINVAL;
11950 goto reply;
11951 }
11952 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11953 if (ww < 0L) {
11954 ss << "weight must be >= 0";
11955 err = -EINVAL;
11956 goto reply;
11957 }
11958 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11959 osdmap.require_min_compat_client < ceph_release_t::firefly) {
11960 ss << "require_min_compat_client "
11961 << osdmap.require_min_compat_client
11962 << " < firefly, which is required for primary-affinity";
11963 err = -EPERM;
11964 goto reply;
11965 }
11966 if (osdmap.exists(id)) {
11967 pending_inc.new_primary_affinity[id] = ww;
11968 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11969 getline(ss, rs);
11970 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11971 get_last_committed() + 1));
11972 return true;
11973 } else {
11974 ss << "osd." << id << " does not exist";
11975 err = -ENOENT;
11976 goto reply;
11977 }
11978 } else if (prefix == "osd reweight") {
11979 int64_t id;
11980 if (!cmd_getval(cmdmap, "id", id)) {
11981 ss << "unable to parse osd id value '"
11982 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11983 err = -EINVAL;
11984 goto reply;
11985 }
11986 double w;
11987 if (!cmd_getval(cmdmap, "weight", w)) {
11988 ss << "unable to parse weight value '"
11989 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11990 err = -EINVAL;
11991 goto reply;
11992 }
11993 long ww = (int)((double)CEPH_OSD_IN*w);
11994 if (ww < 0L) {
11995 ss << "weight must be >= 0";
11996 err = -EINVAL;
11997 goto reply;
11998 }
11999 if (osdmap.exists(id)) {
12000 pending_inc.new_weight[id] = ww;
12001 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12002 getline(ss, rs);
12003 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12004 get_last_committed() + 1));
12005 return true;
12006 } else {
12007 ss << "osd." << id << " does not exist";
12008 err = -ENOENT;
12009 goto reply;
12010 }
12011 } else if (prefix == "osd reweightn") {
12012 map<int32_t, uint32_t> weights;
12013 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12014 if (err) {
12015 ss << "unable to parse 'weights' value '"
12016 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12017 goto reply;
12018 }
12019 pending_inc.new_weight.insert(weights.begin(), weights.end());
12020 wait_for_finished_proposal(
12021 op,
12022 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12023 return true;
12024 } else if (prefix == "osd lost") {
12025 int64_t id;
12026 if (!cmd_getval(cmdmap, "id", id)) {
12027 ss << "unable to parse osd id value '"
12028 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12029 err = -EINVAL;
12030 goto reply;
12031 }
12032 bool sure = false;
12033 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12034 if (!sure) {
12035 ss << "are you SURE? this might mean real, permanent data loss. pass "
12036 "--yes-i-really-mean-it if you really do.";
12037 err = -EPERM;
12038 goto reply;
12039 } else if (!osdmap.exists(id)) {
12040 ss << "osd." << id << " does not exist";
12041 err = -ENOENT;
12042 goto reply;
12043 } else if (!osdmap.is_down(id)) {
12044 ss << "osd." << id << " is not down";
12045 err = -EBUSY;
12046 goto reply;
12047 } else {
12048 epoch_t e = osdmap.get_info(id).down_at;
12049 pending_inc.new_lost[id] = e;
12050 ss << "marked osd lost in epoch " << e;
12051 getline(ss, rs);
12052 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12053 get_last_committed() + 1));
12054 return true;
12055 }
12056
12057 } else if (prefix == "osd destroy-actual" ||
12058 prefix == "osd purge-actual" ||
12059 prefix == "osd purge-new") {
12060 /* Destroying an OSD means that we don't expect to further make use of
12061 * the OSDs data (which may even become unreadable after this operation),
12062 * and that we are okay with scrubbing all its cephx keys and config-key
12063 * data (which may include lockbox keys, thus rendering the osd's data
12064 * unreadable).
12065 *
12066 * The OSD will not be removed. Instead, we will mark it as destroyed,
12067 * such that a subsequent call to `create` will not reuse the osd id.
12068 * This will play into being able to recreate the OSD, at the same
12069 * crush location, with minimal data movement.
12070 */
12071
12072 // make sure authmon is writeable.
12073 if (!mon->authmon()->is_writeable()) {
12074 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12075 << "osd destroy" << dendl;
12076 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12077 return false;
12078 }
12079
12080 int64_t id;
12081 if (!cmd_getval(cmdmap, "id", id)) {
12082 auto p = cmdmap.find("id");
12083 if (p == cmdmap.end()) {
12084 ss << "no osd id specified";
12085 } else {
12086 ss << "unable to parse osd id value '"
12087 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12088 }
12089 err = -EINVAL;
12090 goto reply;
12091 }
12092
12093 bool is_destroy = (prefix == "osd destroy-actual");
12094 if (!is_destroy) {
12095 ceph_assert("osd purge-actual" == prefix ||
12096 "osd purge-new" == prefix);
12097 }
12098
12099 bool sure = false;
12100 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12101 if (!sure) {
12102 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12103 << "This will mean real, permanent data loss, as well "
12104 << "as deletion of cephx and lockbox keys. "
12105 << "Pass --yes-i-really-mean-it if you really do.";
12106 err = -EPERM;
12107 goto reply;
12108 } else if (!osdmap.exists(id)) {
12109 ss << "osd." << id << " does not exist";
12110 err = 0; // idempotent
12111 goto reply;
12112 } else if (osdmap.is_up(id)) {
12113 ss << "osd." << id << " is not `down`.";
12114 err = -EBUSY;
12115 goto reply;
12116 } else if (is_destroy && osdmap.is_destroyed(id)) {
12117 ss << "destroyed osd." << id;
12118 err = 0;
12119 goto reply;
12120 }
12121
12122 if (prefix == "osd purge-new" &&
12123 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12124 ss << "osd." << id << " is not new";
12125 err = -EPERM;
12126 goto reply;
12127 }
12128
12129 bool goto_reply = false;
12130
12131 paxos->plug();
12132 if (is_destroy) {
12133 err = prepare_command_osd_destroy(id, ss);
12134 // we checked above that it should exist.
12135 ceph_assert(err != -ENOENT);
12136 } else {
12137 err = prepare_command_osd_purge(id, ss);
12138 if (err == -ENOENT) {
12139 err = 0;
12140 ss << "osd." << id << " does not exist.";
12141 goto_reply = true;
12142 }
12143 }
12144 paxos->unplug();
12145
12146 if (err < 0 || goto_reply) {
12147 goto reply;
12148 }
12149
12150 if (is_destroy) {
12151 ss << "destroyed osd." << id;
12152 } else {
12153 ss << "purged osd." << id;
12154 }
12155
12156 getline(ss, rs);
12157 wait_for_finished_proposal(op,
12158 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12159 force_immediate_propose();
12160 return true;
12161
12162 } else if (prefix == "osd new") {
12163
12164 // make sure authmon is writeable.
12165 if (!mon->authmon()->is_writeable()) {
12166 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12167 << "osd new" << dendl;
12168 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12169 return false;
12170 }
12171
12172 map<string,string> param_map;
12173
12174 bufferlist bl = m->get_data();
12175 string param_json = bl.to_str();
12176 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12177
12178 err = get_json_str_map(param_json, ss, &param_map);
12179 if (err < 0)
12180 goto reply;
12181
12182 dout(20) << __func__ << " osd new params " << param_map << dendl;
12183
12184 paxos->plug();
12185 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12186 paxos->unplug();
12187
12188 if (err < 0) {
12189 goto reply;
12190 }
12191
12192 if (f) {
12193 f->flush(rdata);
12194 } else {
12195 rdata.append(ss);
12196 }
12197
12198 if (err == EEXIST) {
12199 // idempotent operation
12200 err = 0;
12201 goto reply;
12202 }
12203
12204 wait_for_finished_proposal(op,
12205 new Monitor::C_Command(mon, op, 0, rs, rdata,
12206 get_last_committed() + 1));
12207 force_immediate_propose();
12208 return true;
12209
12210 } else if (prefix == "osd create") {
12211
12212 // optional id provided?
12213 int64_t id = -1, cmd_id = -1;
12214 if (cmd_getval(cmdmap, "id", cmd_id)) {
12215 if (cmd_id < 0) {
12216 ss << "invalid osd id value '" << cmd_id << "'";
12217 err = -EINVAL;
12218 goto reply;
12219 }
12220 dout(10) << " osd create got id " << cmd_id << dendl;
12221 }
12222
12223 uuid_d uuid;
12224 string uuidstr;
12225 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12226 if (!uuid.parse(uuidstr.c_str())) {
12227 ss << "invalid uuid value '" << uuidstr << "'";
12228 err = -EINVAL;
12229 goto reply;
12230 }
12231 // we only care about the id if we also have the uuid, to
12232 // ensure the operation's idempotency.
12233 id = cmd_id;
12234 }
12235
12236 int32_t new_id = -1;
12237 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12238 if (err < 0) {
12239 if (err == -EAGAIN) {
12240 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12241 return true;
12242 }
12243 // a check has failed; reply to the user.
12244 goto reply;
12245
12246 } else if (err == EEXIST) {
12247 // this is an idempotent operation; we can go ahead and reply.
12248 if (f) {
12249 f->open_object_section("created_osd");
12250 f->dump_int("osdid", new_id);
12251 f->close_section();
12252 f->flush(rdata);
12253 } else {
12254 ss << new_id;
12255 rdata.append(ss);
12256 }
12257 err = 0;
12258 goto reply;
12259 }
12260
12261 string empty_device_class;
12262 do_osd_create(id, uuid, empty_device_class, &new_id);
12263
12264 if (f) {
12265 f->open_object_section("created_osd");
12266 f->dump_int("osdid", new_id);
12267 f->close_section();
12268 f->flush(rdata);
12269 } else {
12270 ss << new_id;
12271 rdata.append(ss);
12272 }
12273 wait_for_finished_proposal(op,
12274 new Monitor::C_Command(mon, op, 0, rs, rdata,
12275 get_last_committed() + 1));
12276 return true;
12277
12278 } else if (prefix == "osd blacklist clear") {
12279 pending_inc.new_blacklist.clear();
12280 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12281 osdmap.get_blacklist(&blacklist);
12282 for (const auto &entry : blacklist) {
12283 pending_inc.old_blacklist.push_back(entry.first);
12284 }
12285 ss << " removed all blacklist entries";
12286 getline(ss, rs);
12287 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12288 get_last_committed() + 1));
12289 return true;
12290 } else if (prefix == "osd blacklist") {
12291 string addrstr;
12292 cmd_getval(cmdmap, "addr", addrstr);
12293 entity_addr_t addr;
12294 if (!addr.parse(addrstr.c_str(), 0)) {
12295 ss << "unable to parse address " << addrstr;
12296 err = -EINVAL;
12297 goto reply;
12298 }
12299 else {
12300 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12301 // always blacklist type ANY
12302 addr.set_type(entity_addr_t::TYPE_ANY);
12303 } else {
12304 addr.set_type(entity_addr_t::TYPE_LEGACY);
12305 }
12306
12307 string blacklistop;
12308 cmd_getval(cmdmap, "blacklistop", blacklistop);
12309 if (blacklistop == "add") {
12310 utime_t expires = ceph_clock_now();
12311 double d;
12312 // default one hour
12313 cmd_getval(cmdmap, "expire", d,
12314 g_conf()->mon_osd_blacklist_default_expire);
12315 expires += d;
12316
12317 pending_inc.new_blacklist[addr] = expires;
12318
12319 {
12320 // cancel any pending un-blacklisting request too
12321 auto it = std::find(pending_inc.old_blacklist.begin(),
12322 pending_inc.old_blacklist.end(), addr);
12323 if (it != pending_inc.old_blacklist.end()) {
12324 pending_inc.old_blacklist.erase(it);
12325 }
12326 }
12327
12328 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12329 getline(ss, rs);
12330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12331 get_last_committed() + 1));
12332 return true;
12333 } else if (blacklistop == "rm") {
12334 if (osdmap.is_blacklisted(addr) ||
12335 pending_inc.new_blacklist.count(addr)) {
12336 if (osdmap.is_blacklisted(addr))
12337 pending_inc.old_blacklist.push_back(addr);
12338 else
12339 pending_inc.new_blacklist.erase(addr);
12340 ss << "un-blacklisting " << addr;
12341 getline(ss, rs);
12342 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12343 get_last_committed() + 1));
12344 return true;
12345 }
12346 ss << addr << " isn't blacklisted";
12347 err = 0;
12348 goto reply;
12349 }
12350 }
12351 } else if (prefix == "osd pool mksnap") {
12352 string poolstr;
12353 cmd_getval(cmdmap, "pool", poolstr);
12354 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12355 if (pool < 0) {
12356 ss << "unrecognized pool '" << poolstr << "'";
12357 err = -ENOENT;
12358 goto reply;
12359 }
12360 string snapname;
12361 cmd_getval(cmdmap, "snap", snapname);
12362 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12363 if (p->is_unmanaged_snaps_mode()) {
12364 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12365 err = -EINVAL;
12366 goto reply;
12367 } else if (p->snap_exists(snapname.c_str())) {
12368 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12369 err = 0;
12370 goto reply;
12371 } else if (p->is_tier()) {
12372 ss << "pool " << poolstr << " is a cache tier";
12373 err = -EINVAL;
12374 goto reply;
12375 }
12376 pg_pool_t *pp = 0;
12377 if (pending_inc.new_pools.count(pool))
12378 pp = &pending_inc.new_pools[pool];
12379 if (!pp) {
12380 pp = &pending_inc.new_pools[pool];
12381 *pp = *p;
12382 }
12383 if (pp->snap_exists(snapname.c_str())) {
12384 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12385 } else {
12386 pp->add_snap(snapname.c_str(), ceph_clock_now());
12387 pp->set_snap_epoch(pending_inc.epoch);
12388 ss << "created pool " << poolstr << " snap " << snapname;
12389 }
12390 getline(ss, rs);
12391 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12392 get_last_committed() + 1));
12393 return true;
12394 } else if (prefix == "osd pool rmsnap") {
12395 string poolstr;
12396 cmd_getval(cmdmap, "pool", poolstr);
12397 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12398 if (pool < 0) {
12399 ss << "unrecognized pool '" << poolstr << "'";
12400 err = -ENOENT;
12401 goto reply;
12402 }
12403 string snapname;
12404 cmd_getval(cmdmap, "snap", snapname);
12405 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12406 if (p->is_unmanaged_snaps_mode()) {
12407 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12408 err = -EINVAL;
12409 goto reply;
12410 } else if (!p->snap_exists(snapname.c_str())) {
12411 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12412 err = 0;
12413 goto reply;
12414 }
12415 pg_pool_t *pp = 0;
12416 if (pending_inc.new_pools.count(pool))
12417 pp = &pending_inc.new_pools[pool];
12418 if (!pp) {
12419 pp = &pending_inc.new_pools[pool];
12420 *pp = *p;
12421 }
12422 snapid_t sn = pp->snap_exists(snapname.c_str());
12423 if (sn) {
12424 pp->remove_snap(sn);
12425 pp->set_snap_epoch(pending_inc.epoch);
12426 ss << "removed pool " << poolstr << " snap " << snapname;
12427 } else {
12428 ss << "already removed pool " << poolstr << " snap " << snapname;
12429 }
12430 getline(ss, rs);
12431 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12432 get_last_committed() + 1));
12433 return true;
12434 } else if (prefix == "osd pool create") {
12435 int64_t pg_num, pg_num_min;
12436 int64_t pgp_num;
12437 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12438 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12439 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12440
12441 string pool_type_str;
12442 cmd_getval(cmdmap, "pool_type", pool_type_str);
12443 if (pool_type_str.empty())
12444 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12445
12446 string poolstr;
12447 cmd_getval(cmdmap, "pool", poolstr);
12448 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12449 if (pool_id >= 0) {
12450 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12451 if (pool_type_str != p->get_type_name()) {
12452 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12453 err = -EINVAL;
12454 } else {
12455 ss << "pool '" << poolstr << "' already exists";
12456 err = 0;
12457 }
12458 goto reply;
12459 }
12460
12461 int pool_type;
12462 if (pool_type_str == "replicated") {
12463 pool_type = pg_pool_t::TYPE_REPLICATED;
12464 } else if (pool_type_str == "erasure") {
12465 pool_type = pg_pool_t::TYPE_ERASURE;
12466 } else {
12467 ss << "unknown pool type '" << pool_type_str << "'";
12468 err = -EINVAL;
12469 goto reply;
12470 }
12471
12472 bool implicit_rule_creation = false;
12473 int64_t expected_num_objects = 0;
12474 string rule_name;
12475 cmd_getval(cmdmap, "rule", rule_name);
12476 string erasure_code_profile;
12477 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12478
12479 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12480 if (erasure_code_profile == "")
12481 erasure_code_profile = "default";
12482 //handle the erasure code profile
12483 if (erasure_code_profile == "default") {
12484 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12485 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12486 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12487 goto wait;
12488 }
12489
12490 map<string,string> profile_map;
12491 err = osdmap.get_erasure_code_profile_default(cct,
12492 profile_map,
12493 &ss);
12494 if (err)
12495 goto reply;
12496 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12497 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12498 goto wait;
12499 }
12500 }
12501 if (rule_name == "") {
12502 implicit_rule_creation = true;
12503 if (erasure_code_profile == "default") {
12504 rule_name = "erasure-code";
12505 } else {
12506 dout(1) << "implicitly use rule named after the pool: "
12507 << poolstr << dendl;
12508 rule_name = poolstr;
12509 }
12510 }
12511 cmd_getval(cmdmap, "expected_num_objects",
12512 expected_num_objects, int64_t(0));
12513 } else {
12514 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12515 // and put expected_num_objects to rule field
12516 if (erasure_code_profile != "") { // cmd is from CLI
12517 if (rule_name != "") {
12518 string interr;
12519 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12520 if (interr.length()) {
12521 ss << "error parsing integer value '" << rule_name << "': " << interr;
12522 err = -EINVAL;
12523 goto reply;
12524 }
12525 }
12526 rule_name = erasure_code_profile;
12527 } else { // cmd is well-formed
12528 cmd_getval(cmdmap, "expected_num_objects",
12529 expected_num_objects, int64_t(0));
12530 }
12531 }
12532
12533 if (!implicit_rule_creation && rule_name != "") {
12534 int rule;
12535 err = get_crush_rule(rule_name, &rule, &ss);
12536 if (err == -EAGAIN) {
12537 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12538 return true;
12539 }
12540 if (err)
12541 goto reply;
12542 }
12543
12544 if (expected_num_objects < 0) {
12545 ss << "'expected_num_objects' must be non-negative";
12546 err = -EINVAL;
12547 goto reply;
12548 }
12549
12550 if (expected_num_objects > 0 &&
12551 cct->_conf->osd_objectstore == "filestore" &&
12552 cct->_conf->filestore_merge_threshold > 0) {
12553 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12554 err = -EINVAL;
12555 goto reply;
12556 }
12557
12558 if (expected_num_objects == 0 &&
12559 cct->_conf->osd_objectstore == "filestore" &&
12560 cct->_conf->filestore_merge_threshold < 0) {
12561 int osds = osdmap.get_num_osds();
12562 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12563 ss << "For better initial performance on pools expected to store a "
12564 << "large number of objects, consider supplying the "
12565 << "expected_num_objects parameter when creating the pool.\n";
12566 }
12567 }
12568
12569 int64_t fast_read_param;
12570 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12571 FastReadType fast_read = FAST_READ_DEFAULT;
12572 if (fast_read_param == 0)
12573 fast_read = FAST_READ_OFF;
12574 else if (fast_read_param > 0)
12575 fast_read = FAST_READ_ON;
12576
12577 int64_t repl_size = 0;
12578 cmd_getval(cmdmap, "size", repl_size);
12579 int64_t target_size_bytes = 0;
12580 double target_size_ratio = 0.0;
12581 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12582 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12583
12584 string pg_autoscale_mode;
12585 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12586
12587 err = prepare_new_pool(poolstr,
12588 -1, // default crush rule
12589 rule_name,
12590 pg_num, pgp_num, pg_num_min,
12591 repl_size, target_size_bytes, target_size_ratio,
12592 erasure_code_profile, pool_type,
12593 (uint64_t)expected_num_objects,
12594 fast_read,
12595 pg_autoscale_mode,
12596 &ss);
12597 if (err < 0) {
12598 switch(err) {
12599 case -EEXIST:
12600 ss << "pool '" << poolstr << "' already exists";
12601 break;
12602 case -EAGAIN:
12603 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12604 return true;
12605 case -ERANGE:
12606 goto reply;
12607 default:
12608 goto reply;
12609 break;
12610 }
12611 } else {
12612 ss << "pool '" << poolstr << "' created";
12613 }
12614 getline(ss, rs);
12615 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12616 get_last_committed() + 1));
12617 return true;
12618
12619 } else if (prefix == "osd pool delete" ||
12620 prefix == "osd pool rm") {
12621 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12622 string poolstr, poolstr2, sure;
12623 cmd_getval(cmdmap, "pool", poolstr);
12624 cmd_getval(cmdmap, "pool2", poolstr2);
12625 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12626 if (pool < 0) {
12627 ss << "pool '" << poolstr << "' does not exist";
12628 err = 0;
12629 goto reply;
12630 }
12631
12632 bool force_no_fake = false;
12633 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12634 bool force = false;
12635 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12636 if (poolstr2 != poolstr ||
12637 (!force && !force_no_fake)) {
12638 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12639 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12640 << "followed by --yes-i-really-really-mean-it.";
12641 err = -EPERM;
12642 goto reply;
12643 }
12644 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12645 if (err == -EAGAIN) {
12646 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12647 return true;
12648 }
12649 if (err < 0)
12650 goto reply;
12651 goto update;
12652 } else if (prefix == "osd pool rename") {
12653 string srcpoolstr, destpoolstr;
12654 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12655 cmd_getval(cmdmap, "destpool", destpoolstr);
12656 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12657 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12658
12659 if (pool_src < 0) {
12660 if (pool_dst >= 0) {
12661 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12662 // of operations, assume this rename succeeded, as it is not changing
12663 // the current state. Make sure we output something understandable
12664 // for whoever is issuing the command, if they are paying attention,
12665 // in case it was not intentional; or to avoid a "wtf?" and a bug
12666 // report in case it was intentional, while expecting a failure.
12667 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12668 << destpoolstr << "' does -- assuming successful rename";
12669 err = 0;
12670 } else {
12671 ss << "unrecognized pool '" << srcpoolstr << "'";
12672 err = -ENOENT;
12673 }
12674 goto reply;
12675 } else if (pool_dst >= 0) {
12676 // source pool exists and so does the destination pool
12677 ss << "pool '" << destpoolstr << "' already exists";
12678 err = -EEXIST;
12679 goto reply;
12680 }
12681
12682 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12683 if (ret == 0) {
12684 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12685 } else {
12686 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12687 << cpp_strerror(ret);
12688 }
12689 getline(ss, rs);
12690 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12691 get_last_committed() + 1));
12692 return true;
12693
12694 } else if (prefix == "osd pool set") {
12695 err = prepare_command_pool_set(cmdmap, ss);
12696 if (err == -EAGAIN)
12697 goto wait;
12698 if (err < 0)
12699 goto reply;
12700
12701 getline(ss, rs);
12702 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12703 get_last_committed() + 1));
12704 return true;
12705 } else if (prefix == "osd tier add") {
12706 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12707 if (err == -EAGAIN)
12708 goto wait;
12709 if (err)
12710 goto reply;
12711 string poolstr;
12712 cmd_getval(cmdmap, "pool", poolstr);
12713 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12714 if (pool_id < 0) {
12715 ss << "unrecognized pool '" << poolstr << "'";
12716 err = -ENOENT;
12717 goto reply;
12718 }
12719 string tierpoolstr;
12720 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12721 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12722 if (tierpool_id < 0) {
12723 ss << "unrecognized pool '" << tierpoolstr << "'";
12724 err = -ENOENT;
12725 goto reply;
12726 }
12727 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12728 ceph_assert(p);
12729 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12730 ceph_assert(tp);
12731
12732 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12733 goto reply;
12734 }
12735
12736 // make sure new tier is empty
12737 string force_nonempty;
12738 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12739 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12740 if (pstats && pstats->stats.sum.num_objects != 0 &&
12741 force_nonempty != "--force-nonempty") {
12742 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12743 err = -ENOTEMPTY;
12744 goto reply;
12745 }
12746 if (tp->is_erasure()) {
12747 ss << "tier pool '" << tierpoolstr
12748 << "' is an ec pool, which cannot be a tier";
12749 err = -ENOTSUP;
12750 goto reply;
12751 }
12752 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12753 ((force_nonempty != "--force-nonempty") ||
12754 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12755 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12756 err = -ENOTEMPTY;
12757 goto reply;
12758 }
12759 // go
12760 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12761 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12762 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12763 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12764 return true;
12765 }
12766 np->tiers.insert(tierpool_id);
12767 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12768 ntp->tier_of = pool_id;
12769 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12770 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12771 get_last_committed() + 1));
12772 return true;
12773 } else if (prefix == "osd tier remove" ||
12774 prefix == "osd tier rm") {
12775 string poolstr;
12776 cmd_getval(cmdmap, "pool", poolstr);
12777 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12778 if (pool_id < 0) {
12779 ss << "unrecognized pool '" << poolstr << "'";
12780 err = -ENOENT;
12781 goto reply;
12782 }
12783 string tierpoolstr;
12784 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12785 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12786 if (tierpool_id < 0) {
12787 ss << "unrecognized pool '" << tierpoolstr << "'";
12788 err = -ENOENT;
12789 goto reply;
12790 }
12791 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12792 ceph_assert(p);
12793 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12794 ceph_assert(tp);
12795
12796 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12797 goto reply;
12798 }
12799
12800 if (p->tiers.count(tierpool_id) == 0) {
12801 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12802 err = 0;
12803 goto reply;
12804 }
12805 if (tp->tier_of != pool_id) {
12806 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12807 << osdmap.get_pool_name(tp->tier_of) << "': "
12808 // be scary about it; this is an inconsistency and bells must go off
12809 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12810 err = -EINVAL;
12811 goto reply;
12812 }
12813 if (p->read_tier == tierpool_id) {
12814 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12815 err = -EBUSY;
12816 goto reply;
12817 }
12818 // go
12819 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12820 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12821 if (np->tiers.count(tierpool_id) == 0 ||
12822 ntp->tier_of != pool_id ||
12823 np->read_tier == tierpool_id) {
12824 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12825 return true;
12826 }
12827 np->tiers.erase(tierpool_id);
12828 ntp->clear_tier();
12829 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12830 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12831 get_last_committed() + 1));
12832 return true;
12833 } else if (prefix == "osd tier set-overlay") {
12834 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12835 if (err == -EAGAIN)
12836 goto wait;
12837 if (err)
12838 goto reply;
12839 string poolstr;
12840 cmd_getval(cmdmap, "pool", poolstr);
12841 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12842 if (pool_id < 0) {
12843 ss << "unrecognized pool '" << poolstr << "'";
12844 err = -ENOENT;
12845 goto reply;
12846 }
12847 string overlaypoolstr;
12848 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12849 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12850 if (overlaypool_id < 0) {
12851 ss << "unrecognized pool '" << overlaypoolstr << "'";
12852 err = -ENOENT;
12853 goto reply;
12854 }
12855 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12856 ceph_assert(p);
12857 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12858 ceph_assert(overlay_p);
12859 if (p->tiers.count(overlaypool_id) == 0) {
12860 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12861 err = -EINVAL;
12862 goto reply;
12863 }
12864 if (p->read_tier == overlaypool_id) {
12865 err = 0;
12866 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12867 goto reply;
12868 }
12869 if (p->has_read_tier()) {
12870 ss << "pool '" << poolstr << "' has overlay '"
12871 << osdmap.get_pool_name(p->read_tier)
12872 << "'; please remove-overlay first";
12873 err = -EINVAL;
12874 goto reply;
12875 }
12876
12877 // go
12878 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12879 np->read_tier = overlaypool_id;
12880 np->write_tier = overlaypool_id;
12881 np->set_last_force_op_resend(pending_inc.epoch);
12882 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12883 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12884 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12885 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12886 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12887 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12888 get_last_committed() + 1));
12889 return true;
12890 } else if (prefix == "osd tier remove-overlay" ||
12891 prefix == "osd tier rm-overlay") {
12892 string poolstr;
12893 cmd_getval(cmdmap, "pool", poolstr);
12894 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12895 if (pool_id < 0) {
12896 ss << "unrecognized pool '" << poolstr << "'";
12897 err = -ENOENT;
12898 goto reply;
12899 }
12900 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12901 ceph_assert(p);
12902 if (!p->has_read_tier()) {
12903 err = 0;
12904 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12905 goto reply;
12906 }
12907
12908 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12909 goto reply;
12910 }
12911
12912 // go
12913 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12914 if (np->has_read_tier()) {
12915 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12916 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12917 nop->set_last_force_op_resend(pending_inc.epoch);
12918 }
12919 if (np->has_write_tier()) {
12920 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12921 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12922 nop->set_last_force_op_resend(pending_inc.epoch);
12923 }
12924 np->clear_read_tier();
12925 np->clear_write_tier();
12926 np->set_last_force_op_resend(pending_inc.epoch);
12927 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12928 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12929 get_last_committed() + 1));
12930 return true;
12931 } else if (prefix == "osd tier cache-mode") {
12932 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12933 if (err == -EAGAIN)
12934 goto wait;
12935 if (err)
12936 goto reply;
12937 string poolstr;
12938 cmd_getval(cmdmap, "pool", poolstr);
12939 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12940 if (pool_id < 0) {
12941 ss << "unrecognized pool '" << poolstr << "'";
12942 err = -ENOENT;
12943 goto reply;
12944 }
12945 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12946 ceph_assert(p);
12947 if (!p->is_tier()) {
12948 ss << "pool '" << poolstr << "' is not a tier";
12949 err = -EINVAL;
12950 goto reply;
12951 }
12952 string modestr;
12953 cmd_getval(cmdmap, "mode", modestr);
12954 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12955 if (int(mode) < 0) {
12956 ss << "'" << modestr << "' is not a valid cache mode";
12957 err = -EINVAL;
12958 goto reply;
12959 }
12960
12961 bool sure = false;
12962 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12963
12964 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
12965 mode == pg_pool_t::CACHEMODE_READFORWARD) {
12966 ss << "'" << modestr << "' is no longer a supported cache mode";
12967 err = -EPERM;
12968 goto reply;
12969 }
12970 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12971 mode != pg_pool_t::CACHEMODE_NONE &&
12972 mode != pg_pool_t::CACHEMODE_PROXY &&
12973 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12974 !sure) {
12975 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12976 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12977 err = -EPERM;
12978 goto reply;
12979 }
12980
12981 // pool already has this cache-mode set and there are no pending changes
12982 if (p->cache_mode == mode &&
12983 (pending_inc.new_pools.count(pool_id) == 0 ||
12984 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12985 ss << "set cache-mode for pool '" << poolstr << "'"
12986 << " to " << pg_pool_t::get_cache_mode_name(mode);
12987 err = 0;
12988 goto reply;
12989 }
12990
12991 /* Mode description:
12992 *
12993 * none: No cache-mode defined
12994 * forward: Forward all reads and writes to base pool [removed]
12995 * writeback: Cache writes, promote reads from base pool
12996 * readonly: Forward writes to base pool
12997 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
12998 * proxy: Proxy all reads and writes to base pool
12999 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13000 *
13001 * Hence, these are the allowed transitions:
13002 *
13003 * none -> any
13004 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13005 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13006 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13007 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13008 * writeback -> readproxy || proxy
13009 * readonly -> any
13010 */
13011
13012 // We check if the transition is valid against the current pool mode, as
13013 // it is the only committed state thus far. We will blantly squash
13014 // whatever mode is on the pending state.
13015
13016 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13017 (mode != pg_pool_t::CACHEMODE_PROXY &&
13018 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13019 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13020 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13021 << "' pool; only '"
13022 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
13023 << "','"
13024 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13025 << "' allowed.";
13026 err = -EINVAL;
13027 goto reply;
13028 }
13029 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13030 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13031 mode != pg_pool_t::CACHEMODE_PROXY &&
13032 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13033
13034 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13035 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13036 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13037
13038 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13039 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13040 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13041
13042 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13043 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13044 mode != pg_pool_t::CACHEMODE_PROXY &&
13045 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13046
13047 const pool_stat_t* pstats =
13048 mon->mgrstatmon()->get_pool_stat(pool_id);
13049
13050 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13051 ss << "unable to set cache-mode '"
13052 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13053 << "': dirty objects found";
13054 err = -EBUSY;
13055 goto reply;
13056 }
13057 }
13058 // go
13059 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13060 np->cache_mode = mode;
13061 // set this both when moving to and from cache_mode NONE. this is to
13062 // capture legacy pools that were set up before this flag existed.
13063 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13064 ss << "set cache-mode for pool '" << poolstr
13065 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13066 if (mode == pg_pool_t::CACHEMODE_NONE) {
13067 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13068 ceph_assert(base_pool);
13069 if (base_pool->read_tier == pool_id ||
13070 base_pool->write_tier == pool_id)
13071 ss <<" (WARNING: pool is still configured as read or write tier)";
13072 }
13073 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13074 get_last_committed() + 1));
13075 return true;
13076 } else if (prefix == "osd tier add-cache") {
13077 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13078 if (err == -EAGAIN)
13079 goto wait;
13080 if (err)
13081 goto reply;
13082 string poolstr;
13083 cmd_getval(cmdmap, "pool", poolstr);
13084 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13085 if (pool_id < 0) {
13086 ss << "unrecognized pool '" << poolstr << "'";
13087 err = -ENOENT;
13088 goto reply;
13089 }
13090 string tierpoolstr;
13091 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13092 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13093 if (tierpool_id < 0) {
13094 ss << "unrecognized pool '" << tierpoolstr << "'";
13095 err = -ENOENT;
13096 goto reply;
13097 }
13098 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13099 ceph_assert(p);
13100 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13101 ceph_assert(tp);
13102
13103 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13104 goto reply;
13105 }
13106
13107 int64_t size = 0;
13108 if (!cmd_getval(cmdmap, "size", size)) {
13109 ss << "unable to parse 'size' value '"
13110 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13111 err = -EINVAL;
13112 goto reply;
13113 }
13114 // make sure new tier is empty
13115 const pool_stat_t *pstats =
13116 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13117 if (pstats && pstats->stats.sum.num_objects != 0) {
13118 ss << "tier pool '" << tierpoolstr << "' is not empty";
13119 err = -ENOTEMPTY;
13120 goto reply;
13121 }
13122 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13123 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13124 if (int(mode) < 0) {
13125 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13126 err = -EINVAL;
13127 goto reply;
13128 }
13129 HitSet::Params hsp;
13130 auto& cache_hit_set_type =
13131 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13132 if (cache_hit_set_type == "bloom") {
13133 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13134 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13135 hsp = HitSet::Params(bsp);
13136 } else if (cache_hit_set_type == "explicit_hash") {
13137 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13138 } else if (cache_hit_set_type == "explicit_object") {
13139 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13140 } else {
13141 ss << "osd tier cache default hit set type '"
13142 << cache_hit_set_type << "' is not a known type";
13143 err = -EINVAL;
13144 goto reply;
13145 }
13146 // go
13147 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13148 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13149 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13150 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13151 return true;
13152 }
13153 np->tiers.insert(tierpool_id);
13154 np->read_tier = np->write_tier = tierpool_id;
13155 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13156 np->set_last_force_op_resend(pending_inc.epoch);
13157 ntp->set_last_force_op_resend(pending_inc.epoch);
13158 ntp->tier_of = pool_id;
13159 ntp->cache_mode = mode;
13160 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13161 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13162 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13163 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13164 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13165 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13166 ntp->hit_set_params = hsp;
13167 ntp->target_max_bytes = size;
13168 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13169 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13170 get_last_committed() + 1));
13171 return true;
13172 } else if (prefix == "osd pool set-quota") {
13173 string poolstr;
13174 cmd_getval(cmdmap, "pool", poolstr);
13175 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13176 if (pool_id < 0) {
13177 ss << "unrecognized pool '" << poolstr << "'";
13178 err = -ENOENT;
13179 goto reply;
13180 }
13181
13182 string field;
13183 cmd_getval(cmdmap, "field", field);
13184 if (field != "max_objects" && field != "max_bytes") {
13185 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13186 err = -EINVAL;
13187 goto reply;
13188 }
13189
13190 // val could contain unit designations, so we treat as a string
13191 string val;
13192 cmd_getval(cmdmap, "val", val);
13193 string tss;
13194 int64_t value;
13195 if (field == "max_objects") {
13196 value = strict_sistrtoll(val.c_str(), &tss);
13197 } else if (field == "max_bytes") {
13198 value = strict_iecstrtoll(val.c_str(), &tss);
13199 } else {
13200 ceph_abort_msg("unrecognized option");
13201 }
13202 if (!tss.empty()) {
13203 ss << "error parsing value '" << val << "': " << tss;
13204 err = -EINVAL;
13205 goto reply;
13206 }
13207
13208 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13209 if (field == "max_objects") {
13210 pi->quota_max_objects = value;
13211 } else if (field == "max_bytes") {
13212 pi->quota_max_bytes = value;
13213 } else {
13214 ceph_abort_msg("unrecognized option");
13215 }
13216 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13217 rs = ss.str();
13218 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13219 get_last_committed() + 1));
13220 return true;
13221 } else if (prefix == "osd pool application enable" ||
13222 prefix == "osd pool application disable" ||
13223 prefix == "osd pool application set" ||
13224 prefix == "osd pool application rm") {
13225 err = prepare_command_pool_application(prefix, cmdmap, ss);
13226 if (err == -EAGAIN) {
13227 goto wait;
13228 } else if (err < 0) {
13229 goto reply;
13230 } else {
13231 goto update;
13232 }
13233 } else if (prefix == "osd force-create-pg") {
13234 pg_t pgid;
13235 string pgidstr;
13236 cmd_getval(cmdmap, "pgid", pgidstr);
13237 if (!pgid.parse(pgidstr.c_str())) {
13238 ss << "invalid pgid '" << pgidstr << "'";
13239 err = -EINVAL;
13240 goto reply;
13241 }
13242 if (!osdmap.pg_exists(pgid)) {
13243 ss << "pg " << pgid << " should not exist";
13244 err = -ENOENT;
13245 goto reply;
13246 }
13247 bool sure = false;
13248 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13249 if (!sure) {
13250 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13251 << "that the cluster will give up ever trying to recover the lost data. Do this "
13252 << "only if you are certain that all copies of the PG are in fact lost and you are "
13253 << "willing to accept that the data is permanently destroyed. Pass "
13254 << "--yes-i-really-mean-it to proceed.";
13255 err = -EPERM;
13256 goto reply;
13257 }
13258 bool creating_now;
13259 {
13260 std::lock_guard<std::mutex> l(creating_pgs_lock);
13261 auto emplaced = creating_pgs.pgs.emplace(
13262 pgid,
13263 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13264 ceph_clock_now()));
13265 creating_now = emplaced.second;
13266 }
13267 if (creating_now) {
13268 ss << "pg " << pgidstr << " now creating, ok";
13269 // set the pool's CREATING flag so that (1) the osd won't ignore our
13270 // create message and (2) we won't propose any future pg_num changes
13271 // until after the PG has been instantiated.
13272 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13273 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13274 }
13275 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13276 err = 0;
13277 goto update;
13278 } else {
13279 ss << "pg " << pgid << " already creating";
13280 err = 0;
13281 goto reply;
13282 }
13283 } else {
13284 err = -EINVAL;
13285 }
13286
13287 reply:
13288 getline(ss, rs);
13289 if (err < 0 && rs.length() == 0)
13290 rs = cpp_strerror(err);
13291 mon->reply_command(op, err, rs, rdata, get_last_committed());
13292 return ret;
13293
13294 update:
13295 getline(ss, rs);
13296 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13297 get_last_committed() + 1));
13298 return true;
13299
13300 wait:
13301 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13302 return true;
13303 }
13304
13305 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13306 {
13307 op->mark_osdmon_event(__func__);
13308
13309 auto m = op->get_req<MPoolOp>();
13310 MonSession *session = op->get_session();
13311 if (!session) {
13312 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13313 return true;
13314 }
13315
13316 switch (m->op) {
13317 case POOL_OP_CREATE_UNMANAGED_SNAP:
13318 case POOL_OP_DELETE_UNMANAGED_SNAP:
13319 {
13320 const std::string* pool_name = nullptr;
13321 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13322 if (pg_pool != nullptr) {
13323 pool_name = &osdmap.get_pool_name(m->pool);
13324 }
13325
13326 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13327 session->entity_name, session->caps,
13328 session->get_peer_socket_addr(),
13329 pool_name)) {
13330 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13331 << "privileges. message: " << *m << std::endl
13332 << "caps: " << session->caps << dendl;
13333 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13334 return true;
13335 }
13336 }
13337 break;
13338 default:
13339 if (!session->is_capable("osd", MON_CAP_W)) {
13340 dout(0) << "got pool op from entity with insufficient privileges. "
13341 << "message: " << *m << std::endl
13342 << "caps: " << session->caps << dendl;
13343 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13344 return true;
13345 }
13346 break;
13347 }
13348
13349 return false;
13350 }
13351
// Fast path for MPoolOp requests: answer everything that does not need a
// map change (foreign fsid, idempotent re-requests, invalid mode
// combinations) directly from the committed osdmap.  Returns true if the
// op was fully handled (a reply was sent); returns false to let the op
// proceed to prepare_pool_op() and a paxos proposal.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // capability check; when it returns true it has already replied -EPERM
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop messages that belong to a different cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    // deleting a pool that is already gone is a success (idempotent)
    if (m->op == POOL_OP_DELETE) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps cannot coexist with unmanaged-snaps mode, nor on a tier
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // snap already present: idempotent success, no proposal needed
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // unmanaged snaps cannot coexist with pool-snaps mode
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // snap already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    // already removed/purged in the committed map: idempotent success
    if (_is_removed_snap(m->pool, m->snapid)) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with name m->name
    // still exists; presumably m->name is empty on the normal delete
    // path so the branch is not taken -- confirm against the sender.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // handled (rejected) in prepare_pool_op
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13439
13440 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13441 {
13442 if (!osdmap.have_pg_pool(pool)) {
13443 dout(10) << __func__ << " pool " << pool << " snap " << snap
13444 << " - pool dne" << dendl;
13445 return true;
13446 }
13447 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13448 dout(10) << __func__ << " pool " << pool << " snap " << snap
13449 << " - in osdmap removed_snaps_queue" << dendl;
13450 return true;
13451 }
13452 snapid_t begin, end;
13453 int r = lookup_purged_snap(pool, snap, &begin, &end);
13454 if (r == 0) {
13455 dout(10) << __func__ << " pool " << pool << " snap " << snap
13456 << " - purged, [" << begin << "," << end << ")" << dendl;
13457 return true;
13458 }
13459 return false;
13460 }
13461
13462 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13463 {
13464 if (pending_inc.old_pools.count(pool)) {
13465 dout(10) << __func__ << " pool " << pool << " snap " << snap
13466 << " - pool pending deletion" << dendl;
13467 return true;
13468 }
13469 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13470 dout(10) << __func__ << " pool " << pool << " snap " << snap
13471 << " - in pending new_removed_snaps" << dendl;
13472 return true;
13473 }
13474 return false;
13475 }
13476
13477 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13478 {
13479 op->mark_osdmon_event(__func__);
13480 auto m = op->get_req<MPoolOp>();
13481 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13482 if (pool >= 0) {
13483 _pool_op_reply(op, 0, osdmap.get_epoch());
13484 return true;
13485 }
13486
13487 return false;
13488 }
13489
13490 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
13491 {
13492 op->mark_osdmon_event(__func__);
13493 auto m = op->get_req<MPoolOp>();
13494 dout(10) << "prepare_pool_op " << *m << dendl;
13495 if (m->op == POOL_OP_CREATE) {
13496 return prepare_pool_op_create(op);
13497 } else if (m->op == POOL_OP_DELETE) {
13498 return prepare_pool_op_delete(op);
13499 }
13500
13501 int ret = 0;
13502 bool changed = false;
13503
13504 if (!osdmap.have_pg_pool(m->pool)) {
13505 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13506 return false;
13507 }
13508
13509 const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
13510
13511 switch (m->op) {
13512 case POOL_OP_CREATE_SNAP:
13513 if (pool->is_tier()) {
13514 ret = -EINVAL;
13515 _pool_op_reply(op, ret, osdmap.get_epoch());
13516 return false;
13517 } // else, fall through
13518 case POOL_OP_DELETE_SNAP:
13519 if (!pool->is_unmanaged_snaps_mode()) {
13520 bool snap_exists = pool->snap_exists(m->name.c_str());
13521 if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
13522 || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
13523 ret = 0;
13524 } else {
13525 break;
13526 }
13527 } else {
13528 ret = -EINVAL;
13529 }
13530 _pool_op_reply(op, ret, osdmap.get_epoch());
13531 return false;
13532
13533 case POOL_OP_DELETE_UNMANAGED_SNAP:
13534 // we won't allow removal of an unmanaged snapshot from a pool
13535 // not in unmanaged snaps mode.
13536 if (!pool->is_unmanaged_snaps_mode()) {
13537 _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
13538 return false;
13539 }
13540 /* fall-thru */
13541 case POOL_OP_CREATE_UNMANAGED_SNAP:
13542 // but we will allow creating an unmanaged snapshot on any pool
13543 // as long as it is not in 'pool' snaps mode.
13544 if (pool->is_pool_snaps_mode()) {
13545 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13546 return false;
13547 }
13548 }
13549
13550 // projected pool info
13551 pg_pool_t pp;
13552 if (pending_inc.new_pools.count(m->pool))
13553 pp = pending_inc.new_pools[m->pool];
13554 else
13555 pp = *osdmap.get_pg_pool(m->pool);
13556
13557 bufferlist reply_data;
13558
13559 // pool snaps vs unmanaged snaps are mutually exclusive
13560 switch (m->op) {
13561 case POOL_OP_CREATE_SNAP:
13562 case POOL_OP_DELETE_SNAP:
13563 if (pp.is_unmanaged_snaps_mode()) {
13564 ret = -EINVAL;
13565 goto out;
13566 }
13567 break;
13568
13569 case POOL_OP_CREATE_UNMANAGED_SNAP:
13570 case POOL_OP_DELETE_UNMANAGED_SNAP:
13571 if (pp.is_pool_snaps_mode()) {
13572 ret = -EINVAL;
13573 goto out;
13574 }
13575 }
13576
13577 switch (m->op) {
13578 case POOL_OP_CREATE_SNAP:
13579 if (!pp.snap_exists(m->name.c_str())) {
13580 pp.add_snap(m->name.c_str(), ceph_clock_now());
13581 dout(10) << "create snap in pool " << m->pool << " " << m->name
13582 << " seq " << pp.get_snap_epoch() << dendl;
13583 changed = true;
13584 }
13585 break;
13586
13587 case POOL_OP_DELETE_SNAP:
13588 {
13589 snapid_t s = pp.snap_exists(m->name.c_str());
13590 if (s) {
13591 pp.remove_snap(s);
13592 pending_inc.new_removed_snaps[m->pool].insert(s);
13593 changed = true;
13594 }
13595 }
13596 break;
13597
13598 case POOL_OP_CREATE_UNMANAGED_SNAP:
13599 {
13600 uint64_t snapid = pp.add_unmanaged_snap(
13601 osdmap.require_osd_release < ceph_release_t::octopus);
13602 encode(snapid, reply_data);
13603 changed = true;
13604 }
13605 break;
13606
13607 case POOL_OP_DELETE_UNMANAGED_SNAP:
13608 if (!_is_removed_snap(m->pool, m->snapid) &&
13609 !_is_pending_removed_snap(m->pool, m->snapid)) {
13610 if (m->snapid > pp.get_snap_seq()) {
13611 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13612 return false;
13613 }
13614 pp.remove_unmanaged_snap(
13615 m->snapid,
13616 osdmap.require_osd_release < ceph_release_t::octopus);
13617 pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
13618 // also record the new seq as purged: this avoids a discontinuity
13619 // after all of the snaps have been purged, since the seq assigned
13620 // during removal lives in the same namespace as the actual snaps.
13621 pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
13622 changed = true;
13623 }
13624 break;
13625
13626 case POOL_OP_AUID_CHANGE:
13627 _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
13628 return false;
13629
13630 default:
13631 ceph_abort();
13632 break;
13633 }
13634
13635 if (changed) {
13636 pp.set_snap_epoch(pending_inc.epoch);
13637 pending_inc.new_pools[m->pool] = pp;
13638 }
13639
13640 out:
13641 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
13642 return true;
13643 }
13644
13645 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13646 {
13647 op->mark_osdmon_event(__func__);
13648 int err = prepare_new_pool(op);
13649 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13650 return true;
13651 }
13652
13653 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13654 ostream *ss)
13655 {
13656 const string& poolstr = osdmap.get_pool_name(pool_id);
13657
13658 // If the Pool is in use by CephFS, refuse to delete it
13659 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13660 if (pending_fsmap.pool_in_use(pool_id)) {
13661 *ss << "pool '" << poolstr << "' is in use by CephFS";
13662 return -EBUSY;
13663 }
13664
13665 if (pool.tier_of >= 0) {
13666 *ss << "pool '" << poolstr << "' is a tier of '"
13667 << osdmap.get_pool_name(pool.tier_of) << "'";
13668 return -EBUSY;
13669 }
13670 if (!pool.tiers.empty()) {
13671 *ss << "pool '" << poolstr << "' has tiers";
13672 for(auto tier : pool.tiers) {
13673 *ss << " " << osdmap.get_pool_name(tier);
13674 }
13675 return -EBUSY;
13676 }
13677
13678 if (!g_conf()->mon_allow_pool_delete) {
13679 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13680 return -EPERM;
13681 }
13682
13683 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13684 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13685 return -EPERM;
13686 }
13687
13688 *ss << "pool '" << poolstr << "' removed";
13689 return 0;
13690 }
13691
13692 /**
13693 * Check if it is safe to add a tier to a base pool
13694 *
13695 * @return
13696 * True if the operation should proceed, false if we should abort here
13697 * (abort doesn't necessarily mean error, could be idempotency)
13698 */
13699 bool OSDMonitor::_check_become_tier(
13700 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13701 const int64_t base_pool_id, const pg_pool_t *base_pool,
13702 int *err,
13703 ostream *ss) const
13704 {
13705 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13706 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13707
13708 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13709 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13710 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13711 *err = -EBUSY;
13712 return false;
13713 }
13714
13715 if (base_pool->tiers.count(tier_pool_id)) {
13716 ceph_assert(tier_pool->tier_of == base_pool_id);
13717 *err = 0;
13718 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13719 << base_pool_name << "'";
13720 return false;
13721 }
13722
13723 if (base_pool->is_tier()) {
13724 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13725 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13726 << "multiple tiers are not yet supported.";
13727 *err = -EINVAL;
13728 return false;
13729 }
13730
13731 if (tier_pool->has_tiers()) {
13732 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13733 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13734 it != tier_pool->tiers.end(); ++it)
13735 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13736 *ss << " multiple tiers are not yet supported.";
13737 *err = -EINVAL;
13738 return false;
13739 }
13740
13741 if (tier_pool->is_tier()) {
13742 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13743 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13744 *err = -EINVAL;
13745 return false;
13746 }
13747
13748 *err = 0;
13749 return true;
13750 }
13751
13752
13753 /**
13754 * Check if it is safe to remove a tier from this base pool
13755 *
13756 * @return
13757 * True if the operation should proceed, false if we should abort here
13758 * (abort doesn't necessarily mean error, could be idempotency)
13759 */
13760 bool OSDMonitor::_check_remove_tier(
13761 const int64_t base_pool_id, const pg_pool_t *base_pool,
13762 const pg_pool_t *tier_pool,
13763 int *err, ostream *ss) const
13764 {
13765 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13766
13767 // Apply CephFS-specific checks
13768 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13769 if (pending_fsmap.pool_in_use(base_pool_id)) {
13770 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13771 // If the underlying pool is erasure coded and does not allow EC
13772 // overwrites, we can't permit the removal of the replicated tier that
13773 // CephFS relies on to access it
13774 *ss << "pool '" << base_pool_name <<
13775 "' does not allow EC overwrites and is in use by CephFS"
13776 " via its tier";
13777 *err = -EBUSY;
13778 return false;
13779 }
13780
13781 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13782 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13783 "tier is still in use as a writeback cache. Change the cache "
13784 "mode and flush the cache before removing it";
13785 *err = -EBUSY;
13786 return false;
13787 }
13788 }
13789
13790 *err = 0;
13791 return true;
13792 }
13793
13794 int OSDMonitor::_prepare_remove_pool(
13795 int64_t pool, ostream *ss, bool no_fake)
13796 {
13797 dout(10) << __func__ << " " << pool << dendl;
13798 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13799 int r = _check_remove_pool(pool, *p, ss);
13800 if (r < 0)
13801 return r;
13802
13803 auto new_pool = pending_inc.new_pools.find(pool);
13804 if (new_pool != pending_inc.new_pools.end()) {
13805 // if there is a problem with the pending info, wait and retry
13806 // this op.
13807 const auto& p = new_pool->second;
13808 int r = _check_remove_pool(pool, p, ss);
13809 if (r < 0)
13810 return -EAGAIN;
13811 }
13812
13813 if (pending_inc.old_pools.count(pool)) {
13814 dout(10) << __func__ << " " << pool << " already pending removal"
13815 << dendl;
13816 return 0;
13817 }
13818
13819 if (g_conf()->mon_fake_pool_delete && !no_fake) {
13820 string old_name = osdmap.get_pool_name(pool);
13821 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13822 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13823 << old_name << " -> " << new_name << dendl;
13824 pending_inc.new_pool_names[pool] = new_name;
13825 return 0;
13826 }
13827
13828 // remove
13829 pending_inc.old_pools.insert(pool);
13830
13831 // remove any pg_temp mappings for this pool
13832 for (auto p = osdmap.pg_temp->begin();
13833 p != osdmap.pg_temp->end();
13834 ++p) {
13835 if (p->first.pool() == pool) {
13836 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
13837 << p->first << dendl;
13838 pending_inc.new_pg_temp[p->first].clear();
13839 }
13840 }
13841 // remove any primary_temp mappings for this pool
13842 for (auto p = osdmap.primary_temp->begin();
13843 p != osdmap.primary_temp->end();
13844 ++p) {
13845 if (p->first.pool() == pool) {
13846 dout(10) << __func__ << " " << pool
13847 << " removing obsolete primary_temp" << p->first << dendl;
13848 pending_inc.new_primary_temp[p->first] = -1;
13849 }
13850 }
13851 // remove any pg_upmap mappings for this pool
13852 for (auto& p : osdmap.pg_upmap) {
13853 if (p.first.pool() == pool) {
13854 dout(10) << __func__ << " " << pool
13855 << " removing obsolete pg_upmap "
13856 << p.first << dendl;
13857 pending_inc.old_pg_upmap.insert(p.first);
13858 }
13859 }
13860 // remove any pending pg_upmap mappings for this pool
13861 {
13862 auto it = pending_inc.new_pg_upmap.begin();
13863 while (it != pending_inc.new_pg_upmap.end()) {
13864 if (it->first.pool() == pool) {
13865 dout(10) << __func__ << " " << pool
13866 << " removing pending pg_upmap "
13867 << it->first << dendl;
13868 it = pending_inc.new_pg_upmap.erase(it);
13869 } else {
13870 it++;
13871 }
13872 }
13873 }
13874 // remove any pg_upmap_items mappings for this pool
13875 for (auto& p : osdmap.pg_upmap_items) {
13876 if (p.first.pool() == pool) {
13877 dout(10) << __func__ << " " << pool
13878 << " removing obsolete pg_upmap_items " << p.first
13879 << dendl;
13880 pending_inc.old_pg_upmap_items.insert(p.first);
13881 }
13882 }
13883 // remove any pending pg_upmap mappings for this pool
13884 {
13885 auto it = pending_inc.new_pg_upmap_items.begin();
13886 while (it != pending_inc.new_pg_upmap_items.end()) {
13887 if (it->first.pool() == pool) {
13888 dout(10) << __func__ << " " << pool
13889 << " removing pending pg_upmap_items "
13890 << it->first << dendl;
13891 it = pending_inc.new_pg_upmap_items.erase(it);
13892 } else {
13893 it++;
13894 }
13895 }
13896 }
13897
13898 // remove any choose_args for this pool
13899 CrushWrapper newcrush;
13900 _get_pending_crush(newcrush);
13901 if (newcrush.have_choose_args(pool)) {
13902 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13903 newcrush.rm_choose_args(pool);
13904 pending_inc.crush.clear();
13905 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13906 }
13907 return 0;
13908 }
13909
13910 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13911 {
13912 dout(10) << "_prepare_rename_pool " << pool << dendl;
13913 if (pending_inc.old_pools.count(pool)) {
13914 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13915 return -ENOENT;
13916 }
13917 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13918 p != pending_inc.new_pool_names.end();
13919 ++p) {
13920 if (p->second == newname && p->first != pool) {
13921 return -EEXIST;
13922 }
13923 }
13924
13925 pending_inc.new_pool_names[pool] = newname;
13926 return 0;
13927 }
13928
13929 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13930 {
13931 op->mark_osdmon_event(__func__);
13932 auto m = op->get_req<MPoolOp>();
13933 ostringstream ss;
13934 int ret = _prepare_remove_pool(m->pool, &ss, false);
13935 if (ret == -EAGAIN) {
13936 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13937 return true;
13938 }
13939 if (ret < 0)
13940 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13941 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13942 pending_inc.epoch));
13943 return true;
13944 }
13945
13946 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13947 int ret, epoch_t epoch, bufferlist *blp)
13948 {
13949 op->mark_osdmon_event(__func__);
13950 auto m = op->get_req<MPoolOp>();
13951 dout(20) << "_pool_op_reply " << ret << dendl;
13952 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13953 ret, epoch, get_last_committed(), blp);
13954 mon->send_reply(op, reply);
13955 }
13956
// Rescale every pool's "recovery_priority" option so all values fit within
// [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX].  Positive and negative
// priorities are rescaled independently, each proportionally to the current
// extreme on its side of zero; pools whose side is already in range are
// left untouched.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // First pass: find the extreme priorities currently set on any pool.
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio = 0;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  // Nothing exceeds the allowed range on either side: no rewrite needed.
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // Second pass: rewrite each affected pool into pending_inc.
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    // prio is pre-initialized to 0; presumably opts.get() leaves it
    // unchanged when the option is unset — TODO confirm pool_opts_t::get
    int64_t prio = 0;
    pool.opts.get(key, &prio);
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      // (float result truncates toward zero on assignment to int64_t)
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      // prio == 0, or this side of zero is already within range
      continue;
    }
    if (n == 0) {
      // scaled down to nothing: drop the option rather than store 0
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}