]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 #define dout_subsys ceph_subsys_mon
95 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
96 static const string OSD_METADATA_PREFIX("osd_metadata");
97 static const string OSD_SNAP_PREFIX("osd_snap");
98
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
- note that the {removed,purged}_snap put the last snap in the key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
// Base adapter that exposes one of the OSDMonitor's osdmap LRU caches
// (incremental or full) to the PriorityCache manager, so the cache's
// memory budget can be tuned centrally alongside rocksdb.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;   // monitor owning the underlying LRU (non-owning ptr)
  // bytes assigned to this cache, indexed by PriorityCache priority level
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // size granted by the last commit_cache_size()
  double cache_ratio = 0;       // this cache's share of the tunable memory pool

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // bytes actually used by the underlying LRU; supplied by subclasses
  virtual uint64_t _get_used_bytes() const = 0;

  // How many *additional* bytes this cache wants at priority 'pri'
  // beyond what has already been assigned.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    // no other priority levels are supported
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // total bytes assigned across all priority levels
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the current assignment up to a chunk of the total cache and
  // remember it as the committed size.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  virtual string get_cache_name() const = 0;
};
196
// PriorityCache adapter for the incremental-osdmap LRU (inc_osd_cache).
struct IncCache : public OSDMemCache {
  IncCache(OSDMonitor *m) : OSDMemCache(m) {};

  // bytes currently held by the incremental-map LRU
  virtual uint64_t _get_used_bytes() const {
    return osdmon->inc_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Inc Cache";
  }

  // number of incremental maps currently cached
  uint64_t _get_num_osdmaps() const {
    return osdmon->inc_osd_cache.get_size();
  }
};
212
// PriorityCache adapter for the full-osdmap LRU (full_osd_cache).
struct FullCache : public OSDMemCache {
  FullCache(OSDMonitor *m) : OSDMemCache(m) {};

  // bytes currently held by the full-map LRU
  virtual uint64_t _get_used_bytes() const {
    return osdmon->full_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Full Cache";
  }

  // number of full maps currently cached
  uint64_t _get_num_osdmaps() const {
    return osdmon->full_osd_cache.get_size();
  }
};
228
// File-scope handles to the two cache adapters; created in the OSDMonitor
// constructor and registered with the PriorityCache manager (pcm).
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits on per-pool application metadata (see pool application commands).
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
// Decide whether an entity may perform unmanaged-snapshot pool ops.
// Permitted when either (a) its mon caps allow the "osd pool op
// unmanaged-snap" command (scoped to the pool when pool_name is given),
// or (b) its OSD caps grant write access to the pool or to everything.
// pool_name may be null when the pool does not exist; then only an
// unrestricted cap suffices.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // 1) check the mon capability first
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // 2) fall back to the OSD caps recorded in the auth database
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  // check every grant; profile grants expand to their profile_grants
  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
318
319 } // anonymous namespace
320
// Record the last-epoch-clean reported for one PG of this pool (keyed by
// its ps), maintaining two aggregates: 'floor' (minimum lec over reported
// PGs) and 'next_missing' (first ps that has not reported yet; slot
// value 0 means "not yet reported").
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow on demand; new slots default to 0 == unreported
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down; rescan
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported PGs
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Drop all last-epoch-clean tracking for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
// Route a PG's last-epoch-clean report to its pool's Lec record,
// creating the record on first report.
void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
{
  auto& lec = report_by_pool[pg.pool()];
  return lec.report(pg.ps(), last_epoch_clean);
}
362
363 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
364 {
365 auto floor = latest.get_epoch();
366 for (auto& pool : latest.get_pools()) {
367 auto reported = report_by_pool.find(pool.first);
368 if (reported == report_by_pool.end()) {
369 return 0;
370 }
371 if (reported->second.next_missing < pool.second.get_pg_num()) {
372 return 0;
373 }
374 if (reported->second.floor < floor) {
375 floor = reported->second.floor;
376 }
377 }
378 return floor;
379 }
380
381
382 class C_UpdateCreatingPGs : public Context {
383 public:
384 OSDMonitor *osdmon;
385 utime_t start;
386 epoch_t epoch;
387 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
388 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
389 void finish(int r) override {
390 if (r >= 0) {
391 utime_t end = ceph_clock_now();
392 dout(10) << "osdmap epoch " << epoch << " mapping took "
393 << (end - start) << " seconds" << dendl;
394 osdmon->update_creating_pgs();
395 osdmon->check_pg_creates_subs();
396 }
397 }
398 };
399
// Debug-log prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
407
// Construct the OSD monitor service: size the inc/full osdmap LRU caches,
// create their PriorityCache adapters, and register as a config observer
// so cache tuning reacts to runtime option changes.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  // _set_cache_sizes() only fails on invalid memory-target options; we
  // then fall back to a fixed-size cache with no priority management.
  int r = _set_cache_sizes();
  if (r < 0) {
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
431
432 const char **OSDMonitor::get_tracked_conf_keys() const
433 {
434 static const char* KEYS[] = {
435 "mon_memory_target",
436 "mon_memory_autotune",
437 "rocksdb_cache_size",
438 NULL
439 };
440 return KEYS;
441 }
442
// React to runtime changes of the tracked cache options: toggle
// autotuning, and re-derive the pcm/rocksdb cache sizes when the memory
// target or kv cache size changes.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    // failure leaves the previous sizes in effect (see
    // _update_mon_cache_settings), so only log here
    int r = _update_mon_cache_settings();
    if (r < 0) {
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
464
// Enable or disable priority-cache autotuning to match the
// mon_memory_autotune option.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    // NOTE(review): mon_memory_autotune (the member flag) is not reset to
    // false on this path — confirm that is intentional.
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
486
// Re-derive the priority-cache sizing (target/min/max and kv ratio) from
// the current mon_memory_target and rocksdb_cache_size options, then
// retune and rebalance the pcm.  On invalid input or a failed ratio
// computation the previous sizes are restored and -EINVAL is returned.
int OSDMonitor::_update_mon_cache_settings()
{
  // validate the new option values before touching any state
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember the old sizes so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // derive max from the fragmentation-adjusted target, less the base
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    // roll back to the previous, known-good sizes
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
546
// Initialize cache sizing state from configuration.  When autotuning is
// enabled, record the memory base/fragmentation/target/min options and
// seed both osdmap LRU caches at the minimum size; returns -EINVAL on
// invalid target/min options.  When autotuning is off this is a no-op.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
571
// True when the pending incremental already stages a new CRUSH map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
576
// The committed (stable) CRUSH map of the current osdmap; ignores any
// pending changes.
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
581
582 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
583 {
584 bufferlist bl;
585 if (pending_inc.crush.length())
586 bl = pending_inc.crush;
587 else
588 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
589
590 auto p = bl.cbegin();
591 newcrush.decode(p);
592 }
593
// Build the very first osdmap (epoch 1) for a new cluster and stage it
// as the pending full map: start from an mkfs-provided map if one was
// stored, otherwise a simple default map; set default flags, full
// ratios, and the required osd/client release levels.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // an osdmap was provided at mkfs time; adopt it (with our fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
652
653 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
654 {
655 s.insert(service_name);
656 s.insert(OSD_PG_CREATING_PREFIX);
657 s.insert(OSD_METADATA_PREFIX);
658 s.insert(OSD_SNAP_PREFIX);
659 }
660
// Bring the in-memory osdmap up to date with the paxos-committed state:
// load the newest stashed full map, apply (and stash) every missing
// incremental, then refresh everything derived from the map — creating
// PGs, subscriptions, msgr feature bits, and the down->out tracking.
// 'need_bootstrap' is not used by this service.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job was computed against a stale epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for a stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // fast-forward the in-memory map by decoding the newest stashed full map
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // reload the persisted creating-PGs state, if present
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // epoch 1 means we just consumed the mkfs-provided map; drop it
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction before it grows too large
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  // flush whatever remains of the last partial transaction
  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out grace tracking with the new map
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
898
// Create the PriorityCache manager and register the rocksdb kv cache and
// the two osdmap caches with it.  Returns -EINVAL if the configured sizes
// are invalid, rocksdb exposes no priority cache, or the ratios cannot
// be computed.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  // create the manager and register all three caches with it
  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
948
// Split the memory target between the rocksdb kv cache and the two
// osdmap caches: kv gets rocksdb_cache_size/mon_memory_target, and the
// remainder is divided evenly between the inc and full caches.  Fails
// (restoring the previous kv ratio) if kv would consume the whole target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  // split what's left evenly between the inc and full osdmap caches
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
          << " inc ratio " << cache_inc_ratio
          << " full ratio " << cache_full_ratio
          << dendl;
  return 0;
}
973
974 void OSDMonitor::start_mapping()
975 {
976 // initiate mapping job
977 if (mapping_job) {
978 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
979 << dendl;
980 mapping_job->abort();
981 }
982 if (!osdmap.get_pools().empty()) {
983 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
984 mapping_job = mapping.start_update(osdmap, mapper,
985 g_conf()->mon_osd_mapping_pgs_per_chunk);
986 dout(10) << __func__ << " started mapping job " << mapping_job.get()
987 << " at " << fin->start << dendl;
988 mapping_job->set_finish_event(fin);
989 } else {
990 dout(10) << __func__ << " no pools, no mapping job" << dendl;
991 mapping_job = nullptr;
992 }
993 }
994
995 void OSDMonitor::update_msgr_features()
996 {
997 set<int> types;
998 types.insert((int)entity_name_t::TYPE_OSD);
999 types.insert((int)entity_name_t::TYPE_CLIENT);
1000 types.insert((int)entity_name_t::TYPE_MDS);
1001 types.insert((int)entity_name_t::TYPE_MON);
1002 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1003 uint64_t mask;
1004 uint64_t features = osdmap.get_features(*q, &mask);
1005 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1006 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1007 ceph::net::Policy p = mon->messenger->get_policy(*q);
1008 p.features_required = (p.features_required & ~mask) | features;
1009 mon->messenger->set_policy(*q, p);
1010 }
1011 }
1012 }
1013
// Called when this monitor's paxos service becomes active.  The leader
// logs the map and runs the one-time pool priority conversion; peons
// re-dispatch any failure reports queued while inactive.  Both restart
// the PG mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    // replay queued failure reports through the normal dispatch path
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
1037
void OSDMonitor::on_restart()
{
  // Forget when each OSD last reported in; the timestamps are stale after
  // a restart/election and will be repopulated as new reports arrive.
  last_osd_report.clear();
}
1042
1043 void OSDMonitor::on_shutdown()
1044 {
1045 dout(10) << __func__ << dendl;
1046 if (mapping_job) {
1047 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1048 << dendl;
1049 mapping_job->abort();
1050 }
1051
1052 // discard failure info, waiters
1053 list<MonOpRequestRef> ls;
1054 take_all_failures(ls);
1055 ls.clear();
1056 }
1057
1058 void OSDMonitor::update_logger()
1059 {
1060 dout(10) << "update_logger" << dendl;
1061
1062 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1063 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1064 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1065 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1066 }
1067
// Start a fresh pending incremental for epoch osdmap.epoch+1 and reset all
// per-epoch pending state (metadata updates/removals, pseudo-purged snaps).
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // Seed any unset/invalid full ratios from config.  Config values may be
    // expressed as percentages (> 1.0); convert those to a [0,1] ratio.
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
	pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
	      << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
	pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
	      << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
	pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
	      << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      // NOTE(review): find_rule's result is assigned unchecked; presumably
      // legacy maps always resolve to a valid rule -- confirm upstream.
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
					   pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
	      << osdmap.get_pool_name(pool_id) << " crush ruleset "
	      << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
	// load the original pool info before modifying it
	pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // publish the rewritten crush map in the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
1137
// Compute the next creating_pgs state for the pending incremental `inc`
// (with `nextmap` being the osdmap after applying it): scan for new/removed
// pools, admit queued pgs up to the creation limit, and advance each
// creating pg's history/past_intervals.  Returns the updated state; the
// caller is responsible for persisting it.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  // work on a private copy so the lock is only held for the copy itself
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools added by this incremental
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // drop creating state for pools deleted by this incremental
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue
  // admit queued pg ranges, keeping at most mon_osd_max_creating_pgs
  // creations in flight at once
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's [start,end) range as the limit allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	std::stringstream debug;
	// existing entry: if a new peering interval starts between osdmap
	// and nextmap, roll the history epochs forward accordingly
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1324
// Pre-populate pg_temp entries in pending_inc for pgs whose mapping will
// change when the pending incremental is applied, so clients keep talking
// to the old acting set during the transition.  Either primes everything
// (via a parallel job) or just the pgs touching "interesting" OSDs,
// depending on what changed and a cost estimate.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // a new crush map can remap anything; prime all pgs
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  // newly-up osds can also remap broadly; prime all pgs
  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds going down in this incremental (state bit set while currently up)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  // weight reductions only affect that osd's pgs; an increase can pull pgs
  // from anywhere, so fall back to priming everything
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the per-osd work; if it approaches the full-map cost,
    // just prime everything instead
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // build the map that will result from applying the pending incremental
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg in parallel, bounded by a wall-clock budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // prime only pgs touching the interesting osds, inline, checking the
    // time budget every `chunk` pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  // already handled via another interesting osd
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1427
// Add a pg_temp entry to pending_inc pinning `pgid` to its current acting
// set if its mapping changes in `next`, so clients keep using the old
// (populated) acting set while the new one backfills.  Skips pgs that are
// still being created, that don't exist, or whose old acting set would be
// no better than the new mapping.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current (pre-incremental) mapping
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping after the pending incremental
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry clears any existing mapping
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1475
1476 /**
1477 * @note receiving a transaction in this function gives a fair amount of
1478 * freedom to the service implementation if it does need it. It shouldn't.
1479 */
1480 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1481 {
1482 dout(10) << "encode_pending e " << pending_inc.epoch
1483 << dendl;
1484
1485 if (do_prune(t)) {
1486 dout(1) << __func__ << " osdmap full prune encoded e"
1487 << pending_inc.epoch << dendl;
1488 }
1489
1490 // finalize up pending_inc
1491 pending_inc.modified = ceph_clock_now();
1492
1493 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1494 ceph_assert(r == 0);
1495
1496 if (mapping_job) {
1497 if (!mapping_job->is_done()) {
1498 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1499 << mapping_job.get() << " did not complete, "
1500 << mapping_job->shards << " left" << dendl;
1501 mapping_job->abort();
1502 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1503 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1504 << mapping_job.get() << " is prior epoch "
1505 << mapping.get_epoch() << dendl;
1506 } else {
1507 if (g_conf()->mon_osd_prime_pg_temp) {
1508 maybe_prime_pg_temp();
1509 }
1510 }
1511 } else if (g_conf()->mon_osd_prime_pg_temp) {
1512 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1513 << dendl;
1514 }
1515 mapping_job.reset();
1516
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1519 auto p = pending_inc.new_state.begin();
1520 while (p != pending_inc.new_state.end()) {
1521 if (p->second == 0) {
1522 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1523 p = pending_inc.new_state.erase(p);
1524 } else {
1525 if (p->second & CEPH_OSD_UP) {
1526 pending_inc.new_last_up_change = pending_inc.modified;
1527 }
1528 ++p;
1529 }
1530 }
1531 if (!pending_inc.new_up_client.empty()) {
1532 pending_inc.new_last_up_change = pending_inc.modified;
1533 }
1534 for (auto& i : pending_inc.new_weight) {
1535 if (i.first >= osdmap.max_osd) {
1536 if (i.second) {
1537 // new osd is already marked in
1538 pending_inc.new_last_in_change = pending_inc.modified;
1539 break;
1540 }
1541 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1542 // existing osd marked in or out
1543 pending_inc.new_last_in_change = pending_inc.modified;
1544 break;
1545 }
1546 }
1547
1548 {
1549 OSDMap tmp;
1550 tmp.deepish_copy_from(osdmap);
1551 tmp.apply_incremental(pending_inc);
1552
1553 // clean pg_temp mappings
1554 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1555
1556 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1557 {
1558 // check every upmapped pg for now
1559 // until we could reliably identify certain cases to ignore,
1560 // which is obviously the hard part TBD..
1561 vector<pg_t> pgs_to_check;
1562 tmp.get_upmap_pgs(&pgs_to_check);
1563 if (pgs_to_check.size() <
1564 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1565 // not enough pgs, do it inline
1566 tmp.clean_pg_upmaps(cct, &pending_inc);
1567 } else {
1568 CleanUpmapJob job(cct, tmp, pending_inc);
1569 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1570 job.wait();
1571 }
1572 }
1573
1574 // update creating pgs first so that we can remove the created pgid and
1575 // process the pool flag removal below in the same osdmap epoch.
1576 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1577 bufferlist creatings_bl;
1578 uint64_t features = CEPH_FEATURES_ALL;
1579 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1580 dout(20) << __func__ << " encoding pending pgs without octopus features"
1581 << dendl;
1582 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1583 }
1584 encode(pending_creatings, creatings_bl, features);
1585 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1586
1587 // remove any old (or incompat) POOL_CREATING flags
1588 for (auto& i : tmp.get_pools()) {
1589 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1590 // pre-nautilus OSDMaps shouldn't get this flag.
1591 if (pending_inc.new_pools.count(i.first)) {
1592 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1593 }
1594 }
1595 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1596 !pending_creatings.still_creating_pool(i.first)) {
1597 dout(10) << __func__ << " done creating pool " << i.first
1598 << ", clearing CREATING flag" << dendl;
1599 if (pending_inc.new_pools.count(i.first) == 0) {
1600 pending_inc.new_pools[i.first] = i.second;
1601 }
1602 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1603 }
1604 }
1605
1606 // collect which pools are currently affected by
1607 // the near/backfill/full osd(s),
1608 // and set per-pool near/backfill/full flag instead
1609 set<int64_t> full_pool_ids;
1610 set<int64_t> backfillfull_pool_ids;
1611 set<int64_t> nearfull_pool_ids;
1612 tmp.get_full_pools(cct,
1613 &full_pool_ids,
1614 &backfillfull_pool_ids,
1615 &nearfull_pool_ids);
1616 if (full_pool_ids.empty() ||
1617 backfillfull_pool_ids.empty() ||
1618 nearfull_pool_ids.empty()) {
1619 // normal case - no nearfull, backfillfull or full osds
1620 // try cancel any improper nearfull/backfillfull/full pool
1621 // flags first
1622 for (auto &pool: tmp.get_pools()) {
1623 auto p = pool.first;
1624 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1625 nearfull_pool_ids.empty()) {
1626 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1627 << "'s nearfull flag" << dendl;
1628 if (pending_inc.new_pools.count(p) == 0) {
1629 // load original pool info first!
1630 pending_inc.new_pools[p] = pool.second;
1631 }
1632 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1633 }
1634 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1635 backfillfull_pool_ids.empty()) {
1636 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1637 << "'s backfillfull flag" << dendl;
1638 if (pending_inc.new_pools.count(p) == 0) {
1639 pending_inc.new_pools[p] = pool.second;
1640 }
1641 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1642 }
1643 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1644 full_pool_ids.empty()) {
1645 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1646 // set by EQUOTA, skipping
1647 continue;
1648 }
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s full flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1655 }
1656 }
1657 }
1658 if (!full_pool_ids.empty()) {
1659 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1660 << " as full" << dendl;
1661 for (auto &p: full_pool_ids) {
1662 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1663 continue;
1664 }
1665 if (pending_inc.new_pools.count(p) == 0) {
1666 pending_inc.new_pools[p] = tmp.pools[p];
1667 }
1668 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1669 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1670 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1671 }
1672 // cancel FLAG_FULL for pools which are no longer full too
1673 for (auto &pool: tmp.get_pools()) {
1674 auto p = pool.first;
1675 if (full_pool_ids.count(p)) {
1676 // skip pools we have just marked as full above
1677 continue;
1678 }
1679 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1680 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1681 // don't touch if currently is not full
1682 // or is running out of quota (and hence considered as full)
1683 continue;
1684 }
1685 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1686 << "'s full flag" << dendl;
1687 if (pending_inc.new_pools.count(p) == 0) {
1688 pending_inc.new_pools[p] = pool.second;
1689 }
1690 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1691 }
1692 }
1693 if (!backfillfull_pool_ids.empty()) {
1694 for (auto &p: backfillfull_pool_ids) {
1695 if (full_pool_ids.count(p)) {
1696 // skip pools we have already considered as full above
1697 continue;
1698 }
1699 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_BACKFILLFULL flag
1702 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1703 continue;
1704 }
1705 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1706 // don't bother if pool is already marked as backfillfull
1707 continue;
1708 }
1709 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1710 << "'s as backfillfull" << dendl;
1711 if (pending_inc.new_pools.count(p) == 0) {
1712 pending_inc.new_pools[p] = tmp.pools[p];
1713 }
1714 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1715 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1716 }
1717 // cancel FLAG_BACKFILLFULL for pools
1718 // which are no longer backfillfull too
1719 for (auto &pool: tmp.get_pools()) {
1720 auto p = pool.first;
1721 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1722 // skip pools we have just marked as backfillfull/full above
1723 continue;
1724 }
1725 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1726 // and don't touch if currently is not backfillfull
1727 continue;
1728 }
1729 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1730 << "'s backfillfull flag" << dendl;
1731 if (pending_inc.new_pools.count(p) == 0) {
1732 pending_inc.new_pools[p] = pool.second;
1733 }
1734 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1735 }
1736 }
1737 if (!nearfull_pool_ids.empty()) {
1738 for (auto &p: nearfull_pool_ids) {
1739 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1740 continue;
1741 }
1742 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_NEARFULL flag
1745 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1746 continue;
1747 }
1748 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1749 // don't bother if pool is already marked as nearfull
1750 continue;
1751 }
1752 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1753 << "'s as nearfull" << dendl;
1754 if (pending_inc.new_pools.count(p) == 0) {
1755 pending_inc.new_pools[p] = tmp.pools[p];
1756 }
1757 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1758 }
1759 // cancel FLAG_NEARFULL for pools
1760 // which are no longer nearfull too
1761 for (auto &pool: tmp.get_pools()) {
1762 auto p = pool.first;
1763 if (full_pool_ids.count(p) ||
1764 backfillfull_pool_ids.count(p) ||
1765 nearfull_pool_ids.count(p)) {
1766 // skip pools we have just marked as
1767 // nearfull/backfillfull/full above
1768 continue;
1769 }
1770 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1771 // and don't touch if currently is not nearfull
1772 continue;
1773 }
1774 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1775 << "'s nearfull flag" << dendl;
1776 if (pending_inc.new_pools.count(p) == 0) {
1777 pending_inc.new_pools[p] = pool.second;
1778 }
1779 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1780 }
1781 }
1782
1783 // min_compat_client?
1784 if (!tmp.require_min_compat_client) {
1785 auto mv = tmp.get_min_compat_client();
1786 dout(1) << __func__ << " setting require_min_compat_client to currently "
1787 << "required " << mv << dendl;
1788 mon->clog->info() << "setting require_min_compat_client to currently "
1789 << "required " << mv;
1790 pending_inc.new_require_min_compat_client = mv;
1791 }
1792
1793 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1794 tmp.require_osd_release >= ceph_release_t::nautilus) {
1795 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1796 // add creating flags?
1797 for (auto& i : tmp.get_pools()) {
1798 if (pending_creatings.still_creating_pool(i.first)) {
1799 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1800 << dendl;
1801 if (pending_inc.new_pools.count(i.first) == 0) {
1802 pending_inc.new_pools[i.first] = i.second;
1803 }
1804 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1805 }
1806 }
1807 // adjust blacklist items to all be TYPE_ANY
1808 for (auto& i : tmp.blacklist) {
1809 auto a = i.first;
1810 a.set_type(entity_addr_t::TYPE_ANY);
1811 pending_inc.new_blacklist[a] = i.second;
1812 pending_inc.old_blacklist.push_back(i.first);
1813 }
1814 }
1815
1816 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1817 tmp.require_osd_release >= ceph_release_t::octopus) {
1818 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1819
1820 // adjust obsoleted cache modes
1821 for (auto& [poolid, pi] : tmp.pools) {
1822 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1823 if (pending_inc.new_pools.count(poolid) == 0) {
1824 pending_inc.new_pools[poolid] = pi;
1825 }
1826 dout(10) << __func__ << " switching pool " << poolid
1827 << " cachemode from forward -> proxy" << dendl;
1828 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1829 }
1830 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1831 if (pending_inc.new_pools.count(poolid) == 0) {
1832 pending_inc.new_pools[poolid] = pi;
1833 }
1834 dout(10) << __func__ << " switching pool " << poolid
1835 << " cachemode from readforward -> readproxy" << dendl;
1836 pending_inc.new_pools[poolid].cache_mode =
1837 pg_pool_t::CACHEMODE_READPROXY;
1838 }
1839 }
1840
1841 // clear removed_snaps for every pool
1842 for (auto& [poolid, pi] : tmp.pools) {
1843 if (pi.removed_snaps.empty()) {
1844 continue;
1845 }
1846 if (pending_inc.new_pools.count(poolid) == 0) {
1847 pending_inc.new_pools[poolid] = pi;
1848 }
1849 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1850 << dendl;
1851 pending_inc.new_pools[poolid].removed_snaps.clear();
1852 }
1853
1854 // create a combined purged snap epoch key for all purged snaps
1855 // prior to this epoch, and store it in the current epoch (i.e.,
1856 // the last pre-octopus epoch, just prior to the one we're
1857 // encoding now).
1858 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1859 it->lower_bound("purged_snap_");
1860 map<int64_t,snap_interval_set_t> combined;
1861 while (it->valid()) {
1862 if (it->key().find("purged_snap_") != 0) {
1863 break;
1864 }
1865 string k = it->key();
1866 long long unsigned pool;
1867 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1868 if (n != 1) {
1869 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1870 } else {
1871 bufferlist v = it->value();
1872 auto p = v.cbegin();
1873 snapid_t begin, end;
1874 ceph::decode(begin, p);
1875 ceph::decode(end, p);
1876 combined[pool].insert(begin, end - begin);
1877 }
1878 it->next();
1879 }
1880 if (!combined.empty()) {
1881 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1882 bufferlist v;
1883 ceph::encode(combined, v);
1884 t->put(OSD_SNAP_PREFIX, k, v);
1885 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1886 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1887 << dendl;
1888 } else {
1889 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1890 << dendl;
1891 }
1892
1893 // clean out the old removed_snap_ and removed_epoch keys
1894 // ('`' is ASCII '_' + 1)
1895 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1896 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1897 }
1898 }
1899
1900 // tell me about it
1901 for (auto i = pending_inc.new_state.begin();
1902 i != pending_inc.new_state.end();
1903 ++i) {
1904 int s = i->second ? i->second : CEPH_OSD_UP;
1905 if (s & CEPH_OSD_UP)
1906 dout(2) << " osd." << i->first << " DOWN" << dendl;
1907 if (s & CEPH_OSD_EXISTS)
1908 dout(2) << " osd." << i->first << " DNE" << dendl;
1909 }
1910 for (auto i = pending_inc.new_up_client.begin();
1911 i != pending_inc.new_up_client.end();
1912 ++i) {
1913 //FIXME: insert cluster addresses too
1914 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1915 }
1916 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1917 i != pending_inc.new_weight.end();
1918 ++i) {
1919 if (i->second == CEPH_OSD_OUT) {
1920 dout(2) << " osd." << i->first << " OUT" << dendl;
1921 } else if (i->second == CEPH_OSD_IN) {
1922 dout(2) << " osd." << i->first << " IN" << dendl;
1923 } else {
1924 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1925 }
1926 }
1927
1928 // features for osdmap and its incremental
1929 uint64_t features;
1930
1931 // encode full map and determine its crc
1932 OSDMap tmp;
1933 {
1934 tmp.deepish_copy_from(osdmap);
1935 tmp.apply_incremental(pending_inc);
1936
1937 // determine appropriate features
1938 features = tmp.get_encoding_features();
1939 dout(10) << __func__ << " encoding full map with "
1940 << tmp.require_osd_release
1941 << " features " << features << dendl;
1942
1943 // the features should be a subset of the mon quorum's features!
1944 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1945
1946 bufferlist fullbl;
1947 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1948 pending_inc.full_crc = tmp.get_crc();
1949
1950 // include full map in the txn. note that old monitors will
1951 // overwrite this. new ones will now skip the local full map
1952 // encode and reload from this.
1953 put_version_full(t, pending_inc.epoch, fullbl);
1954 }
1955
1956 // encode
1957 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1958 bufferlist bl;
1959 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1960
1961 dout(20) << " full_crc " << tmp.get_crc()
1962 << " inc_crc " << pending_inc.inc_crc << dendl;
1963
1964 /* put everything in the transaction */
1965 put_version(t, pending_inc.epoch, bl);
1966 put_last_committed(t, pending_inc.epoch);
1967
1968 // metadata, too!
1969 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1970 p != pending_metadata.end();
1971 ++p)
1972 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1973 for (set<int>::iterator p = pending_metadata_rm.begin();
1974 p != pending_metadata_rm.end();
1975 ++p)
1976 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1977 pending_metadata.clear();
1978 pending_metadata_rm.clear();
1979
1980 // purged_snaps
1981 if (tmp.require_osd_release >= ceph_release_t::octopus &&
1982 !pending_inc.new_purged_snaps.empty()) {
1983 // all snaps purged this epoch (across all pools)
1984 string k = make_purged_snap_epoch_key(pending_inc.epoch);
1985 bufferlist v;
1986 encode(pending_inc.new_purged_snaps, v);
1987 t->put(OSD_SNAP_PREFIX, k, v);
1988 }
1989 for (auto& i : pending_inc.new_purged_snaps) {
1990 for (auto q = i.second.begin();
1991 q != i.second.end();
1992 ++q) {
1993 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
1994 pending_inc.epoch,
1995 t);
1996 }
1997 }
1998 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
1999 for (auto snap : snaps) {
2000 insert_purged_snap_update(pool, snap, snap + 1,
2001 pending_inc.epoch,
2002 t);
2003 }
2004 }
2005
2006 // health
2007 health_check_map_t next;
2008 tmp.check_health(cct, &next);
2009 encode_health(next, t);
2010 }
2011
2012 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2013 {
2014 bufferlist bl;
2015 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2016 if (r < 0)
2017 return r;
2018 try {
2019 auto p = bl.cbegin();
2020 decode(m, p);
2021 }
2022 catch (buffer::error& e) {
2023 if (err)
2024 *err << "osd." << osd << " metadata is corrupt";
2025 return -EIO;
2026 }
2027 return 0;
2028 }
2029
2030 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2031 {
2032 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2033 if (osdmap.is_up(osd)) {
2034 map<string,string> meta;
2035 load_metadata(osd, meta, nullptr);
2036 auto p = meta.find(field);
2037 if (p == meta.end()) {
2038 (*out)["unknown"]++;
2039 } else {
2040 (*out)[p->second]++;
2041 }
2042 }
2043 }
2044 }
2045
2046 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2047 {
2048 map<string,int> by_val;
2049 count_metadata(field, &by_val);
2050 f->open_object_section(field.c_str());
2051 for (auto& p : by_val) {
2052 f->dump_int(p.first.c_str(), p.second);
2053 }
2054 f->close_section();
2055 }
2056
2057 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2058 {
2059 map<string, string> metadata;
2060 int r = load_metadata(osd, metadata, nullptr);
2061 if (r < 0)
2062 return r;
2063
2064 auto it = metadata.find("osd_objectstore");
2065 if (it == metadata.end())
2066 return -ENOENT;
2067 *type = it->second;
2068 return 0;
2069 }
2070
2071 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2072 const pg_pool_t &pool,
2073 ostream *err)
2074 {
2075 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2076 // since filestore osds could always join the pool later
2077 set<int> checked_osds;
2078 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2079 vector<int> up, acting;
2080 pg_t pgid(ps, pool_id);
2081 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2082 for (int osd : up) {
2083 if (checked_osds.find(osd) != checked_osds.end())
2084 continue;
2085 string objectstore_type;
2086 int r = get_osd_objectstore_type(osd, &objectstore_type);
2087 // allow with missing metadata, e.g. due to an osd never booting yet
2088 if (r < 0 || objectstore_type == "bluestore") {
2089 checked_osds.insert(osd);
2090 continue;
2091 }
2092 *err << "osd." << osd << " uses " << objectstore_type;
2093 return false;
2094 }
2095 }
2096 return true;
2097 }
2098
2099 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2100 {
2101 map<string,string> m;
2102 if (int r = load_metadata(osd, m, err))
2103 return r;
2104 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2105 f->dump_string(p->first.c_str(), p->second);
2106 return 0;
2107 }
2108
2109 void OSDMonitor::print_nodes(Formatter *f)
2110 {
2111 // group OSDs by their hosts
2112 map<string, list<int> > osds; // hostname => osd
2113 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2114 map<string, string> m;
2115 if (load_metadata(osd, m, NULL)) {
2116 continue;
2117 }
2118 map<string, string>::iterator hostname = m.find("hostname");
2119 if (hostname == m.end()) {
2120 // not likely though
2121 continue;
2122 }
2123 osds[hostname->second].push_back(osd);
2124 }
2125
2126 dump_services(f, osds, "osd");
2127 }
2128
2129 void OSDMonitor::share_map_with_random_osd()
2130 {
2131 if (osdmap.get_num_up_osds() == 0) {
2132 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2133 return;
2134 }
2135
2136 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
2137 if (!s) {
2138 dout(10) << __func__ << " no up osd on our session map" << dendl;
2139 return;
2140 }
2141
2142 dout(10) << "committed, telling random " << s->name
2143 << " all about it" << dendl;
2144
2145 // get feature of the peer
2146 // use quorum_con_features, if it's an anonymous connection.
2147 uint64_t features = s->con_features ? s->con_features :
2148 mon->get_quorum_con_features();
2149 // whatev, they'll request more if they need it
2150 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2151 s->con->send_message(m);
2152 // NOTE: do *not* record osd has up to this epoch (as we do
2153 // elsewhere) as they may still need to request older values.
2154 }
2155
// Return the highest osdmap version we may trim up to, or 0 if trimming
// is not currently possible.  Trimming is blocked while quorum is not
// formed, while pgs are still being created, or when the debug block
// option is set; otherwise the trim floor is derived from the minimum
// last-epoch-clean across osds, clamped by configuration.
version_t OSDMonitor::get_trim_to() const
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // pending pg creations may still need older maps; don't trim yet.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	       " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << dendl;
    return 0;
  }

  {
    // start from the oldest epoch any osd still needs.
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // an operator may force trimming up to a specific epoch (for repair).
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs recent maps.
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only meaningful if it is past what has already been trimmed.
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2198
2199 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2200 {
2201 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2202 // also scan osd epochs
2203 // don't trim past the oldest reported osd epoch
2204 for (auto& osd_epoch : osd_epochs) {
2205 if (osd_epoch.second < floor) {
2206 floor = osd_epoch.second;
2207 }
2208 }
2209 return floor;
2210 }
2211
2212 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2213 version_t first)
2214 {
2215 dout(10) << __func__ << " including full map for e " << first << dendl;
2216 bufferlist bl;
2217 get_version_full(first, bl);
2218 put_version_full(tx, first, bl);
2219
2220 if (has_osdmap_manifest &&
2221 first > osdmap_manifest.get_first_pinned()) {
2222 _prune_update_trimmed(tx, first);
2223 }
2224 }
2225
2226
2227 /* full osdmap prune
2228 *
2229 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2230 */
2231
2232 void OSDMonitor::load_osdmap_manifest()
2233 {
2234 bool store_has_manifest =
2235 mon->store->exists(get_service_name(), "osdmap_manifest");
2236
2237 if (!store_has_manifest) {
2238 if (!has_osdmap_manifest) {
2239 return;
2240 }
2241
2242 dout(20) << __func__
2243 << " dropping osdmap manifest from memory." << dendl;
2244 osdmap_manifest = osdmap_manifest_t();
2245 has_osdmap_manifest = false;
2246 return;
2247 }
2248
2249 dout(20) << __func__
2250 << " osdmap manifest detected in store; reload." << dendl;
2251
2252 bufferlist manifest_bl;
2253 int r = get_value("osdmap_manifest", manifest_bl);
2254 if (r < 0) {
2255 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2256 ceph_abort_msg("error reading manifest");
2257 }
2258 osdmap_manifest.decode(manifest_bl);
2259 has_osdmap_manifest = true;
2260
2261 dout(10) << __func__ << " store osdmap manifest pinned ("
2262 << osdmap_manifest.get_first_pinned()
2263 << " .. "
2264 << osdmap_manifest.get_last_pinned()
2265 << ")"
2266 << dendl;
2267 }
2268
// Decide whether a full-osdmap prune should run now, based on how many
// committed epochs we hold versus the configured minimums and interval.
// See doc/dev/mon-osdmap-prune.rst for the overall design.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  // number of recent epochs that must always be kept untouched.
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  // minimum number of prunable epochs before pruning is worthwhile.
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  // distance between consecutive pinned maps.
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous prune already pinned everything we are allowed to pin.
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full interval fits between the last pin and the limit.
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2328
2329 void OSDMonitor::_prune_update_trimmed(
2330 MonitorDBStore::TransactionRef tx,
2331 version_t first)
2332 {
2333 dout(10) << __func__
2334 << " first " << first
2335 << " last_pinned " << osdmap_manifest.get_last_pinned()
2336 << " last_pinned " << osdmap_manifest.get_last_pinned()
2337 << dendl;
2338
2339 osdmap_manifest_t manifest = osdmap_manifest;
2340
2341 if (!manifest.is_pinned(first)) {
2342 manifest.pin(first);
2343 }
2344
2345 set<version_t>::iterator p_end = manifest.pinned.find(first);
2346 set<version_t>::iterator p = manifest.pinned.begin();
2347 manifest.pinned.erase(p, p_end);
2348 ceph_assert(manifest.get_first_pinned() == first);
2349
2350 if (manifest.get_last_pinned() == first+1 ||
2351 manifest.pinned.size() == 1) {
2352 // we reached the end of the line, as pinned maps go; clean up our
2353 // manifest, and let `should_prune()` decide whether we should prune
2354 // again.
2355 tx->erase(get_service_name(), "osdmap_manifest");
2356 return;
2357 }
2358
2359 bufferlist bl;
2360 manifest.encode(bl);
2361 tx->put(get_service_name(), "osdmap_manifest", bl);
2362 }
2363
// Choose the first map to pin for this prune iteration and record it in
// `manifest`: start from the first committed version when no manifest
// exists, or resume from the last pin of the previous prune otherwise.
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2398
2399 bool OSDMonitor::_prune_sanitize_options() const
2400 {
2401 uint64_t prune_interval =
2402 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2403 uint64_t prune_min =
2404 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2405 uint64_t txsize =
2406 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2407
2408 bool r = true;
2409
2410 if (prune_interval == 0) {
2411 derr << __func__
2412 << " prune is enabled BUT prune interval is zero; abort."
2413 << dendl;
2414 r = false;
2415 } else if (prune_interval == 1) {
2416 derr << __func__
2417 << " prune interval is equal to one, which essentially means"
2418 " no pruning; abort."
2419 << dendl;
2420 r = false;
2421 }
2422 if (prune_min == 0) {
2423 derr << __func__
2424 << " prune is enabled BUT prune min is zero; abort."
2425 << dendl;
2426 r = false;
2427 }
2428 if (prune_interval > prune_min) {
2429 derr << __func__
2430 << " impossible to ascertain proper prune interval because"
2431 << " it is greater than the minimum prune epochs"
2432 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2433 << dendl;
2434 r = false;
2435 }
2436
2437 if (txsize < prune_interval - 1) {
2438 derr << __func__
2439 << "'mon_osdmap_full_prune_txsize' (" << txsize
2440 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2441 << "); abort." << dendl;
2442 r = false;
2443 }
2444 return r;
2445 }
2446
2447 bool OSDMonitor::is_prune_enabled() const {
2448 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2449 }
2450
2451 bool OSDMonitor::is_prune_supported() const {
2452 return mon->get_required_mon_features().contains_any(
2453 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2454 }
2455
/** do_prune
 *
 * Prune full osdmap versions from the store: pin one map every
 * `mon_osdmap_full_prune_interval` epochs and erase the full maps in
 * between, up to `mon_osdmap_full_prune_txsize` removals per call.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  // helper: does a full map for version v exist in the store?
  auto map_exists = [this](version_t v) {
    string k = mon->store->combine_strings("full", v);
    return mon->store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      // the next interval would cross the limit; stop here.
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // both interval endpoints must still be in the store.
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins.
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon->store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval fits.
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2584
2585
2586 // -------------
2587
// Dispatch an incoming message to its read-only preprocess handler.
// Returns true when the op was fully handled here; false when it needs
// a map update and must go through prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: answer EINVAL instead of crashing.
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // an unrecognized type reaching the osd monitor is a bug.
    ceph_abort();
    return true;
  }
}
2643
// Dispatch an incoming message to its map-mutating prepare handler.
// Returns true if the pending incremental was (possibly) modified and a
// proposal should follow; false otherwise.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: answer EINVAL instead of crashing.
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // preprocess_query() should have filtered every other type.
    ceph_abort();
  }

  return false;
}
2695
2696 bool OSDMonitor::should_propose(double& delay)
2697 {
2698 dout(10) << "should_propose" << dendl;
2699
2700 // if full map, propose immediately! any subsequent changes will be clobbered.
2701 if (pending_inc.fullmap.length())
2702 return true;
2703
2704 // adjust osd weights?
2705 if (!osd_weight.empty() &&
2706 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2707 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2708 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2709 delay = 0.0;
2710 osd_weight.clear();
2711 return true;
2712 }
2713
2714 return PaxosService::should_propose(delay);
2715 }
2716
2717
2718
2719 // ---------------------------
2720 // READs
2721
// Answer a MMonGetOSDMap request with the requested ranges of full and
// incremental maps, capped by osd_map_message_max (count) and
// osd_map_message_max_bytes (size) across both loops.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the requester's connection features when known;
  // fall back to the quorum features for an anonymous connection.
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // shared budgets: total map count and total encoded bytes.
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first, clamped to what we actually have committed ...
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // ... then incrementals, drawing on the same remaining budget.
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2758
2759
2760 // ---------------------------
2761 // UPDATEs
2762
2763 // failure --
2764
2765 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2766 // check permissions
2767 MonSession *session = op->get_session();
2768 if (!session)
2769 return true;
2770 if (!session->is_capable("osd", MON_CAP_X)) {
2771 dout(0) << "got MOSDFailure from entity with insufficient caps "
2772 << session->caps << dendl;
2773 return true;
2774 }
2775 if (fsid != mon->monmap->fsid) {
2776 dout(0) << "check_source: on fsid " << fsid
2777 << " != " << mon->monmap->fsid << dendl;
2778 return true;
2779 }
2780 return false;
2781 }
2782
2783
// Read-only filtering of an MOSDFailure report.  Stale, duplicate,
// mismatched or disallowed reports are answered (or silently dropped)
// here; returns false only for a fresh, valid report that needs
// prepare_failure() to act on it.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, readdressed, or itself down: refresh it.
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // the reported addresses must match what the map currently has.
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag or up-ratio floor may veto the mark-down.
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  // fresh report: hand off to prepare_failure().
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2855
// Completion for a MOSDMarkMeDown request: on success, echo the request
// back to the osd as an acknowledgement; on -EAGAIN, re-dispatch the op
// through the osd monitor; any other result aborts.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: reply with a copy of the request as the ack.
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // retry the whole op from scratch.
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2884
// Read-only checks for a MOSDMarkMeDown request.  Returns false when
// the request is valid and should proceed to prepare_mark_me_down();
// otherwise acks immediately (if requested) and returns true.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // target must exist, be up, and match the addresses in the request.
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // ack right away even though no map change will occur.
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2923
2924 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2925 {
2926 op->mark_osdmon_event(__func__);
2927 auto m = op->get_req<MOSDMarkMeDown>();
2928 int target_osd = m->target_osd;
2929
2930 ceph_assert(osdmap.is_up(target_osd));
2931 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2932
2933 mon->clog->info() << "osd." << target_osd << " marked itself down";
2934 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2935 if (m->request_ack)
2936 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2937 return true;
2938 }
2939
2940 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
2941 {
2942 op->mark_osdmon_event(__func__);
2943 auto m = op->get_req<MOSDMarkMeDead>();
2944 int from = m->target_osd;
2945
2946 // check permissions
2947 if (check_source(op, m->fsid)) {
2948 mon->no_reply(op);
2949 return true;
2950 }
2951
2952 // first, verify the reporting host is valid
2953 if (!m->get_orig_source().is_osd()) {
2954 mon->no_reply(op);
2955 return true;
2956 }
2957
2958 if (!osdmap.exists(from) ||
2959 !osdmap.is_down(from)) {
2960 dout(5) << __func__ << " from nonexistent or up osd." << from
2961 << ", ignoring" << dendl;
2962 send_incremental(op, m->get_epoch()+1);
2963 mon->no_reply(op);
2964 return true;
2965 }
2966
2967 return false;
2968 }
2969
// Commit an osd's self-declared death by recording dead_epoch in its
// extended info.  preprocess_mark_me_dead() already verified the osd is
// down.
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
		    << m->get_epoch();
  // start from the committed xinfo unless this epoch already staged a copy
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  // on successful commit we deliberately send no reply; the osd is dead
  // and nothing is waiting on an ack
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon->no_reply(op); // ignore on success
	}
      }
    ));
  return true;
}
2995
2996 bool OSDMonitor::can_mark_down(int i)
2997 {
2998 if (osdmap.is_nodown(i)) {
2999 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3000 << "will not mark it down" << dendl;
3001 return false;
3002 }
3003
3004 int num_osds = osdmap.get_num_osds();
3005 if (num_osds == 0) {
3006 dout(5) << __func__ << " no osds" << dendl;
3007 return false;
3008 }
3009 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3010 float up_ratio = (float)up / (float)num_osds;
3011 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3012 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3013 << g_conf()->mon_osd_min_up_ratio
3014 << ", will not mark osd." << i << " down" << dendl;
3015 return false;
3016 }
3017 return true;
3018 }
3019
3020 bool OSDMonitor::can_mark_up(int i)
3021 {
3022 if (osdmap.is_noup(i)) {
3023 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3024 << "will not mark it up" << dendl;
3025 return false;
3026 }
3027
3028 return true;
3029 }
3030
3031 /**
3032 * @note the parameter @p i apparently only exists here so we can output the
3033 * osd's id on messages.
3034 */
3035 bool OSDMonitor::can_mark_out(int i)
3036 {
3037 if (osdmap.is_noout(i)) {
3038 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3039 << "will not mark it out" << dendl;
3040 return false;
3041 }
3042
3043 int num_osds = osdmap.get_num_osds();
3044 if (num_osds == 0) {
3045 dout(5) << __func__ << " no osds" << dendl;
3046 return false;
3047 }
3048 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3049 float in_ratio = (float)in / (float)num_osds;
3050 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3051 if (i >= 0)
3052 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3053 << g_conf()->mon_osd_min_in_ratio
3054 << ", will not mark osd." << i << " out" << dendl;
3055 else
3056 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3057 << g_conf()->mon_osd_min_in_ratio
3058 << ", will not mark osds out" << dendl;
3059 return false;
3060 }
3061
3062 return true;
3063 }
3064
3065 bool OSDMonitor::can_mark_in(int i)
3066 {
3067 if (osdmap.is_noin(i)) {
3068 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3069 << "will not mark it in" << dendl;
3070 return false;
3071 }
3072
3073 return true;
3074 }
3075
3076 bool OSDMonitor::check_failures(utime_t now)
3077 {
3078 bool found_failure = false;
3079 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3080 p != failure_info.end();
3081 ++p) {
3082 if (can_mark_down(p->first)) {
3083 found_failure |= check_failure(now, p->first, p->second);
3084 }
3085 }
3086 return found_failure;
3087 }
3088
3089 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3090 {
3091 // already pending failure?
3092 if (pending_inc.new_state.count(target_osd) &&
3093 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3094 dout(10) << " already pending failure" << dendl;
3095 return true;
3096 }
3097
3098 set<string> reporters_by_subtree;
3099 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3100 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3101 utime_t max_failed_since = fi.get_failed_since();
3102 utime_t failed_for = now - max_failed_since;
3103
3104 utime_t grace = orig_grace;
3105 double my_grace = 0, peer_grace = 0;
3106 double decay_k = 0;
3107 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3108 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3109 decay_k = ::log(.5) / halflife;
3110
3111 // scale grace period based on historical probability of 'lagginess'
3112 // (false positive failures due to slowness).
3113 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3114 double decay = exp((double)failed_for * decay_k);
3115 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3116 << " failed_for " << failed_for << " decay " << decay << dendl;
3117 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3118 grace += my_grace;
3119 }
3120
3121 // consider the peers reporting a failure a proxy for a potential
3122 // 'subcluster' over the overall cluster that is similarly
3123 // laggy. this is clearly not true in all cases, but will sometimes
3124 // help us localize the grace correction to a subset of the system
3125 // (say, a rack with a bad switch) that is unhappy.
3126 ceph_assert(fi.reporters.size());
3127 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3128 // get the parent bucket whose type matches with "reporter_subtree_level".
3129 // fall back to OSD if the level doesn't exist.
3130 if (osdmap.exists(p->first)) {
3131 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3132 if (auto iter = reporter_loc.find(reporter_subtree_level);
3133 iter == reporter_loc.end()) {
3134 reporters_by_subtree.insert("osd." + to_string(p->first));
3135 } else {
3136 reporters_by_subtree.insert(iter->second);
3137 }
3138 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3139 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3140 utime_t elapsed = now - xi.down_stamp;
3141 double decay = exp((double)elapsed * decay_k);
3142 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3143 }
3144 ++p;
3145 } else {
3146 fi.cancel_report(p->first);;
3147 p = fi.reporters.erase(p);
3148 }
3149 }
3150
3151 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3152 peer_grace /= (double)fi.reporters.size();
3153 grace += peer_grace;
3154 }
3155
3156 dout(10) << " osd." << target_osd << " has "
3157 << fi.reporters.size() << " reporters, "
3158 << grace << " grace (" << orig_grace << " + " << my_grace
3159 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3160 << dendl;
3161
3162 if (failed_for >= grace &&
3163 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3164 dout(1) << " we have enough reporters to mark osd." << target_osd
3165 << " down" << dendl;
3166 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3167
3168 mon->clog->info() << "osd." << target_osd << " failed ("
3169 << osdmap.crush->get_full_location_ordered_string(
3170 target_osd)
3171 << ") ("
3172 << (int)reporters_by_subtree.size()
3173 << " reporters from different "
3174 << reporter_subtree_level << " after "
3175 << failed_for << " >= grace " << grace << ")";
3176 return true;
3177 }
3178 return false;
3179 }
3180
3181 void OSDMonitor::force_failure(int target_osd, int by)
3182 {
3183 // already pending failure?
3184 if (pending_inc.new_state.count(target_osd) &&
3185 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3186 dout(10) << " already pending failure" << dendl;
3187 return;
3188 }
3189
3190 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3191 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3192 if (!pending_inc.new_xinfo.count(target_osd)) {
3193 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3194 }
3195 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3196
3197 mon->clog->info() << "osd." << target_osd << " failed ("
3198 << osdmap.crush->get_full_location_ordered_string(target_osd)
3199 << ") (connection refused reported by osd." << by << ")";
3200 return;
3201 }
3202
// Handle an MOSDFailure in the prepare phase: either record/evaluate a new
// failure report for the target osd, or cancel a previously filed report.
// Returns true when a map change should be proposed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess stage guarantees the target is up and the addrs match
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    // the reporter has observed the failure for failed_for seconds already
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: fail right away, no grace accounting
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a duplicate report from the same reporter replaces the old op;
    // release the superseded one
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      // release the op held for the canceled report
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3269
3270 void OSDMonitor::process_failures()
3271 {
3272 map<int,failure_info_t>::iterator p = failure_info.begin();
3273 while (p != failure_info.end()) {
3274 if (osdmap.is_up(p->first)) {
3275 ++p;
3276 } else {
3277 dout(10) << "process_failures osd." << p->first << dendl;
3278 list<MonOpRequestRef> ls;
3279 p->second.take_report_messages(ls);
3280 failure_info.erase(p++);
3281
3282 while (!ls.empty()) {
3283 MonOpRequestRef o = ls.front();
3284 if (o) {
3285 o->mark_event(__func__);
3286 MOSDFailure *m = o->get_req<MOSDFailure>();
3287 send_latest(o, m->get_epoch());
3288 mon->no_reply(o);
3289 }
3290 ls.pop_front();
3291 }
3292 }
3293 }
3294 }
3295
3296 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3297 {
3298 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3299
3300 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3301 p != failure_info.end();
3302 ++p) {
3303 p->second.take_report_messages(ls);
3304 }
3305 failure_info.clear();
3306 }
3307
3308
3309 // boot --
3310
// Fast-path validation of an osd boot message.  Returns true when the boot
// was fully handled (ignored, duplicate, or deferred with a map); false to
// fall through to prepare_boot(), which stages the actual map change.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    // duplicate boot: just send the latest map, don't log "boot" again
    _booted(op, false);
    return true;
  }

  // a non-zero recorded uuid that differs means a different physical osd
  // is trying to claim this id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message from an epoch before the osd's last up_from is stale
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3433
// Stage the map changes for a booting osd: mark the previous incarnation
// down if needed, record the new addrs/uuid/metadata, update the laggy
// statistics in xinfo, and optionally mark the osd back in.  Always returns
// true (a proposal is needed); the osd is acked via C_Booted/C_RetryMessage
// once the proposal commits.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // fold in any state change already staged this epoch (new_state is an
  // XOR mask over the committed state)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the laggy statistics: a boot_epoch of 0 means a clean
    // (re)start, which decays the laggy estimate; otherwise the osd came
    // back from an unclean interval and the estimate is reinforced.
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	// exponential moving average of how long the osd stays down,
	// clamped at mon_osd_laggy_max_interval
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval =  g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	// restore the weight the osd had before it was auto-marked out
	if (xi.old_weight > 0) {
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3588
3589 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3590 {
3591 op->mark_osdmon_event(__func__);
3592 auto m = op->get_req<MOSDBoot>();
3593 dout(7) << "_booted " << m->get_orig_source_inst()
3594 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3595
3596 if (logit) {
3597 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3598 << " boot";
3599 }
3600
3601 send_latest(op, m->sb.current_epoch+1);
3602 }
3603
3604
3605 // -------------
3606 // full
3607
// Fast-path validation of an osd's fullness-state report (nearfull /
// backfillfull / full).  Returns true if handled here; false to fall
// through to prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these fullness bits may be changed via MOSDFull
  // (declared before the gotos below, as C++ requires)
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // the message must match the current (if up) or most recent (if down)
  // incarnation of the osd; anything else is from a stale instance
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // requested state already committed?  just reply with the map.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3658
// Stage the osd's requested fullness bits in the pending map.  new_state
// entries are XOR masks, so the computation below derives the flip needed
// to go from the committed state to the wanted state.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state XOR whatever is already
  // staged this epoch
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // clear any staged fullness flips; we recompute them below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed and wanted bits yields the mask that flips the
    // committed state into the wanted one
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3696
3697 // -------------
3698 // alive
3699
3700 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3701 {
3702 op->mark_osdmon_event(__func__);
3703 auto m = op->get_req<MOSDAlive>();
3704 int from = m->get_orig_source().num();
3705
3706 // check permissions, ignore if failed
3707 MonSession *session = op->get_session();
3708 if (!session)
3709 goto ignore;
3710 if (!session->is_capable("osd", MON_CAP_X)) {
3711 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3712 << session->caps << dendl;
3713 goto ignore;
3714 }
3715
3716 if (!osdmap.is_up(from) ||
3717 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3718 dout(7) << "preprocess_alive ignoring alive message from down "
3719 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3720 << dendl;
3721 goto ignore;
3722 }
3723
3724 if (osdmap.get_up_thru(from) >= m->want) {
3725 // yup.
3726 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3727 _reply_map(op, m->version);
3728 return true;
3729 }
3730
3731 dout(10) << "preprocess_alive want up_thru " << m->want
3732 << " from " << m->get_orig_source_inst() << dendl;
3733 return false;
3734
3735 ignore:
3736 return true;
3737 }
3738
3739 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3740 {
3741 op->mark_osdmon_event(__func__);
3742 auto m = op->get_req<MOSDAlive>();
3743 int from = m->get_orig_source().num();
3744
3745 if (0) { // we probably don't care much about these
3746 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3747 }
3748
3749 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3750 << " from " << m->get_orig_source_inst() << dendl;
3751
3752 update_up_thru(from, m->version); // set to the latest map the OSD has
3753 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3754 return true;
3755 }
3756
3757 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3758 {
3759 op->mark_osdmon_event(__func__);
3760 dout(7) << "_reply_map " << e
3761 << " from " << op->get_req()->get_orig_source_inst()
3762 << dendl;
3763 send_latest(op, e);
3764 }
3765
3766 // pg_created
3767 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3768 {
3769 op->mark_osdmon_event(__func__);
3770 auto m = op->get_req<MOSDPGCreated>();
3771 dout(10) << __func__ << " " << *m << dendl;
3772 auto session = op->get_session();
3773 mon->no_reply(op);
3774 if (!session) {
3775 dout(10) << __func__ << ": no monitor session!" << dendl;
3776 return true;
3777 }
3778 if (!session->is_capable("osd", MON_CAP_X)) {
3779 derr << __func__ << " received from entity "
3780 << "with insufficient privileges " << session->caps << dendl;
3781 return true;
3782 }
3783 // always forward the "created!" to the leader
3784 return false;
3785 }
3786
3787 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3788 {
3789 op->mark_osdmon_event(__func__);
3790 auto m = op->get_req<MOSDPGCreated>();
3791 dout(10) << __func__ << " " << *m << dendl;
3792 auto src = m->get_orig_source();
3793 auto from = src.num();
3794 if (!src.is_osd() ||
3795 !mon->osdmon()->osdmap.is_up(from) ||
3796 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3797 m->get_orig_source_addrs())) {
3798 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3799 return false;
3800 }
3801 pending_created_pgs.push_back(m->pgid);
3802 return true;
3803 }
3804
// Fast-path validation of an osd's "ready to merge" notification for the
// highest pg of a pool whose pg_num is shrinking.  Returns true if the
// message is invalid or stale and was dropped here; false to fall through
// to prepare_pg_ready_to_merge().
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // declared before the gotos below, as C++ requires
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge already happened (pg_num dropped past this pg)
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the current highest pg (pg_num - 1) can be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // no merge for this pg is actually pending
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3844
// Commit (or back off) a pg merge for which the source pg reported ready.
// On success the pool's pg_num is decremented; if the osd reported
// not-ready, the pending merge is canceled by resetting pg_num_pending.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // work on the freshest view of the pool: staged copy if one exists,
  // otherwise the committed pool
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // re-check the preconditions against the pending state; they may have
  // changed since preprocess ran against the committed map
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // test hook: randomly bounce pg_num back up to exercise merge cancel
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3902
3903
3904 // -------------
3905 // pg_temp changes
3906
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  // Decide whether an MOSDPGTemp message can be handled without a paxos
  // proposal.  Returns true when the message is fully dealt with here
  // (dropped or replied to); returns false to route it to prepare_pgtemp().
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // only accept messages from osds that the current map considers up and
  // whose address matches (i.e. the same osd instance, not a reincarnation)
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // a forced pg_temp always goes through the prepare path
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         osdmap.pg_temp->get(p->first) != p->second ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // no actual changes requested; reply with the current map epoch
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
3999
4000 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4001 {
4002 epoch_t old_up_thru = osdmap.get_up_thru(from);
4003 auto ut = pending_inc.new_up_thru.find(from);
4004 if (ut != pending_inc.new_up_thru.end()) {
4005 old_up_thru = ut->second;
4006 }
4007 if (up_thru > old_up_thru) {
4008 // set up_thru too, so the osd doesn't have to ask again
4009 pending_inc.new_up_thru[from] = up_thru;
4010 }
4011 }
4012
4013 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4014 {
4015 op->mark_osdmon_event(__func__);
4016 auto m = op->get_req<MOSDPGTemp>();
4017 int from = m->get_orig_source().num();
4018 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4019 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4020 uint64_t pool = p->first.pool();
4021 if (pending_inc.old_pools.count(pool)) {
4022 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4023 << ": pool pending removal" << dendl;
4024 continue;
4025 }
4026 if (!osdmap.have_pg_pool(pool)) {
4027 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4028 << ": pool has been removed" << dendl;
4029 continue;
4030 }
4031 pending_inc.new_pg_temp[p->first] =
4032 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4033
4034 // unconditionally clear pg_primary (until this message can encode
4035 // a change for that, too.. at which point we need to also fix
4036 // preprocess_pg_temp)
4037 if (osdmap.primary_temp->count(p->first) ||
4038 pending_inc.new_primary_temp.count(p->first))
4039 pending_inc.new_primary_temp[p->first] = -1;
4040 }
4041
4042 // set up_thru too, so the osd doesn't have to ask again
4043 update_up_thru(from, m->map_epoch);
4044
4045 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4046 return true;
4047 }
4048
4049
4050 // ---
4051
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  // Filter MRemoveSnaps: when every requested snap is already recorded as
  // removed (or its pool is gone) we can acknowledge here without a map
  // change; otherwise return false so prepare_remove_snaps() stages one.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        cct,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
        session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
               << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      // a snap beyond the pool's snap_seq, or one not yet marked removed,
      // means there is real work to do -> take the prepare path
      if (*p > pi->get_snap_seq() ||
          !_is_removed_snap(q->first, *p)) {
        return false;
      }
    }
  }

  // octopus+ peers expect an explicit ack that echoes the snaps back
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon->send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4101
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  // Stage the requested snap removals in the pending incremental.  Each
  // snap is queued only if it is not already recorded as removed in the
  // committed map, the staged pending pool, or new_removed_snaps.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
               << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed anywhere (committed or pending)
      if (!_is_removed_snap(pool, s) &&
          (!pending_inc.new_pools.count(pool) ||
           !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
          (!pending_inc.new_removed_snaps.count(pool) ||
           !pending_inc.new_removed_snaps[pool].contains(s))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
        if (osdmap.require_osd_release < ceph_release_t::octopus) {
          // pre-octopus clusters also track removed snaps in the pool itself
          newpi->removed_snaps.insert(s);
          dout(10) << " pool " << pool << " removed_snaps added " << s
                   << " (now " << newpi->removed_snaps << ")" << dendl;
        }
        newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
        // advance snap_seq if this snap is newer than anything seen so far
        if (s > newpi->get_snap_seq()) {
          dout(10) << " pool " << pool << " snap_seq "
                   << newpi->get_snap_seq() << " -> " << s << dendl;
          newpi->set_snap_seq(s);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
        dout(10) << " added pool " << pool << " snap " << s
                 << " to removed_snaps queue" << dendl;
        pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ peers expect an ack once the proposal commits
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4150
4151 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4152 {
4153 op->mark_osdmon_event(__func__);
4154 auto m = op->get_req<MMonGetPurgedSnaps>();
4155 dout(7) << __func__ << " " << *m << dendl;
4156
4157 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4158
4159 string k = make_purged_snap_epoch_key(m->start);
4160 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4161 it->upper_bound(k);
4162 unsigned long epoch = m->last;
4163 while (it->valid()) {
4164 if (it->key().find("purged_epoch_") != 0) {
4165 break;
4166 }
4167 string k = it->key();
4168 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4169 if (n != 1) {
4170 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4171 } else if (epoch > m->last) {
4172 break;
4173 } else {
4174 bufferlist bl = it->value();
4175 auto p = bl.cbegin();
4176 auto &v = r[epoch];
4177 try {
4178 ceph::decode(v, p);
4179 } catch (buffer::error& e) {
4180 derr << __func__ << " unable to parse value for key '" << it->key()
4181 << "': \n";
4182 bl.hexdump(*_dout);
4183 *_dout << dendl;
4184 }
4185 n += 4 + v.size() * 16;
4186 }
4187 if (n > 1048576) {
4188 // impose a semi-arbitrary limit to message size
4189 break;
4190 }
4191 it->next();
4192 }
4193
4194 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4195 reply->purged_snaps.swap(r);
4196 mon->send_reply(op, reply.detach());
4197
4198 return true;
4199 }
4200
4201 // osd beacon
4202 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4203 {
4204 op->mark_osdmon_event(__func__);
4205 // check caps
4206 auto session = op->get_session();
4207 mon->no_reply(op);
4208 if (!session) {
4209 dout(10) << __func__ << " no monitor session!" << dendl;
4210 return true;
4211 }
4212 if (!session->is_capable("osd", MON_CAP_X)) {
4213 derr << __func__ << " received from entity "
4214 << "with insufficient privileges " << session->caps << dendl;
4215 return true;
4216 }
4217 // Always forward the beacon to the leader, even if they are the same as
4218 // the old one. The leader will mark as down osds that haven't sent
4219 // beacon for a few minutes.
4220 return false;
4221 }
4222
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  // Process an OSD beacon on the leader.  Returns true only when the
  // beacon advances last_purged_snaps_scrub in the pending map (which
  // needs a proposal); everything else is in-memory bookkeeping.
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
           << " from " << src << dendl;
  int from = src.num();

  // require an up osd whose address matches the current map entry
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // in-memory liveness/epoch bookkeeping (not persisted in the map)
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer last_purged_snaps_scrub stamp, if the beacon has one
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4263
4264 // ---------------
4265 // map helpers
4266
4267 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4268 {
4269 op->mark_osdmon_event(__func__);
4270 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4271 << " start " << start << dendl;
4272 if (start == 0)
4273 send_full(op);
4274 else
4275 send_incremental(op, start);
4276 }
4277
4278
4279 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4280 {
4281 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4282 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4283 r->oldest_map = get_first_committed();
4284 r->newest_map = osdmap.get_epoch();
4285 return r;
4286 }
4287
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  // Build an MOSDMap carrying incremental maps for epochs [from, to],
  // encoded for `features`.  When an incremental is unavailable for some
  // epoch, the full map for that epoch is included instead.
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
           << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; `e > 0` also guards against epoch_t (unsigned) wrap
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental for this epoch; fall back to the full map
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
        dout(20) << "build_incremental full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4321
void OSDMonitor::send_full(MonOpRequestRef op)
{
  // Reply with the latest full osdmap, encoded for the requester's
  // connection features.
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon->send_reply(op, build_latest_full(op->get_session()->con_features));
}
4328
4329 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4330 {
4331 op->mark_osdmon_event(__func__);
4332
4333 MonSession *s = op->get_session();
4334 ceph_assert(s);
4335
4336 if (s->proxy_con) {
4337 // oh, we can tell the other mon to do it
4338 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4339 << first << dendl;
4340 MRoute *r = new MRoute(s->proxy_tid, NULL);
4341 r->send_osdmap_first = first;
4342 s->proxy_con->send_message(r);
4343 op->mark_event("reply: send routed send_osdmap_first reply");
4344 } else {
4345 // do it ourselves
4346 send_incremental(first, s, false, op);
4347 }
4348 }
4349
void OSDMonitor::send_incremental(epoch_t first,
                                  MonSession *session,
                                  bool onetime,
                                  MonOpRequestRef req)
{
  // Send osdmaps [first .. current] to `session`, in chunks of at most
  // osd_map_message_max epochs.  When `req` is set, exactly one chunk is
  // sent as a routed reply; when `onetime` is set we likewise stop after
  // one chunk.
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
             << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // the start of the range has been trimmed; send our oldest full map
    // as the base for subsequent incrementals
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // one reply per request; the peer will ask for more when ready
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
                                     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
4412
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  // Convenience overload: encode for the current quorum's features.
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
4417
4418 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4419 {
4420 OSDMap::Incremental inc;
4421 auto q = bl.cbegin();
4422 inc.decode(q);
4423 // always encode with subset of osdmap's canonical features
4424 uint64_t f = features & inc.encode_features;
4425 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4426 << dendl;
4427 bl.clear();
4428 if (inc.fullmap.length()) {
4429 // embedded full map?
4430 OSDMap m;
4431 m.decode(inc.fullmap);
4432 inc.fullmap.clear();
4433 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4434 }
4435 if (inc.crush.length()) {
4436 // embedded crush map
4437 CrushWrapper c;
4438 auto p = inc.crush.cbegin();
4439 c.decode(p);
4440 inc.crush.clear();
4441 c.encode(inc.crush, f);
4442 }
4443 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4444 }
4445
4446 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4447 {
4448 OSDMap m;
4449 auto q = bl.cbegin();
4450 m.decode(q);
4451 // always encode with subset of osdmap's canonical features
4452 uint64_t f = features & m.get_encoding_features();
4453 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4454 << dendl;
4455 bl.clear();
4456 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4457 }
4458
4459 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4460 {
4461 uint64_t significant_features = OSDMap::get_significant_features(features);
4462 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4463 return 0;
4464 }
4465 int ret = PaxosService::get_version(ver, bl);
4466 if (ret < 0) {
4467 return ret;
4468 }
4469 // NOTE: this check is imprecise; the OSDMap encoding features may
4470 // be a subset of the latest mon quorum features, but worst case we
4471 // reencode once and then cache the (identical) result under both
4472 // feature masks.
4473 if (significant_features !=
4474 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4475 reencode_incremental_map(bl, features);
4476 }
4477 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4478 return 0;
4479 }
4480
4481 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4482 {
4483 bufferlist inc_bl;
4484 int err = get_version(ver, inc_bl);
4485 ceph_assert(err == 0);
4486 ceph_assert(inc_bl.length());
4487
4488 auto p = inc_bl.cbegin();
4489 inc.decode(p);
4490 dout(10) << __func__ << " "
4491 << " epoch " << inc.epoch
4492 << " inc_crc " << inc.inc_crc
4493 << " full_crc " << inc.full_crc
4494 << " encode_features " << inc.encode_features << dendl;
4495 return 0;
4496 }
4497
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  // Rebuild the full osdmap for `ver` when its full encoding has been
  // trimmed: start from the closest pinned full map at or below `ver`
  // (tracked by the osdmap manifest) and replay incrementals on top.
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map newer than the pinned one, so fewer
  // incrementals need to be replayed
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    // remember the last incremental's features for the final encode below
    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4597
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  // Convenience overload: encode for the current quorum's features.
  return get_version_full(ver, mon->get_quorum_con_features(), bl);
}
4602
4603 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4604 bufferlist& bl)
4605 {
4606 uint64_t significant_features = OSDMap::get_significant_features(features);
4607 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4608 return 0;
4609 }
4610 int ret = PaxosService::get_version_full(ver, bl);
4611 if (ret == -ENOENT) {
4612 // build map?
4613 ret = get_full_from_pinned_map(ver, bl);
4614 }
4615 if (ret < 0) {
4616 return ret;
4617 }
4618 // NOTE: this check is imprecise; the OSDMap encoding features may
4619 // be a subset of the latest mon quorum features, but worst case we
4620 // reencode once and then cache the (identical) result under both
4621 // feature masks.
4622 if (significant_features !=
4623 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4624 reencode_full_map(bl, features);
4625 }
4626 full_osd_cache.add_bytes({ver, significant_features}, bl);
4627 return 0;
4628 }
4629
4630 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4631 {
4632 dout(10) << "blacklist " << av << " until " << until << dendl;
4633 for (auto a : av.v) {
4634 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4635 a.set_type(entity_addr_t::TYPE_ANY);
4636 } else {
4637 a.set_type(entity_addr_t::TYPE_LEGACY);
4638 }
4639 pending_inc.new_blacklist[a] = until;
4640 }
4641 return pending_inc.epoch;
4642 }
4643
4644 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4645 {
4646 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4647 a.set_type(entity_addr_t::TYPE_ANY);
4648 } else {
4649 a.set_type(entity_addr_t::TYPE_LEGACY);
4650 }
4651 dout(10) << "blacklist " << a << " until " << until << dendl;
4652 pending_inc.new_blacklist[a] = until;
4653 return pending_inc.epoch;
4654 }
4655
4656
4657 void OSDMonitor::check_osdmap_subs()
4658 {
4659 dout(10) << __func__ << dendl;
4660 if (!osdmap.get_epoch()) {
4661 return;
4662 }
4663 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4664 if (osdmap_subs == mon->session_map.subs.end()) {
4665 return;
4666 }
4667 auto p = osdmap_subs->second->begin();
4668 while (!p.end()) {
4669 auto sub = *p;
4670 ++p;
4671 check_osdmap_sub(sub);
4672 }
4673 }
4674
4675 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4676 {
4677 dout(10) << __func__ << " " << sub << " next " << sub->next
4678 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4679 if (sub->next <= osdmap.get_epoch()) {
4680 if (sub->next >= 1)
4681 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4682 else
4683 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4684 if (sub->onetime)
4685 mon->session_map.remove_sub(sub);
4686 else
4687 sub->next = osdmap.get_epoch() + 1;
4688 }
4689 }
4690
4691 void OSDMonitor::check_pg_creates_subs()
4692 {
4693 if (!osdmap.get_num_up_osds()) {
4694 return;
4695 }
4696 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4697 mon->with_session_map([this](const MonSessionMap& session_map) {
4698 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4699 if (pg_creates_subs == session_map.subs.end()) {
4700 return;
4701 }
4702 for (auto sub : *pg_creates_subs->second) {
4703 check_pg_creates_sub(sub);
4704 }
4705 });
4706 }
4707
4708 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4709 {
4710 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4711 ceph_assert(sub->type == "osd_pg_creates");
4712 // only send these if the OSD is up. we will check_subs() when they do
4713 // come up so they will get the creates then.
4714 if (sub->session->name.is_osd() &&
4715 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4716 sub->next = send_pg_creates(sub->session->name.num(),
4717 sub->session->con.get(),
4718 sub->next);
4719 }
4720 }
4721
4722 void OSDMonitor::do_application_enable(int64_t pool_id,
4723 const std::string &app_name,
4724 const std::string &app_key,
4725 const std::string &app_value)
4726 {
4727 ceph_assert(paxos->is_plugged() && is_writeable());
4728
4729 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4730 << dendl;
4731
4732 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
4733
4734 auto pp = osdmap.get_pg_pool(pool_id);
4735 ceph_assert(pp != nullptr);
4736
4737 pg_pool_t p = *pp;
4738 if (pending_inc.new_pools.count(pool_id)) {
4739 p = pending_inc.new_pools[pool_id];
4740 }
4741
4742 if (app_key.empty()) {
4743 p.application_metadata.insert({app_name, {}});
4744 } else {
4745 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4746 }
4747 p.last_change = pending_inc.epoch;
4748 pending_inc.new_pools[pool_id] = p;
4749 }
4750
4751 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4752 pool_opts_t::key_t opt,
4753 pool_opts_t::value_t val)
4754 {
4755 auto p = pending_inc.new_pools.try_emplace(
4756 pool_id, *osdmap.get_pg_pool(pool_id));
4757 p.first->second.opts.set(opt, val);
4758 }
4759
4760 unsigned OSDMonitor::scan_for_creating_pgs(
4761 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4762 const mempool::osdmap::set<int64_t>& removed_pools,
4763 utime_t modified,
4764 creating_pgs_t* creating_pgs) const
4765 {
4766 unsigned queued = 0;
4767 for (auto& p : pools) {
4768 int64_t poolid = p.first;
4769 if (creating_pgs->created_pools.count(poolid)) {
4770 dout(10) << __func__ << " already created " << poolid << dendl;
4771 continue;
4772 }
4773 const pg_pool_t& pool = p.second;
4774 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
4775 pool.get_type(), pool.get_size());
4776 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4777 continue;
4778
4779 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4780 const auto created = pool.get_last_change();
4781 if (last_scan_epoch && created <= last_scan_epoch) {
4782 dout(10) << __func__ << " no change in pool " << poolid
4783 << " " << pool << dendl;
4784 continue;
4785 }
4786 if (removed_pools.count(poolid)) {
4787 dout(10) << __func__ << " pool is being removed: " << poolid
4788 << " " << pool << dendl;
4789 continue;
4790 }
4791 dout(10) << __func__ << " queueing pool create for " << poolid
4792 << " " << pool << dendl;
4793 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4794 created, modified);
4795 queued++;
4796 }
4797 return queued;
4798 }
4799
void OSDMonitor::update_creating_pgs()
{
  // Recompute creating_pgs_by_osd_epoch: for each pg still being created,
  // determine its current acting primary and the epoch at which its create
  // message should be (re)issued.
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
           << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
               << dendl;
      continue;
    }
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
        if (pgs.second.count(spgid)) {
          if (last_acting_primary == acting_primary) {
            // same primary as before; keep the previously recorded epoch
            mapped = pgs.first;
          } else {
            dout(20) << __func__ << " " << pgid << " "
                     << " acting_primary:" << last_acting_primary
                     << " -> " << acting_primary << dendl;
            // note epoch if the target of the create message changed.
            mapped = mapping.get_epoch();
          }
          break;
        } else {
          // newly creating
          mapped = mapping.get_epoch();
        }
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
             << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4847
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  // Send pending pg-create messages for `osd`, starting from epoch `next`.
  // Returns the epoch the subscription is now current through (last + 1),
  // or `next` unchanged when nothing was sent.
  dout(30) << __func__ << " osd." << osd << " next=" << next
           << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
             << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // pre-nautilus targets get the legacy MOSDPGCreate message
  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
        if (!oldm) {
          oldm = new MOSDPGCreate(creating_pgs_epoch);
        }
        oldm->mkpg.emplace(pg.pgid,
                           pg_create_t{create->second.create_epoch, pg.pgid, 0});
        oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
        if (!m) {
          m = new MOSDPGCreate2(creating_pgs_epoch);
        }
        m->pgs.emplace(pg, make_pair(create->second.create_epoch,
                                     create->second.create_stamp));
        // include history/past_intervals when known, so the osd can seed them
        if (create->second.history.epoch_created) {
          dout(20) << __func__ << " " << pg << " " << create->second.history
                   << " " << create->second.past_intervals << dendl;
          m->pg_extra.emplace(pg, make_pair(create->second.history,
                                            create->second.past_intervals));
        }
      }
      dout(20) << __func__ << " will create " << pg
               << " at " << create->second.create_epoch << dendl;
    }
  }
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4919
4920 // TICK
4921
4922
// Periodic maintenance.  Every monitor refreshes the osdmap manifest; the
// leader additionally: marks beacon-silent osds down, processes failure
// reports, auto-marks long-down (or destroyed) osds out, expires blacklist
// entries, prunes purged snaps, refreshes pool status, proposes a new map
// when any of that changed state, and runs the cache balancer.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // everything below mutates pending state, which only the leader owns
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if they have stopped sending beacons?
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      // how long has this osd been down?
      utime_t down = now;
      down -= i->second;
      // advance before we potentially erase/overwrite entry `o` below,
      // so the iterator stays valid
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// normal osds use the (possibly laggy-adjusted) grace; destroyed
	// osds use a separate interval and ignore the grace scaling
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;  // keep the entry; grace has not elapsed yet
      }

      // osd was marked out (or is no longer down+in): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();

  {
    // cache balancing is only active with tcmalloc + autotune enabled
    std::lock_guard l(balancer_lock);
    if (ceph_using_tcmalloc() && mon_memory_autotune && pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }
}
5086
5087 void OSDMonitor::_set_new_cache_sizes()
5088 {
5089 uint64_t cache_size = 0;
5090 int64_t inc_alloc = 0;
5091 int64_t full_alloc = 0;
5092 int64_t kv_alloc = 0;
5093
5094 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5095 cache_size = pcm->get_tuned_mem();
5096 inc_alloc = inc_cache->get_committed_size();
5097 full_alloc = full_cache->get_committed_size();
5098 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5099 }
5100
5101 inc_osd_cache.set_bytes(inc_alloc);
5102 full_osd_cache.set_bytes(full_alloc);
5103
5104 dout(1) << __func__ << " cache_size:" << cache_size
5105 << " inc_alloc: " << inc_alloc
5106 << " full_alloc: " << full_alloc
5107 << " kv_alloc: " << kv_alloc
5108 << dendl;
5109 }
5110
5111 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5112 std::map<int,utime_t> &last_osd_report)
5113 {
5114 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5115 if (now - mon->get_leader_since() < timeo) {
5116 // We haven't been the leader for long enough to consider OSD timeouts
5117 return false;
5118 }
5119
5120 int max_osd = osdmap.get_max_osd();
5121 bool new_down = false;
5122
5123 for (int i=0; i < max_osd; ++i) {
5124 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5125 if (!osdmap.exists(i)) {
5126 last_osd_report.erase(i); // if any
5127 continue;
5128 }
5129 if (!osdmap.is_up(i))
5130 continue;
5131 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
5132 if (t == last_osd_report.end()) {
5133 // it wasn't in the map; start the timer.
5134 last_osd_report[i] = now;
5135 } else if (can_mark_down(i)) {
5136 utime_t diff = now - t->second;
5137 if (diff > timeo) {
5138 mon->clog->info() << "osd." << i << " marked down after no beacon for "
5139 << diff << " seconds";
5140 derr << "no beacon from osd." << i << " since " << t->second
5141 << ", " << diff << " seconds ago. marking down" << dendl;
5142 pending_inc.new_state[i] = CEPH_OSD_UP;
5143 new_down = true;
5144 }
5145 }
5146 }
5147 return new_down;
5148 }
5149
5150 static void dump_cpu_list(Formatter *f, const char *name,
5151 const string& strlist)
5152 {
5153 cpu_set_t cpu_set;
5154 size_t cpu_set_size;
5155 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5156 return;
5157 }
5158 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5159 f->open_array_section(name);
5160 for (auto cpu : cpus) {
5161 f->dump_int("cpu", cpu);
5162 }
5163 f->close_section();
5164 }
5165
5166 void OSDMonitor::dump_info(Formatter *f)
5167 {
5168 f->open_object_section("osdmap");
5169 osdmap.dump(f);
5170 f->close_section();
5171
5172 f->open_array_section("osd_metadata");
5173 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5174 if (osdmap.exists(i)) {
5175 f->open_object_section("osd");
5176 f->dump_unsigned("id", i);
5177 dump_osd_metadata(i, f, NULL);
5178 f->close_section();
5179 }
5180 }
5181 f->close_section();
5182
5183 f->dump_unsigned("osdmap_first_committed", get_first_committed());
5184 f->dump_unsigned("osdmap_last_committed", get_last_committed());
5185
5186 f->open_object_section("crushmap");
5187 osdmap.crush->dump(f);
5188 f->close_section();
5189
5190 if (has_osdmap_manifest) {
5191 f->open_object_section("osdmap_manifest");
5192 osdmap_manifest.dump(f);
5193 f->close_section();
5194 }
5195 }
5196
namespace {
  // Every property that `osd pool get` can report; used below to map the
  // user-supplied variable name to a typed choice.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the elements of `first` that do not appear in `second`
  // (plain set difference).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining(first);
    for (const auto& choice : second) {
      remaining.erase(choice);
    }
    return remaining;
  }
}
5230
5231
5232 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5233 {
5234 op->mark_osdmon_event(__func__);
5235 auto m = op->get_req<MMonCommand>();
5236 int r = 0;
5237 bufferlist rdata;
5238 stringstream ss, ds;
5239
5240 cmdmap_t cmdmap;
5241 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5242 string rs = ss.str();
5243 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5244 return true;
5245 }
5246
5247 MonSession *session = op->get_session();
5248 if (!session) {
5249 derr << __func__ << " no session" << dendl;
5250 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5251 return true;
5252 }
5253
5254 string prefix;
5255 cmd_getval(cmdmap, "prefix", prefix);
5256
5257 string format;
5258 cmd_getval(cmdmap, "format", format, string("plain"));
5259 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5260
5261 if (prefix == "osd stat") {
5262 if (f) {
5263 f->open_object_section("osdmap");
5264 osdmap.print_summary(f.get(), ds, "", true);
5265 f->close_section();
5266 f->flush(rdata);
5267 } else {
5268 osdmap.print_summary(nullptr, ds, "", true);
5269 rdata.append(ds);
5270 }
5271 }
5272 else if (prefix == "osd dump" ||
5273 prefix == "osd tree" ||
5274 prefix == "osd tree-from" ||
5275 prefix == "osd ls" ||
5276 prefix == "osd getmap" ||
5277 prefix == "osd getcrushmap" ||
5278 prefix == "osd ls-tree" ||
5279 prefix == "osd info") {
5280 string val;
5281
5282 epoch_t epoch = 0;
5283 int64_t epochnum;
5284 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5285 epoch = epochnum;
5286
5287 bufferlist osdmap_bl;
5288 int err = get_version_full(epoch, osdmap_bl);
5289 if (err == -ENOENT) {
5290 r = -ENOENT;
5291 ss << "there is no map for epoch " << epoch;
5292 goto reply;
5293 }
5294 ceph_assert(err == 0);
5295 ceph_assert(osdmap_bl.length());
5296
5297 OSDMap *p;
5298 if (epoch == osdmap.get_epoch()) {
5299 p = &osdmap;
5300 } else {
5301 p = new OSDMap;
5302 p->decode(osdmap_bl);
5303 }
5304
5305 auto sg = make_scope_guard([&] {
5306 if (p != &osdmap) {
5307 delete p;
5308 }
5309 });
5310
5311 if (prefix == "osd dump") {
5312 stringstream ds;
5313 if (f) {
5314 f->open_object_section("osdmap");
5315 p->dump(f.get());
5316 f->close_section();
5317 f->flush(ds);
5318 } else {
5319 p->print(ds);
5320 }
5321 rdata.append(ds);
5322 if (!f)
5323 ds << " ";
5324 } else if (prefix == "osd ls") {
5325 if (f) {
5326 f->open_array_section("osds");
5327 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5328 if (osdmap.exists(i)) {
5329 f->dump_int("osd", i);
5330 }
5331 }
5332 f->close_section();
5333 f->flush(ds);
5334 } else {
5335 bool first = true;
5336 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5337 if (osdmap.exists(i)) {
5338 if (!first)
5339 ds << "\n";
5340 first = false;
5341 ds << i;
5342 }
5343 }
5344 }
5345 rdata.append(ds);
5346 } else if (prefix == "osd info") {
5347 int64_t osd_id;
5348 bool do_single_osd = true;
5349 if (!cmd_getval(cmdmap, "id", osd_id)) {
5350 do_single_osd = false;
5351 }
5352
5353 if (do_single_osd && !osdmap.exists(osd_id)) {
5354 ss << "osd." << osd_id << " does not exist";
5355 r = -EINVAL;
5356 goto reply;
5357 }
5358
5359 if (f) {
5360 if (do_single_osd) {
5361 osdmap.dump_osd(osd_id, f.get());
5362 } else {
5363 osdmap.dump_osds(f.get());
5364 }
5365 f->flush(ds);
5366 } else {
5367 if (do_single_osd) {
5368 osdmap.print_osd(osd_id, ds);
5369 } else {
5370 osdmap.print_osds(ds);
5371 }
5372 }
5373 rdata.append(ds);
5374 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5375 string bucket;
5376 if (prefix == "osd tree-from") {
5377 cmd_getval(cmdmap, "bucket", bucket);
5378 if (!osdmap.crush->name_exists(bucket)) {
5379 ss << "bucket '" << bucket << "' does not exist";
5380 r = -ENOENT;
5381 goto reply;
5382 }
5383 int id = osdmap.crush->get_item_id(bucket);
5384 if (id >= 0) {
5385 ss << "\"" << bucket << "\" is not a bucket";
5386 r = -EINVAL;
5387 goto reply;
5388 }
5389 }
5390
5391 vector<string> states;
5392 cmd_getval(cmdmap, "states", states);
5393 unsigned filter = 0;
5394 for (auto& s : states) {
5395 if (s == "up") {
5396 filter |= OSDMap::DUMP_UP;
5397 } else if (s == "down") {
5398 filter |= OSDMap::DUMP_DOWN;
5399 } else if (s == "in") {
5400 filter |= OSDMap::DUMP_IN;
5401 } else if (s == "out") {
5402 filter |= OSDMap::DUMP_OUT;
5403 } else if (s == "destroyed") {
5404 filter |= OSDMap::DUMP_DESTROYED;
5405 } else {
5406 ss << "unrecognized state '" << s << "'";
5407 r = -EINVAL;
5408 goto reply;
5409 }
5410 }
5411 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5412 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5413 ss << "cannot specify both 'in' and 'out'";
5414 r = -EINVAL;
5415 goto reply;
5416 }
5417 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5418 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5419 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5420 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5421 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5422 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5423 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5424 r = -EINVAL;
5425 goto reply;
5426 }
5427 if (f) {
5428 f->open_object_section("tree");
5429 p->print_tree(f.get(), NULL, filter, bucket);
5430 f->close_section();
5431 f->flush(ds);
5432 } else {
5433 p->print_tree(NULL, &ds, filter, bucket);
5434 }
5435 rdata.append(ds);
5436 } else if (prefix == "osd getmap") {
5437 rdata.append(osdmap_bl);
5438 ss << "got osdmap epoch " << p->get_epoch();
5439 } else if (prefix == "osd getcrushmap") {
5440 p->crush->encode(rdata, mon->get_quorum_con_features());
5441 ss << p->get_crush_version();
5442 } else if (prefix == "osd ls-tree") {
5443 string bucket_name;
5444 cmd_getval(cmdmap, "name", bucket_name);
5445 set<int> osds;
5446 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5447 if (r == -ENOENT) {
5448 ss << "\"" << bucket_name << "\" does not exist";
5449 goto reply;
5450 } else if (r < 0) {
5451 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5452 goto reply;
5453 }
5454
5455 if (f) {
5456 f->open_array_section("osds");
5457 for (auto &i : osds) {
5458 if (osdmap.exists(i)) {
5459 f->dump_int("osd", i);
5460 }
5461 }
5462 f->close_section();
5463 f->flush(ds);
5464 } else {
5465 bool first = true;
5466 for (auto &i : osds) {
5467 if (osdmap.exists(i)) {
5468 if (!first)
5469 ds << "\n";
5470 first = false;
5471 ds << i;
5472 }
5473 }
5474 }
5475
5476 rdata.append(ds);
5477 }
5478 } else if (prefix == "osd getmaxosd") {
5479 if (f) {
5480 f->open_object_section("getmaxosd");
5481 f->dump_unsigned("epoch", osdmap.get_epoch());
5482 f->dump_int("max_osd", osdmap.get_max_osd());
5483 f->close_section();
5484 f->flush(rdata);
5485 } else {
5486 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5487 rdata.append(ds);
5488 }
5489 } else if (prefix == "osd utilization") {
5490 string out;
5491 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5492 if (f)
5493 f->flush(rdata);
5494 else
5495 rdata.append(out);
5496 r = 0;
5497 goto reply;
5498 } else if (prefix == "osd find") {
5499 int64_t osd;
5500 if (!cmd_getval(cmdmap, "id", osd)) {
5501 ss << "unable to parse osd id value '"
5502 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5503 r = -EINVAL;
5504 goto reply;
5505 }
5506 if (!osdmap.exists(osd)) {
5507 ss << "osd." << osd << " does not exist";
5508 r = -ENOENT;
5509 goto reply;
5510 }
5511 string format;
5512 cmd_getval(cmdmap, "format", format);
5513 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5514 f->open_object_section("osd_location");
5515 f->dump_int("osd", osd);
5516 f->dump_object("addrs", osdmap.get_addrs(osd));
5517 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5518
5519 // try to identify host, pod/container name, etc.
5520 map<string,string> m;
5521 load_metadata(osd, m, nullptr);
5522 if (auto p = m.find("hostname"); p != m.end()) {
5523 f->dump_string("host", p->second);
5524 }
5525 for (auto& k : {
5526 "pod_name", "pod_namespace", // set by rook
5527 "container_name" // set by cephadm, ceph-ansible
5528 }) {
5529 if (auto p = m.find(k); p != m.end()) {
5530 f->dump_string(k, p->second);
5531 }
5532 }
5533
5534 // crush is helpful too
5535 f->open_object_section("crush_location");
5536 map<string,string> loc = osdmap.crush->get_full_location(osd);
5537 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5538 f->dump_string(p->first.c_str(), p->second);
5539 f->close_section();
5540 f->close_section();
5541 f->flush(rdata);
5542 } else if (prefix == "osd metadata") {
5543 int64_t osd = -1;
5544 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5545 !cmd_getval(cmdmap, "id", osd)) {
5546 ss << "unable to parse osd id value '"
5547 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5548 r = -EINVAL;
5549 goto reply;
5550 }
5551 if (osd >= 0 && !osdmap.exists(osd)) {
5552 ss << "osd." << osd << " does not exist";
5553 r = -ENOENT;
5554 goto reply;
5555 }
5556 string format;
5557 cmd_getval(cmdmap, "format", format);
5558 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5559 if (osd >= 0) {
5560 f->open_object_section("osd_metadata");
5561 f->dump_unsigned("id", osd);
5562 r = dump_osd_metadata(osd, f.get(), &ss);
5563 if (r < 0)
5564 goto reply;
5565 f->close_section();
5566 } else {
5567 r = 0;
5568 f->open_array_section("osd_metadata");
5569 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5570 if (osdmap.exists(i)) {
5571 f->open_object_section("osd");
5572 f->dump_unsigned("id", i);
5573 r = dump_osd_metadata(i, f.get(), NULL);
5574 if (r == -EINVAL || r == -ENOENT) {
5575 // Drop error, continue to get other daemons' metadata
5576 dout(4) << "No metadata for osd." << i << dendl;
5577 r = 0;
5578 } else if (r < 0) {
5579 // Unexpected error
5580 goto reply;
5581 }
5582 f->close_section();
5583 }
5584 }
5585 f->close_section();
5586 }
5587 f->flush(rdata);
5588 } else if (prefix == "osd versions") {
5589 if (!f)
5590 f.reset(Formatter::create("json-pretty"));
5591 count_metadata("ceph_version", f.get());
5592 f->flush(rdata);
5593 r = 0;
5594 } else if (prefix == "osd count-metadata") {
5595 if (!f)
5596 f.reset(Formatter::create("json-pretty"));
5597 string field;
5598 cmd_getval(cmdmap, "property", field);
5599 count_metadata(field, f.get());
5600 f->flush(rdata);
5601 r = 0;
5602 } else if (prefix == "osd numa-status") {
5603 TextTable tbl;
5604 if (f) {
5605 f->open_array_section("osds");
5606 } else {
5607 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5608 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5609 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5610 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5611 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5612 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5613 }
5614 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5615 if (osdmap.exists(i)) {
5616 map<string,string> m;
5617 ostringstream err;
5618 if (load_metadata(i, m, &err) < 0) {
5619 continue;
5620 }
5621 string host;
5622 auto p = m.find("hostname");
5623 if (p != m.end()) {
5624 host = p->second;
5625 }
5626 if (f) {
5627 f->open_object_section("osd");
5628 f->dump_int("osd", i);
5629 f->dump_string("host", host);
5630 for (auto n : { "network_numa_node", "objectstore_numa_node",
5631 "numa_node" }) {
5632 p = m.find(n);
5633 if (p != m.end()) {
5634 f->dump_int(n, atoi(p->second.c_str()));
5635 }
5636 }
5637 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5638 p = m.find(n);
5639 if (p != m.end()) {
5640 list<string> ls = get_str_list(p->second, ",");
5641 f->open_array_section(n);
5642 for (auto node : ls) {
5643 f->dump_int("node", atoi(node.c_str()));
5644 }
5645 f->close_section();
5646 }
5647 }
5648 for (auto n : { "numa_node_cpus" }) {
5649 p = m.find(n);
5650 if (p != m.end()) {
5651 dump_cpu_list(f.get(), n, p->second);
5652 }
5653 }
5654 f->close_section();
5655 } else {
5656 tbl << i;
5657 tbl << host;
5658 p = m.find("network_numa_nodes");
5659 if (p != m.end()) {
5660 tbl << p->second;
5661 } else {
5662 tbl << "-";
5663 }
5664 p = m.find("objectstore_numa_nodes");
5665 if (p != m.end()) {
5666 tbl << p->second;
5667 } else {
5668 tbl << "-";
5669 }
5670 p = m.find("numa_node");
5671 auto q = m.find("numa_node_cpus");
5672 if (p != m.end() && q != m.end()) {
5673 tbl << p->second;
5674 tbl << q->second;
5675 } else {
5676 tbl << "-";
5677 tbl << "-";
5678 }
5679 tbl << TextTable::endrow;
5680 }
5681 }
5682 }
5683 if (f) {
5684 f->close_section();
5685 f->flush(rdata);
5686 } else {
5687 rdata.append(stringify(tbl));
5688 }
5689 } else if (prefix == "osd map") {
5690 string poolstr, objstr, namespacestr;
5691 cmd_getval(cmdmap, "pool", poolstr);
5692 cmd_getval(cmdmap, "object", objstr);
5693 cmd_getval(cmdmap, "nspace", namespacestr);
5694
5695 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5696 if (pool < 0) {
5697 ss << "pool " << poolstr << " does not exist";
5698 r = -ENOENT;
5699 goto reply;
5700 }
5701 object_locator_t oloc(pool, namespacestr);
5702 object_t oid(objstr);
5703 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5704 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5705 vector<int> up, acting;
5706 int up_p, acting_p;
5707 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5708
5709 string fullobjname;
5710 if (!namespacestr.empty())
5711 fullobjname = namespacestr + string("/") + oid.name;
5712 else
5713 fullobjname = oid.name;
5714 if (f) {
5715 f->open_object_section("osd_map");
5716 f->dump_unsigned("epoch", osdmap.get_epoch());
5717 f->dump_string("pool", poolstr);
5718 f->dump_int("pool_id", pool);
5719 f->dump_stream("objname") << fullobjname;
5720 f->dump_stream("raw_pgid") << pgid;
5721 f->dump_stream("pgid") << mpgid;
5722 f->open_array_section("up");
5723 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5724 f->dump_int("osd", *p);
5725 f->close_section();
5726 f->dump_int("up_primary", up_p);
5727 f->open_array_section("acting");
5728 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5729 f->dump_int("osd", *p);
5730 f->close_section();
5731 f->dump_int("acting_primary", acting_p);
5732 f->close_section(); // osd_map
5733 f->flush(rdata);
5734 } else {
5735 ds << "osdmap e" << osdmap.get_epoch()
5736 << " pool '" << poolstr << "' (" << pool << ")"
5737 << " object '" << fullobjname << "' ->"
5738 << " pg " << pgid << " (" << mpgid << ")"
5739 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5740 << pg_vector_string(acting) << ", p" << acting_p << ")";
5741 rdata.append(ds);
5742 }
5743
5744 } else if (prefix == "pg map") {
5745 pg_t pgid;
5746 string pgidstr;
5747 cmd_getval(cmdmap, "pgid", pgidstr);
5748 if (!pgid.parse(pgidstr.c_str())) {
5749 ss << "invalid pgid '" << pgidstr << "'";
5750 r = -EINVAL;
5751 goto reply;
5752 }
5753 vector<int> up, acting;
5754 if (!osdmap.have_pg_pool(pgid.pool())) {
5755 ss << "pg '" << pgidstr << "' does not exist";
5756 r = -ENOENT;
5757 goto reply;
5758 }
5759 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5760 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5761 if (f) {
5762 f->open_object_section("pg_map");
5763 f->dump_unsigned("epoch", osdmap.get_epoch());
5764 f->dump_stream("raw_pgid") << pgid;
5765 f->dump_stream("pgid") << mpgid;
5766 f->open_array_section("up");
5767 for (auto osd : up) {
5768 f->dump_int("up_osd", osd);
5769 }
5770 f->close_section();
5771 f->open_array_section("acting");
5772 for (auto osd : acting) {
5773 f->dump_int("acting_osd", osd);
5774 }
5775 f->close_section();
5776 f->close_section();
5777 f->flush(rdata);
5778 } else {
5779 ds << "osdmap e" << osdmap.get_epoch()
5780 << " pg " << pgid << " (" << mpgid << ")"
5781 << " -> up " << up << " acting " << acting;
5782 rdata.append(ds);
5783 }
5784 goto reply;
5785
5786 } else if (prefix == "osd lspools") {
5787 if (f)
5788 f->open_array_section("pools");
5789 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5790 p != osdmap.pools.end();
5791 ++p) {
5792 if (f) {
5793 f->open_object_section("pool");
5794 f->dump_int("poolnum", p->first);
5795 f->dump_string("poolname", osdmap.pool_name[p->first]);
5796 f->close_section();
5797 } else {
5798 ds << p->first << ' ' << osdmap.pool_name[p->first];
5799 if (next(p) != osdmap.pools.end()) {
5800 ds << '\n';
5801 }
5802 }
5803 }
5804 if (f) {
5805 f->close_section();
5806 f->flush(ds);
5807 }
5808 rdata.append(ds);
5809 } else if (prefix == "osd blacklist ls") {
5810 if (f)
5811 f->open_array_section("blacklist");
5812
5813 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5814 p != osdmap.blacklist.end();
5815 ++p) {
5816 if (f) {
5817 f->open_object_section("entry");
5818 f->dump_string("addr", p->first.get_legacy_str());
5819 f->dump_stream("until") << p->second;
5820 f->close_section();
5821 } else {
5822 stringstream ss;
5823 string s;
5824 ss << p->first << " " << p->second;
5825 getline(ss, s);
5826 s += "\n";
5827 rdata.append(s);
5828 }
5829 }
5830 if (f) {
5831 f->close_section();
5832 f->flush(rdata);
5833 }
5834 ss << "listed " << osdmap.blacklist.size() << " entries";
5835
5836 } else if (prefix == "osd pool ls") {
5837 string detail;
5838 cmd_getval(cmdmap, "detail", detail);
5839 if (!f && detail == "detail") {
5840 ostringstream ss;
5841 osdmap.print_pools(ss);
5842 rdata.append(ss.str());
5843 } else {
5844 if (f)
5845 f->open_array_section("pools");
5846 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5847 it != osdmap.get_pools().end();
5848 ++it) {
5849 if (f) {
5850 if (detail == "detail") {
5851 f->open_object_section("pool");
5852 f->dump_int("pool_id", it->first);
5853 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5854 it->second.dump(f.get());
5855 f->close_section();
5856 } else {
5857 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5858 }
5859 } else {
5860 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5861 }
5862 }
5863 if (f) {
5864 f->close_section();
5865 f->flush(rdata);
5866 }
5867 }
5868
5869 } else if (prefix == "osd crush get-tunable") {
5870 string tunable;
5871 cmd_getval(cmdmap, "tunable", tunable);
5872 ostringstream rss;
5873 if (f)
5874 f->open_object_section("tunable");
5875 if (tunable == "straw_calc_version") {
5876 if (f)
5877 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5878 else
5879 rss << osdmap.crush->get_straw_calc_version() << "\n";
5880 } else {
5881 r = -EINVAL;
5882 goto reply;
5883 }
5884 if (f) {
5885 f->close_section();
5886 f->flush(rdata);
5887 } else {
5888 rdata.append(rss.str());
5889 }
5890 r = 0;
5891
5892 } else if (prefix == "osd pool get") {
5893 string poolstr;
5894 cmd_getval(cmdmap, "pool", poolstr);
5895 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5896 if (pool < 0) {
5897 ss << "unrecognized pool '" << poolstr << "'";
5898 r = -ENOENT;
5899 goto reply;
5900 }
5901
5902 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5903 string var;
5904 cmd_getval(cmdmap, "var", var);
5905
5906 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5907 const choices_map_t ALL_CHOICES = {
5908 {"size", SIZE},
5909 {"min_size", MIN_SIZE},
5910 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
5911 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5912 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
5913 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5914 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5915 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5916 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5917 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5918 {"use_gmt_hitset", USE_GMT_HITSET},
5919 {"target_max_objects", TARGET_MAX_OBJECTS},
5920 {"target_max_bytes", TARGET_MAX_BYTES},
5921 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
5922 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
5923 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
5924 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
5925 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
5926 {"erasure_code_profile", ERASURE_CODE_PROFILE},
5927 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
5928 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
5929 {"fast_read", FAST_READ},
5930 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
5931 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
5932 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
5933 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
5934 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
5935 {"recovery_priority", RECOVERY_PRIORITY},
5936 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
5937 {"scrub_priority", SCRUB_PRIORITY},
5938 {"compression_mode", COMPRESSION_MODE},
5939 {"compression_algorithm", COMPRESSION_ALGORITHM},
5940 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
5941 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
5942 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
5943 {"csum_type", CSUM_TYPE},
5944 {"csum_max_block", CSUM_MAX_BLOCK},
5945 {"csum_min_block", CSUM_MIN_BLOCK},
5946 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
5947 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
5948 {"pg_num_min", PG_NUM_MIN},
5949 {"target_size_bytes", TARGET_SIZE_BYTES},
5950 {"target_size_ratio", TARGET_SIZE_RATIO},
5951 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
5952 };
5953
5954 typedef std::set<osd_pool_get_choices> choices_set_t;
5955
5956 const choices_set_t ONLY_TIER_CHOICES = {
5957 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
5958 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
5959 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
5960 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
5961 MIN_READ_RECENCY_FOR_PROMOTE,
5962 MIN_WRITE_RECENCY_FOR_PROMOTE,
5963 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
5964 };
5965 const choices_set_t ONLY_ERASURE_CHOICES = {
5966 EC_OVERWRITES, ERASURE_CODE_PROFILE
5967 };
5968
5969 choices_set_t selected_choices;
5970 if (var == "all") {
5971 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
5972 it != ALL_CHOICES.end(); ++it) {
5973 selected_choices.insert(it->second);
5974 }
5975
5976 if(!p->is_tier()) {
5977 selected_choices = subtract_second_from_first(selected_choices,
5978 ONLY_TIER_CHOICES);
5979 }
5980
5981 if(!p->is_erasure()) {
5982 selected_choices = subtract_second_from_first(selected_choices,
5983 ONLY_ERASURE_CHOICES);
5984 }
5985 } else /* var != "all" */ {
5986 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
5987 osd_pool_get_choices selected = found->second;
5988
5989 if (!p->is_tier() &&
5990 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
5991 ss << "pool '" << poolstr
5992 << "' is not a tier pool: variable not applicable";
5993 r = -EACCES;
5994 goto reply;
5995 }
5996
5997 if (!p->is_erasure() &&
5998 ONLY_ERASURE_CHOICES.find(selected)
5999 != ONLY_ERASURE_CHOICES.end()) {
6000 ss << "pool '" << poolstr
6001 << "' is not a erasure pool: variable not applicable";
6002 r = -EACCES;
6003 goto reply;
6004 }
6005
6006 if (pool_opts_t::is_opt_name(var) &&
6007 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6008 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6009 r = -ENOENT;
6010 goto reply;
6011 }
6012
6013 selected_choices.insert(selected);
6014 }
6015
6016 if (f) {
6017 f->open_object_section("pool");
6018 f->dump_string("pool", poolstr);
6019 f->dump_int("pool_id", pool);
6020 for(choices_set_t::const_iterator it = selected_choices.begin();
6021 it != selected_choices.end(); ++it) {
6022 choices_map_t::const_iterator i;
6023 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6024 if (i->second == *it) {
6025 break;
6026 }
6027 }
6028 ceph_assert(i != ALL_CHOICES.end());
6029 switch(*it) {
6030 case PG_NUM:
6031 f->dump_int("pg_num", p->get_pg_num());
6032 break;
6033 case PGP_NUM:
6034 f->dump_int("pgp_num", p->get_pgp_num());
6035 break;
6036 case SIZE:
6037 f->dump_int("size", p->get_size());
6038 break;
6039 case MIN_SIZE:
6040 f->dump_int("min_size", p->get_min_size());
6041 break;
6042 case CRUSH_RULE:
6043 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6044 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6045 p->get_crush_rule()));
6046 } else {
6047 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6048 }
6049 break;
6050 case EC_OVERWRITES:
6051 f->dump_bool("allow_ec_overwrites",
6052 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6053 break;
6054 case PG_AUTOSCALE_MODE:
6055 f->dump_string("pg_autoscale_mode",
6056 pg_pool_t::get_pg_autoscale_mode_name(
6057 p->pg_autoscale_mode));
6058 break;
6059 case HASHPSPOOL:
6060 case NODELETE:
6061 case NOPGCHANGE:
6062 case NOSIZECHANGE:
6063 case WRITE_FADVISE_DONTNEED:
6064 case NOSCRUB:
6065 case NODEEP_SCRUB:
6066 f->dump_bool(i->first.c_str(),
6067 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6068 break;
6069 case HIT_SET_PERIOD:
6070 f->dump_int("hit_set_period", p->hit_set_period);
6071 break;
6072 case HIT_SET_COUNT:
6073 f->dump_int("hit_set_count", p->hit_set_count);
6074 break;
6075 case HIT_SET_TYPE:
6076 f->dump_string("hit_set_type",
6077 HitSet::get_type_name(p->hit_set_params.get_type()));
6078 break;
6079 case HIT_SET_FPP:
6080 {
6081 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6082 BloomHitSet::Params *bloomp =
6083 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6084 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6085 } else if(var != "all") {
6086 f->close_section();
6087 ss << "hit set is not of type Bloom; " <<
6088 "invalid to get a false positive rate!";
6089 r = -EINVAL;
6090 goto reply;
6091 }
6092 }
6093 break;
6094 case USE_GMT_HITSET:
6095 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6096 break;
6097 case TARGET_MAX_OBJECTS:
6098 f->dump_unsigned("target_max_objects", p->target_max_objects);
6099 break;
6100 case TARGET_MAX_BYTES:
6101 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6102 break;
6103 case CACHE_TARGET_DIRTY_RATIO:
6104 f->dump_unsigned("cache_target_dirty_ratio_micro",
6105 p->cache_target_dirty_ratio_micro);
6106 f->dump_float("cache_target_dirty_ratio",
6107 ((float)p->cache_target_dirty_ratio_micro/1000000));
6108 break;
6109 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6110 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6111 p->cache_target_dirty_high_ratio_micro);
6112 f->dump_float("cache_target_dirty_high_ratio",
6113 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6114 break;
6115 case CACHE_TARGET_FULL_RATIO:
6116 f->dump_unsigned("cache_target_full_ratio_micro",
6117 p->cache_target_full_ratio_micro);
6118 f->dump_float("cache_target_full_ratio",
6119 ((float)p->cache_target_full_ratio_micro/1000000));
6120 break;
6121 case CACHE_MIN_FLUSH_AGE:
6122 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6123 break;
6124 case CACHE_MIN_EVICT_AGE:
6125 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6126 break;
6127 case ERASURE_CODE_PROFILE:
6128 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6129 break;
6130 case MIN_READ_RECENCY_FOR_PROMOTE:
6131 f->dump_int("min_read_recency_for_promote",
6132 p->min_read_recency_for_promote);
6133 break;
6134 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6135 f->dump_int("min_write_recency_for_promote",
6136 p->min_write_recency_for_promote);
6137 break;
6138 case FAST_READ:
6139 f->dump_int("fast_read", p->fast_read);
6140 break;
6141 case HIT_SET_GRADE_DECAY_RATE:
6142 f->dump_int("hit_set_grade_decay_rate",
6143 p->hit_set_grade_decay_rate);
6144 break;
6145 case HIT_SET_SEARCH_LAST_N:
6146 f->dump_int("hit_set_search_last_n",
6147 p->hit_set_search_last_n);
6148 break;
6149 case SCRUB_MIN_INTERVAL:
6150 case SCRUB_MAX_INTERVAL:
6151 case DEEP_SCRUB_INTERVAL:
6152 case RECOVERY_PRIORITY:
6153 case RECOVERY_OP_PRIORITY:
6154 case SCRUB_PRIORITY:
6155 case COMPRESSION_MODE:
6156 case COMPRESSION_ALGORITHM:
6157 case COMPRESSION_REQUIRED_RATIO:
6158 case COMPRESSION_MAX_BLOB_SIZE:
6159 case COMPRESSION_MIN_BLOB_SIZE:
6160 case CSUM_TYPE:
6161 case CSUM_MAX_BLOCK:
6162 case CSUM_MIN_BLOCK:
6163 case FINGERPRINT_ALGORITHM:
6164 case PG_NUM_MIN:
6165 case TARGET_SIZE_BYTES:
6166 case TARGET_SIZE_RATIO:
6167 case PG_AUTOSCALE_BIAS:
6168 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6169 if (p->opts.is_set(key)) {
6170 if(*it == CSUM_TYPE) {
6171 int64_t val;
6172 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6173 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6174 } else {
6175 p->opts.dump(i->first, f.get());
6176 }
6177 }
6178 break;
6179 }
6180 }
6181 f->close_section();
6182 f->flush(rdata);
6183 } else /* !f */ {
6184 for(choices_set_t::const_iterator it = selected_choices.begin();
6185 it != selected_choices.end(); ++it) {
6186 choices_map_t::const_iterator i;
6187 switch(*it) {
6188 case PG_NUM:
6189 ss << "pg_num: " << p->get_pg_num() << "\n";
6190 break;
6191 case PGP_NUM:
6192 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6193 break;
6194 case SIZE:
6195 ss << "size: " << p->get_size() << "\n";
6196 break;
6197 case MIN_SIZE:
6198 ss << "min_size: " << p->get_min_size() << "\n";
6199 break;
6200 case CRUSH_RULE:
6201 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6202 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6203 p->get_crush_rule()) << "\n";
6204 } else {
6205 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6206 }
6207 break;
6208 case PG_AUTOSCALE_MODE:
6209 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6210 p->pg_autoscale_mode) <<"\n";
6211 break;
6212 case HIT_SET_PERIOD:
6213 ss << "hit_set_period: " << p->hit_set_period << "\n";
6214 break;
6215 case HIT_SET_COUNT:
6216 ss << "hit_set_count: " << p->hit_set_count << "\n";
6217 break;
6218 case HIT_SET_TYPE:
6219 ss << "hit_set_type: " <<
6220 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6221 break;
6222 case HIT_SET_FPP:
6223 {
6224 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6225 BloomHitSet::Params *bloomp =
6226 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6227 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6228 } else if(var != "all") {
6229 ss << "hit set is not of type Bloom; " <<
6230 "invalid to get a false positive rate!";
6231 r = -EINVAL;
6232 goto reply;
6233 }
6234 }
6235 break;
6236 case USE_GMT_HITSET:
6237 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6238 break;
6239 case TARGET_MAX_OBJECTS:
6240 ss << "target_max_objects: " << p->target_max_objects << "\n";
6241 break;
6242 case TARGET_MAX_BYTES:
6243 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6244 break;
6245 case CACHE_TARGET_DIRTY_RATIO:
6246 ss << "cache_target_dirty_ratio: "
6247 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6248 break;
6249 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6250 ss << "cache_target_dirty_high_ratio: "
6251 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6252 break;
6253 case CACHE_TARGET_FULL_RATIO:
6254 ss << "cache_target_full_ratio: "
6255 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6256 break;
6257 case CACHE_MIN_FLUSH_AGE:
6258 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6259 break;
6260 case CACHE_MIN_EVICT_AGE:
6261 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6262 break;
6263 case ERASURE_CODE_PROFILE:
6264 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6265 break;
6266 case MIN_READ_RECENCY_FOR_PROMOTE:
6267 ss << "min_read_recency_for_promote: " <<
6268 p->min_read_recency_for_promote << "\n";
6269 break;
6270 case HIT_SET_GRADE_DECAY_RATE:
6271 ss << "hit_set_grade_decay_rate: " <<
6272 p->hit_set_grade_decay_rate << "\n";
6273 break;
6274 case HIT_SET_SEARCH_LAST_N:
6275 ss << "hit_set_search_last_n: " <<
6276 p->hit_set_search_last_n << "\n";
6277 break;
6278 case EC_OVERWRITES:
6279 ss << "allow_ec_overwrites: " <<
6280 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6281 "\n";
6282 break;
6283 case HASHPSPOOL:
6284 case NODELETE:
6285 case NOPGCHANGE:
6286 case NOSIZECHANGE:
6287 case WRITE_FADVISE_DONTNEED:
6288 case NOSCRUB:
6289 case NODEEP_SCRUB:
6290 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6291 if (i->second == *it)
6292 break;
6293 }
6294 ceph_assert(i != ALL_CHOICES.end());
6295 ss << i->first << ": " <<
6296 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6297 "true" : "false") << "\n";
6298 break;
6299 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6300 ss << "min_write_recency_for_promote: " <<
6301 p->min_write_recency_for_promote << "\n";
6302 break;
6303 case FAST_READ:
6304 ss << "fast_read: " << p->fast_read << "\n";
6305 break;
6306 case SCRUB_MIN_INTERVAL:
6307 case SCRUB_MAX_INTERVAL:
6308 case DEEP_SCRUB_INTERVAL:
6309 case RECOVERY_PRIORITY:
6310 case RECOVERY_OP_PRIORITY:
6311 case SCRUB_PRIORITY:
6312 case COMPRESSION_MODE:
6313 case COMPRESSION_ALGORITHM:
6314 case COMPRESSION_REQUIRED_RATIO:
6315 case COMPRESSION_MAX_BLOB_SIZE:
6316 case COMPRESSION_MIN_BLOB_SIZE:
6317 case CSUM_TYPE:
6318 case CSUM_MAX_BLOCK:
6319 case CSUM_MIN_BLOCK:
6320 case FINGERPRINT_ALGORITHM:
6321 case PG_NUM_MIN:
6322 case TARGET_SIZE_BYTES:
6323 case TARGET_SIZE_RATIO:
6324 case PG_AUTOSCALE_BIAS:
6325 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6326 if (i->second == *it)
6327 break;
6328 }
6329 ceph_assert(i != ALL_CHOICES.end());
6330 {
6331 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6332 if (p->opts.is_set(key)) {
6333 if(key == pool_opts_t::CSUM_TYPE) {
6334 int64_t val;
6335 p->opts.get(key, &val);
6336 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6337 } else {
6338 ss << i->first << ": " << p->opts.get(key) << "\n";
6339 }
6340 }
6341 }
6342 break;
6343 }
6344 rdata.append(ss.str());
6345 ss.str("");
6346 }
6347 }
6348 r = 0;
6349 } else if (prefix == "osd pool get-quota") {
6350 string pool_name;
6351 cmd_getval(cmdmap, "pool", pool_name);
6352
6353 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6354 if (poolid < 0) {
6355 ceph_assert(poolid == -ENOENT);
6356 ss << "unrecognized pool '" << pool_name << "'";
6357 r = -ENOENT;
6358 goto reply;
6359 }
6360 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6361 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6362 const object_stat_sum_t& sum = pstat->stats.sum;
6363 if (f) {
6364 f->open_object_section("pool_quotas");
6365 f->dump_string("pool_name", pool_name);
6366 f->dump_unsigned("pool_id", poolid);
6367 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6368 f->dump_int("current_num_objects", sum.num_objects);
6369 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6370 f->dump_int("current_num_bytes", sum.num_bytes);
6371 f->close_section();
6372 f->flush(rdata);
6373 } else {
6374 stringstream rs;
6375 rs << "quotas for pool '" << pool_name << "':\n"
6376 << " max objects: ";
6377 if (p->quota_max_objects == 0)
6378 rs << "N/A";
6379 else {
6380 rs << si_u_t(p->quota_max_objects) << " objects";
6381 rs << " (current num objects: " << sum.num_objects << " objects)";
6382 }
6383 rs << "\n"
6384 << " max bytes : ";
6385 if (p->quota_max_bytes == 0)
6386 rs << "N/A";
6387 else {
6388 rs << byte_u_t(p->quota_max_bytes);
6389 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6390 }
6391 rdata.append(rs.str());
6392 }
6393 rdata.append("\n");
6394 r = 0;
6395 } else if (prefix == "osd crush rule list" ||
6396 prefix == "osd crush rule ls") {
6397 if (f) {
6398 f->open_array_section("rules");
6399 osdmap.crush->list_rules(f.get());
6400 f->close_section();
6401 f->flush(rdata);
6402 } else {
6403 ostringstream ss;
6404 osdmap.crush->list_rules(&ss);
6405 rdata.append(ss.str());
6406 }
6407 } else if (prefix == "osd crush rule ls-by-class") {
6408 string class_name;
6409 cmd_getval(cmdmap, "class", class_name);
6410 if (class_name.empty()) {
6411 ss << "no class specified";
6412 r = -EINVAL;
6413 goto reply;
6414 }
6415 set<int> rules;
6416 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6417 if (r < 0) {
6418 ss << "failed to get rules by class '" << class_name << "'";
6419 goto reply;
6420 }
6421 if (f) {
6422 f->open_array_section("rules");
6423 for (auto &rule: rules) {
6424 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6425 }
6426 f->close_section();
6427 f->flush(rdata);
6428 } else {
6429 ostringstream rs;
6430 for (auto &rule: rules) {
6431 rs << osdmap.crush->get_rule_name(rule) << "\n";
6432 }
6433 rdata.append(rs.str());
6434 }
6435 } else if (prefix == "osd crush rule dump") {
6436 string name;
6437 cmd_getval(cmdmap, "name", name);
6438 string format;
6439 cmd_getval(cmdmap, "format", format);
6440 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6441 if (name == "") {
6442 f->open_array_section("rules");
6443 osdmap.crush->dump_rules(f.get());
6444 f->close_section();
6445 } else {
6446 int ruleno = osdmap.crush->get_rule_id(name);
6447 if (ruleno < 0) {
6448 ss << "unknown crush rule '" << name << "'";
6449 r = ruleno;
6450 goto reply;
6451 }
6452 osdmap.crush->dump_rule(ruleno, f.get());
6453 }
6454 ostringstream rs;
6455 f->flush(rs);
6456 rs << "\n";
6457 rdata.append(rs.str());
6458 } else if (prefix == "osd crush dump") {
6459 string format;
6460 cmd_getval(cmdmap, "format", format);
6461 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6462 f->open_object_section("crush_map");
6463 osdmap.crush->dump(f.get());
6464 f->close_section();
6465 ostringstream rs;
6466 f->flush(rs);
6467 rs << "\n";
6468 rdata.append(rs.str());
6469 } else if (prefix == "osd crush show-tunables") {
6470 string format;
6471 cmd_getval(cmdmap, "format", format);
6472 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6473 f->open_object_section("crush_map_tunables");
6474 osdmap.crush->dump_tunables(f.get());
6475 f->close_section();
6476 ostringstream rs;
6477 f->flush(rs);
6478 rs << "\n";
6479 rdata.append(rs.str());
6480 } else if (prefix == "osd crush tree") {
6481 string shadow;
6482 cmd_getval(cmdmap, "shadow", shadow);
6483 bool show_shadow = shadow == "--show-shadow";
6484 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6485 if (f) {
6486 f->open_object_section("crush_tree");
6487 osdmap.crush->dump_tree(nullptr,
6488 f.get(),
6489 osdmap.get_pool_names(),
6490 show_shadow);
6491 f->close_section();
6492 f->flush(rdata);
6493 } else {
6494 ostringstream ss;
6495 osdmap.crush->dump_tree(&ss,
6496 nullptr,
6497 osdmap.get_pool_names(),
6498 show_shadow);
6499 rdata.append(ss.str());
6500 }
6501 } else if (prefix == "osd crush ls") {
6502 string name;
6503 if (!cmd_getval(cmdmap, "node", name)) {
6504 ss << "no node specified";
6505 r = -EINVAL;
6506 goto reply;
6507 }
6508 if (!osdmap.crush->name_exists(name)) {
6509 ss << "node '" << name << "' does not exist";
6510 r = -ENOENT;
6511 goto reply;
6512 }
6513 int id = osdmap.crush->get_item_id(name);
6514 list<int> result;
6515 if (id >= 0) {
6516 result.push_back(id);
6517 } else {
6518 int num = osdmap.crush->get_bucket_size(id);
6519 for (int i = 0; i < num; ++i) {
6520 result.push_back(osdmap.crush->get_bucket_item(id, i));
6521 }
6522 }
6523 if (f) {
6524 f->open_array_section("items");
6525 for (auto i : result) {
6526 f->dump_string("item", osdmap.crush->get_item_name(i));
6527 }
6528 f->close_section();
6529 f->flush(rdata);
6530 } else {
6531 ostringstream ss;
6532 for (auto i : result) {
6533 ss << osdmap.crush->get_item_name(i) << "\n";
6534 }
6535 rdata.append(ss.str());
6536 }
6537 r = 0;
6538 } else if (prefix == "osd crush class ls") {
6539 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6540 f->open_array_section("crush_classes");
6541 for (auto i : osdmap.crush->class_name)
6542 f->dump_string("class", i.second);
6543 f->close_section();
6544 f->flush(rdata);
6545 } else if (prefix == "osd crush class ls-osd") {
6546 string name;
6547 cmd_getval(cmdmap, "class", name);
6548 set<int> osds;
6549 osdmap.crush->get_devices_by_class(name, &osds);
6550 if (f) {
6551 f->open_array_section("osds");
6552 for (auto &osd: osds)
6553 f->dump_int("osd", osd);
6554 f->close_section();
6555 f->flush(rdata);
6556 } else {
6557 bool first = true;
6558 for (auto &osd : osds) {
6559 if (!first)
6560 ds << "\n";
6561 first = false;
6562 ds << osd;
6563 }
6564 rdata.append(ds);
6565 }
6566 } else if (prefix == "osd crush get-device-class") {
6567 vector<string> idvec;
6568 cmd_getval(cmdmap, "ids", idvec);
6569 map<int, string> class_by_osd;
6570 for (auto& id : idvec) {
6571 ostringstream ts;
6572 long osd = parse_osd_id(id.c_str(), &ts);
6573 if (osd < 0) {
6574 ss << "unable to parse osd id:'" << id << "'";
6575 r = -EINVAL;
6576 goto reply;
6577 }
6578 auto device_class = osdmap.crush->get_item_class(osd);
6579 if (device_class)
6580 class_by_osd[osd] = device_class;
6581 else
6582 class_by_osd[osd] = ""; // no class
6583 }
6584 if (f) {
6585 f->open_array_section("osd_device_classes");
6586 for (auto& i : class_by_osd) {
6587 f->open_object_section("osd_device_class");
6588 f->dump_int("osd", i.first);
6589 f->dump_string("device_class", i.second);
6590 f->close_section();
6591 }
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 if (class_by_osd.size() == 1) {
6596 // for single input, make a clean output
6597 ds << class_by_osd.begin()->second;
6598 } else {
6599 // note that we do not group osds by class here
6600 for (auto it = class_by_osd.begin();
6601 it != class_by_osd.end();
6602 it++) {
6603 ds << "osd." << it->first << ' ' << it->second;
6604 if (next(it) != class_by_osd.end())
6605 ds << '\n';
6606 }
6607 }
6608 rdata.append(ds);
6609 }
6610 } else if (prefix == "osd erasure-code-profile ls") {
6611 const auto &profiles = osdmap.get_erasure_code_profiles();
6612 if (f)
6613 f->open_array_section("erasure-code-profiles");
6614 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6615 if (f)
6616 f->dump_string("profile", i->first.c_str());
6617 else
6618 rdata.append(i->first + "\n");
6619 }
6620 if (f) {
6621 f->close_section();
6622 ostringstream rs;
6623 f->flush(rs);
6624 rs << "\n";
6625 rdata.append(rs.str());
6626 }
6627 } else if (prefix == "osd crush weight-set ls") {
6628 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6629 if (f) {
6630 f->open_array_section("weight_sets");
6631 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6632 f->dump_string("pool", "(compat)");
6633 }
6634 for (auto& i : osdmap.crush->choose_args) {
6635 if (i.first >= 0) {
6636 f->dump_string("pool", osdmap.get_pool_name(i.first));
6637 }
6638 }
6639 f->close_section();
6640 f->flush(rdata);
6641 } else {
6642 ostringstream rs;
6643 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6644 rs << "(compat)\n";
6645 }
6646 for (auto& i : osdmap.crush->choose_args) {
6647 if (i.first >= 0) {
6648 rs << osdmap.get_pool_name(i.first) << "\n";
6649 }
6650 }
6651 rdata.append(rs.str());
6652 }
6653 } else if (prefix == "osd crush weight-set dump") {
6654 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6655 "json-pretty"));
6656 osdmap.crush->dump_choose_args(f.get());
6657 f->flush(rdata);
6658 } else if (prefix == "osd erasure-code-profile get") {
6659 string name;
6660 cmd_getval(cmdmap, "name", name);
6661 if (!osdmap.has_erasure_code_profile(name)) {
6662 ss << "unknown erasure code profile '" << name << "'";
6663 r = -ENOENT;
6664 goto reply;
6665 }
6666 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6667 if (f)
6668 f->open_object_section("profile");
6669 for (map<string,string>::const_iterator i = profile.begin();
6670 i != profile.end();
6671 ++i) {
6672 if (f)
6673 f->dump_string(i->first.c_str(), i->second.c_str());
6674 else
6675 rdata.append(i->first + "=" + i->second + "\n");
6676 }
6677 if (f) {
6678 f->close_section();
6679 ostringstream rs;
6680 f->flush(rs);
6681 rs << "\n";
6682 rdata.append(rs.str());
6683 }
6684 } else if (prefix == "osd pool application get") {
6685 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6686 "json-pretty"));
6687 string pool_name;
6688 cmd_getval(cmdmap, "pool", pool_name);
6689 string app;
6690 cmd_getval(cmdmap, "app", app);
6691 string key;
6692 cmd_getval(cmdmap, "key", key);
6693
6694 if (pool_name.empty()) {
6695 // all
6696 f->open_object_section("pools");
6697 for (const auto &pool : osdmap.pools) {
6698 std::string name("<unknown>");
6699 const auto &pni = osdmap.pool_name.find(pool.first);
6700 if (pni != osdmap.pool_name.end())
6701 name = pni->second;
6702 f->open_object_section(name.c_str());
6703 for (auto &app_pair : pool.second.application_metadata) {
6704 f->open_object_section(app_pair.first.c_str());
6705 for (auto &kv_pair : app_pair.second) {
6706 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6707 }
6708 f->close_section();
6709 }
6710 f->close_section(); // name
6711 }
6712 f->close_section(); // pools
6713 f->flush(rdata);
6714 } else {
6715 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6716 if (pool < 0) {
6717 ss << "unrecognized pool '" << pool_name << "'";
6718 r = -ENOENT;
6719 goto reply;
6720 }
6721 auto p = osdmap.get_pg_pool(pool);
6722 // filter by pool
6723 if (app.empty()) {
6724 f->open_object_section(pool_name.c_str());
6725 for (auto &app_pair : p->application_metadata) {
6726 f->open_object_section(app_pair.first.c_str());
6727 for (auto &kv_pair : app_pair.second) {
6728 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6729 }
6730 f->close_section(); // application
6731 }
6732 f->close_section(); // pool_name
6733 f->flush(rdata);
6734 goto reply;
6735 }
6736
6737 auto app_it = p->application_metadata.find(app);
6738 if (app_it == p->application_metadata.end()) {
6739 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6740 r = -ENOENT;
6741 goto reply;
6742 }
6743 // filter by pool + app
6744 if (key.empty()) {
6745 f->open_object_section(app_it->first.c_str());
6746 for (auto &kv_pair : app_it->second) {
6747 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6748 }
6749 f->close_section(); // application
6750 f->flush(rdata);
6751 goto reply;
6752 }
6753 // filter by pool + app + key
6754 auto key_it = app_it->second.find(key);
6755 if (key_it == app_it->second.end()) {
6756 ss << "application '" << app << "' on pool '" << pool_name
6757 << "' does not have key '" << key << "'";
6758 r = -ENOENT;
6759 goto reply;
6760 }
6761 ss << key_it->second << "\n";
6762 rdata.append(ss.str());
6763 ss.str("");
6764 }
6765 } else if (prefix == "osd get-require-min-compat-client") {
6766 ss << osdmap.require_min_compat_client << std::endl;
6767 rdata.append(ss.str());
6768 ss.str("");
6769 goto reply;
6770 } else if (prefix == "osd pool application enable" ||
6771 prefix == "osd pool application disable" ||
6772 prefix == "osd pool application set" ||
6773 prefix == "osd pool application rm") {
6774 bool changed = false;
6775 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6776 if (r != 0) {
6777 // Error, reply.
6778 goto reply;
6779 } else if (changed) {
6780 // Valid mutation, proceed to prepare phase
6781 return false;
6782 } else {
6783 // Idempotent case, reply
6784 goto reply;
6785 }
6786 } else {
6787 // try prepare update
6788 return false;
6789 }
6790
6791 reply:
6792 string rs;
6793 getline(ss, rs);
6794 mon->reply_command(op, r, rs, rdata, get_last_committed());
6795 return true;
6796 }
6797
6798 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6799 {
6800 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6801 osdmap.get_pg_pool(pool_id));
6802 ceph_assert(pool);
6803 pool->set_flag(flags);
6804 }
6805
6806 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6807 {
6808 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6809 osdmap.get_pg_pool(pool_id));
6810 ceph_assert(pool);
6811 pool->unset_flag(flags);
6812 }
6813
6814 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6815 {
6816 char k[80];
6817 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6818 return k;
6819 }
6820
6821 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6822 {
6823 char k[80];
6824 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6825 (unsigned long long)pool, (unsigned long long)snap);
6826 return k;
6827 }
6828
6829 string OSDMonitor::make_purged_snap_key_value(
6830 int64_t pool, snapid_t snap, snapid_t num,
6831 epoch_t epoch, bufferlist *v)
6832 {
6833 // encode the *last* epoch in the key so that we can use forward
6834 // iteration only to search for an epoch in an interval.
6835 encode(snap, *v);
6836 encode(snap + num, *v);
6837 encode(epoch, *v);
6838 return make_purged_snap_key(pool, snap + num - 1);
6839 }
6840
6841
// Look up the purged-snap interval containing 'snap' for the given pool.
// On success, returns 0 and sets *begin/*end to the half-open interval
// [begin, end); returns -ENOENT when no recorded interval covers 'snap'.
//
// Keys are "purged_snap_<pool>_<hex last-snap-of-interval>" (see
// make_purged_snap_key_value), so lower_bound on the key built from
// 'snap' lands on the only record whose interval could contain it.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // No key at or after ours: nothing recorded this far out.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // We ran off the end of the purged_snap_* keyspace into some other
    // record type stored under the same prefix.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - key '" << k << "' got '" << it->key()
             << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // Parse the pool id back out of the key we actually landed on.
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // Landed on the next pool's records; no interval for 'snap' in ours.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - key '" << k << "' got '" << gotk
             << "', wrong pool " << keypool
             << dendl;
    return -ENOENT;
  }
  // The value encodes [begin, end) for the interval this key terminates.
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // An interval exists for this pool, but it does not cover 'snap'.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - found [" << *begin << "," << *end << "), no overlap"
             << dendl;
    return -ENOENT;
  }
  return 0;
}
6891
// Record the newly purged interval [start, end) for 'pool' in transaction
// t, coalescing it with any adjacent intervals already on disk so the
// purged-snap keyspace stays a set of maximal disjoint ranges.
//
// Adjacency is detected by probing for intervals containing start-1
// (immediately before) and end (immediately after); four merge cases
// follow from which probes hit.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // b == 0: an existing interval ends exactly where ours starts.
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // a == 0: an existing interval begins exactly where ours ends.
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // Bridges two existing intervals: merge all three into one record
    // spanning [before_begin, after_end).
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // Extends an earlier interval forward: replace it with
    // [before_begin, end).  The old key (keyed by its last snap) must be
    // erased because the merged record gets a new, larger key.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // Extends a later interval backward: the merged interval
    // [start, after_end) keeps the same last snap, hence the same key,
    // so simply overwrite the existing record in place.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // No neighbors: write a fresh stand-alone record for [start, end).
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
6947
// Move snaps that the mgr reports as purged by all PGs into
// pending_inc.new_purged_snaps (at most mon_max_snap_prune_per_epoch per
// epoch), so they can be dropped from removed_snaps_queue.  Returns true
// if anything was queued for pruning in this epoch.
bool OSDMonitor::try_prune_purged_snaps()
{
  // need a fresh digest from the mgr stat monitor
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;  // 0 means "no limit"; use a large batch instead
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break;  // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;  // hit the per-epoch budget
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7026
// Compare each pool's usage (from the mgr stat digest) against its quota
// and set or clear the FULL_QUOTA/FULL flags accordingly.  Returns true
// if any pool's flags were changed (i.e. a new map should be proposed).
bool OSDMonitor::update_pools_status()
{
  // quota decisions need fresh pool stats
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;  // no stats yet for this pool
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is "full" when it hits either its byte or object quota
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked full-by-quota; clear the flags once usage drops
      if (pool_is_full)
        continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
                       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not yet marked; mark it if a quota has been exceeded
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7087
7088 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7089 {
7090 op->mark_osdmon_event(__func__);
7091 auto m = op->get_req<MPoolOp>();
7092 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7093 MonSession *session = op->get_session();
7094 if (!session)
7095 return -EPERM;
7096 string erasure_code_profile;
7097 stringstream ss;
7098 string rule_name;
7099 int ret = 0;
7100 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7101 0, 0, 0, 0, 0, 0.0,
7102 erasure_code_profile,
7103 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7104 &ss);
7105
7106 if (ret < 0) {
7107 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7108 }
7109 return ret;
7110 }
7111
7112 int OSDMonitor::crush_rename_bucket(const string& srcname,
7113 const string& dstname,
7114 ostream *ss)
7115 {
7116 int ret;
7117 //
7118 // Avoid creating a pending crush if it does not already exists and
7119 // the rename would fail.
7120 //
7121 if (!_have_pending_crush()) {
7122 ret = _get_stable_crush().can_rename_bucket(srcname,
7123 dstname,
7124 ss);
7125 if (ret)
7126 return ret;
7127 }
7128
7129 CrushWrapper newcrush;
7130 _get_pending_crush(newcrush);
7131
7132 ret = newcrush.rename_bucket(srcname,
7133 dstname,
7134 ss);
7135 if (ret)
7136 return ret;
7137
7138 pending_inc.crush.clear();
7139 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7140 *ss << "renamed bucket " << srcname << " into " << dstname;
7141 return 0;
7142 }
7143
7144 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7145 {
7146 string replacement = "";
7147
7148 if (plugin == "jerasure_generic" ||
7149 plugin == "jerasure_sse3" ||
7150 plugin == "jerasure_sse4" ||
7151 plugin == "jerasure_neon") {
7152 replacement = "jerasure";
7153 } else if (plugin == "shec_generic" ||
7154 plugin == "shec_sse3" ||
7155 plugin == "shec_sse4" ||
7156 plugin == "shec_neon") {
7157 replacement = "shec";
7158 }
7159
7160 if (replacement != "") {
7161 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7162 << plugin << " that has been deprecated. Please use "
7163 << replacement << " instead." << dendl;
7164 }
7165 }
7166
7167 int OSDMonitor::normalize_profile(const string& profilename,
7168 ErasureCodeProfile &profile,
7169 bool force,
7170 ostream *ss)
7171 {
7172 ErasureCodeInterfaceRef erasure_code;
7173 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7174 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7175 check_legacy_ec_plugin(plugin->second, profilename);
7176 int err = instance.factory(plugin->second,
7177 g_conf().get_val<std::string>("erasure_code_dir"),
7178 profile, &erasure_code, ss);
7179 if (err) {
7180 return err;
7181 }
7182
7183 err = erasure_code->init(profile, ss);
7184 if (err) {
7185 return err;
7186 }
7187
7188 auto it = profile.find("stripe_unit");
7189 if (it != profile.end()) {
7190 string err_str;
7191 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7192 if (!err_str.empty()) {
7193 *ss << "could not parse stripe_unit '" << it->second
7194 << "': " << err_str << std::endl;
7195 return -EINVAL;
7196 }
7197 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7198 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7199 if (chunk_size != stripe_unit) {
7200 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7201 << "alignment. Would be padded to " << chunk_size
7202 << std::endl;
7203 return -EINVAL;
7204 }
7205 if ((stripe_unit % 4096) != 0 && !force) {
7206 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7207 << "use --force to override this check" << std::endl;
7208 return -EINVAL;
7209 }
7210 }
7211 return 0;
7212 }
7213
7214 int OSDMonitor::crush_rule_create_erasure(const string &name,
7215 const string &profile,
7216 int *rule,
7217 ostream *ss)
7218 {
7219 int ruleid = osdmap.crush->get_rule_id(name);
7220 if (ruleid != -ENOENT) {
7221 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7222 return -EEXIST;
7223 }
7224
7225 CrushWrapper newcrush;
7226 _get_pending_crush(newcrush);
7227
7228 ruleid = newcrush.get_rule_id(name);
7229 if (ruleid != -ENOENT) {
7230 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7231 return -EALREADY;
7232 } else {
7233 ErasureCodeInterfaceRef erasure_code;
7234 int err = get_erasure_code(profile, &erasure_code, ss);
7235 if (err) {
7236 *ss << "failed to load plugin using profile " << profile << std::endl;
7237 return err;
7238 }
7239
7240 err = erasure_code->create_rule(name, newcrush, ss);
7241 erasure_code.reset();
7242 if (err < 0)
7243 return err;
7244 *rule = err;
7245 pending_inc.crush.clear();
7246 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7247 return 0;
7248 }
7249 }
7250
7251 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7252 ErasureCodeInterfaceRef *erasure_code,
7253 ostream *ss) const
7254 {
7255 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7256 return -EAGAIN;
7257 ErasureCodeProfile profile =
7258 osdmap.get_erasure_code_profile(erasure_code_profile);
7259 ErasureCodeProfile::const_iterator plugin =
7260 profile.find("plugin");
7261 if (plugin == profile.end()) {
7262 *ss << "cannot determine the erasure code plugin"
7263 << " because there is no 'plugin' entry in the erasure_code_profile "
7264 << profile << std::endl;
7265 return -EINVAL;
7266 }
7267 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7268 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7269 return instance.factory(plugin->second,
7270 g_conf().get_val<std::string>("erasure_code_dir"),
7271 profile, erasure_code, ss);
7272 }
7273
7274 int OSDMonitor::check_cluster_features(uint64_t features,
7275 stringstream &ss)
7276 {
7277 stringstream unsupported_ss;
7278 int unsupported_count = 0;
7279 if ((mon->get_quorum_con_features() & features) != features) {
7280 unsupported_ss << "the monitor cluster";
7281 ++unsupported_count;
7282 }
7283
7284 set<int32_t> up_osds;
7285 osdmap.get_up_osds(up_osds);
7286 for (set<int32_t>::iterator it = up_osds.begin();
7287 it != up_osds.end(); ++it) {
7288 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7289 if ((xi.features & features) != features) {
7290 if (unsupported_count > 0)
7291 unsupported_ss << ", ";
7292 unsupported_ss << "osd." << *it;
7293 unsupported_count ++;
7294 }
7295 }
7296
7297 if (unsupported_count > 0) {
7298 ss << "features " << features << " unsupported by: "
7299 << unsupported_ss.str();
7300 return -ENOTSUP;
7301 }
7302
7303 // check pending osd state, too!
7304 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7305 pending_inc.new_xinfo.begin();
7306 p != pending_inc.new_xinfo.end(); ++p) {
7307 const osd_xinfo_t &xi = p->second;
7308 if ((xi.features & features) != features) {
7309 dout(10) << __func__ << " pending osd." << p->first
7310 << " features are insufficient; retry" << dendl;
7311 return -EAGAIN;
7312 }
7313 }
7314
7315 return 0;
7316 }
7317
7318 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7319 stringstream& ss)
7320 {
7321 OSDMap::Incremental new_pending = pending_inc;
7322 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7323 OSDMap newmap;
7324 newmap.deepish_copy_from(osdmap);
7325 newmap.apply_incremental(new_pending);
7326
7327 // client compat
7328 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7329 auto mv = newmap.get_min_compat_client();
7330 if (mv > newmap.require_min_compat_client) {
7331 ss << "new crush map requires client version " << mv
7332 << " but require_min_compat_client is "
7333 << newmap.require_min_compat_client;
7334 return false;
7335 }
7336 }
7337
7338 // osd compat
7339 uint64_t features =
7340 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7341 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7342 stringstream features_ss;
7343 int r = check_cluster_features(features, features_ss);
7344 if (r) {
7345 ss << "Could not change CRUSH: " << features_ss.str();
7346 return false;
7347 }
7348
7349 return true;
7350 }
7351
7352 bool OSDMonitor::erasure_code_profile_in_use(
7353 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7354 const string &profile,
7355 ostream *ss)
7356 {
7357 bool found = false;
7358 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7359 p != pools.end();
7360 ++p) {
7361 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7362 *ss << osdmap.pool_name[p->first] << " ";
7363 found = true;
7364 }
7365 }
7366 if (found) {
7367 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7368 }
7369 return found;
7370 }
7371
7372 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7373 map<string,string> *erasure_code_profile_map,
7374 ostream *ss)
7375 {
7376 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7377 get_json_str_map,
7378 *ss,
7379 erasure_code_profile_map,
7380 true);
7381 if (r)
7382 return r;
7383 ceph_assert((*erasure_code_profile_map).count("plugin"));
7384 string default_plugin = (*erasure_code_profile_map)["plugin"];
7385 map<string,string> user_map;
7386 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7387 i != erasure_code_profile.end();
7388 ++i) {
7389 size_t equal = i->find('=');
7390 if (equal == string::npos) {
7391 user_map[*i] = string();
7392 (*erasure_code_profile_map)[*i] = string();
7393 } else {
7394 const string key = i->substr(0, equal);
7395 equal++;
7396 const string value = i->substr(equal);
7397 if (key.find("ruleset-") == 0) {
7398 *ss << "property '" << key << "' is no longer supported; try "
7399 << "'crush-" << key.substr(8) << "' instead";
7400 return -EINVAL;
7401 }
7402 user_map[key] = value;
7403 (*erasure_code_profile_map)[key] = value;
7404 }
7405 }
7406
7407 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7408 (*erasure_code_profile_map) = user_map;
7409
7410 return 0;
7411 }
7412
7413 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7414 const string &erasure_code_profile,
7415 uint8_t repl_size,
7416 unsigned *size, unsigned *min_size,
7417 ostream *ss)
7418 {
7419 int err = 0;
7420 switch (pool_type) {
7421 case pg_pool_t::TYPE_REPLICATED:
7422 if (repl_size == 0) {
7423 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7424 }
7425 *size = repl_size;
7426 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7427 break;
7428 case pg_pool_t::TYPE_ERASURE:
7429 {
7430 ErasureCodeInterfaceRef erasure_code;
7431 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7432 if (err == 0) {
7433 *size = erasure_code->get_chunk_count();
7434 *min_size =
7435 erasure_code->get_data_chunk_count() +
7436 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7437 assert(*min_size <= *size);
7438 assert(*min_size >= erasure_code->get_data_chunk_count());
7439 }
7440 }
7441 break;
7442 default:
7443 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7444 err = -EINVAL;
7445 break;
7446 }
7447 return err;
7448 }
7449
7450 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7451 const string &erasure_code_profile,
7452 uint32_t *stripe_width,
7453 ostream *ss)
7454 {
7455 int err = 0;
7456 switch (pool_type) {
7457 case pg_pool_t::TYPE_REPLICATED:
7458 // ignored
7459 break;
7460 case pg_pool_t::TYPE_ERASURE:
7461 {
7462 ErasureCodeProfile profile =
7463 osdmap.get_erasure_code_profile(erasure_code_profile);
7464 ErasureCodeInterfaceRef erasure_code;
7465 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7466 if (err)
7467 break;
7468 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7469 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7470 auto it = profile.find("stripe_unit");
7471 if (it != profile.end()) {
7472 string err_str;
7473 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7474 ceph_assert(err_str.empty());
7475 }
7476 *stripe_width = data_chunks *
7477 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7478 }
7479 break;
7480 default:
7481 *ss << "prepare_pool_stripe_width: "
7482 << pool_type << " is not a known pool type";
7483 err = -EINVAL;
7484 break;
7485 }
7486 return err;
7487 }
7488
// Resolve the crush rule for a new pool.  If *crush_rule >= 0 it is only
// validated; otherwise it is looked up by name (replicated) or created
// from the erasure code profile (erasure).  May return -EAGAIN when the
// rule exists only in the pending map and the caller must retry after
// the proposal commits.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule already committed: usable right now
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule id; just validate it
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7550
7551 int OSDMonitor::get_crush_rule(const string &rule_name,
7552 int *crush_rule,
7553 ostream *ss)
7554 {
7555 int ret;
7556 ret = osdmap.crush->get_rule_id(rule_name);
7557 if (ret != -ENOENT) {
7558 // found it, use it
7559 *crush_rule = ret;
7560 } else {
7561 CrushWrapper newcrush;
7562 _get_pending_crush(newcrush);
7563
7564 ret = newcrush.get_rule_id(rule_name);
7565 if (ret != -ENOENT) {
7566 // found it, wait for it to be proposed
7567 dout(20) << __func__ << ": rule " << rule_name
7568 << " try again" << dendl;
7569 return -EAGAIN;
7570 } else {
7571 // Cannot find it , return error
7572 *ss << "specified rule " << rule_name << " doesn't exist";
7573 return ret;
7574 }
7575 }
7576 return 0;
7577 }
7578
7579 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7580 {
7581 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7582 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7583 auto max_pgs = max_pgs_per_osd * num_osds;
7584 uint64_t projected = 0;
7585 if (pool < 0) {
7586 projected += pg_num * size;
7587 }
7588 for (const auto& i : osdmap.get_pools()) {
7589 if (i.first == pool) {
7590 projected += pg_num * size;
7591 } else {
7592 projected += i.second.get_pg_num_target() * i.second.get_size();
7593 }
7594 }
7595 if (projected > max_pgs) {
7596 if (pool >= 0) {
7597 *ss << "pool id " << pool;
7598 }
7599 *ss << " pg_num " << pg_num << " size " << size
7600 << " would mean " << projected
7601 << " total pgs, which exceeds max " << max_pgs
7602 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7603 << " * num_in_osds " << num_osds << ")";
7604 return -ERANGE;
7605 }
7606 return 0;
7607 }
7608
/**
 * Stage creation of a new pool in the pending osdmap increment.
 *
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rulset <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min If nonzero, stored as the PG_NUM_MIN pool option
 *        (nautilus+ clusters only)
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes If nonzero, stored as the TARGET_SIZE_BYTES pool
 *        option (nautilus+ clusters only)
 * @param target_size_ratio If >0, stored as the TARGET_SIZE_RATIO pool
 *        option (nautilus+ clusters only)
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode per-pool autoscale mode; unrecognized/empty
 *        values fall back to the configured default
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 ostream *ss)
{
  // --- validate arguments, filling in config defaults ---
  if (name.length() == 0)
    return -EINVAL;
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve and sanity-check the crush rule ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  if (g_conf()->mon_osd_crush_smoke_test) {
    // run the rule through a forked CrushTester to catch rules that
    // cannot produce valid mappings before committing the pool
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }

  // --- derive size/min_size and check PG budget ---
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // fast_read only applies to erasure pools
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
    case FAST_READ_OFF:
      fread = false;
      break;
    case FAST_READ_ON:
      fread = true;
      break;
    case FAST_READ_DEFAULT:
      fread = g_conf()->osd_pool_default_ec_fast_read;
      break;
    default:
      *ss << "invalid fast_read setting: " << fast_read;
      return -EINVAL;
    }
  }

  // creating the same pool name twice in one proposal is a no-op
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate the pool id and fill in the pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;

  // default autoscale mode comes from config; may be overridden below
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // start with at most mon_osd_max_initial_pgs; the autoscaler (or admin)
  // grows toward the pg_num target afterwards
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  // per-pool autoscale mode overrides the config default when recognized
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // cache-tier defaults (harmless for non-tier pools)
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
7825
7826 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7827 {
7828 op->mark_osdmon_event(__func__);
7829 ostringstream ss;
7830 if (pending_inc.new_flags < 0)
7831 pending_inc.new_flags = osdmap.get_flags();
7832 pending_inc.new_flags |= flag;
7833 ss << OSDMap::get_flag_string(flag) << " is set";
7834 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7835 get_last_committed() + 1));
7836 return true;
7837 }
7838
7839 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7840 {
7841 op->mark_osdmon_event(__func__);
7842 ostringstream ss;
7843 if (pending_inc.new_flags < 0)
7844 pending_inc.new_flags = osdmap.get_flags();
7845 pending_inc.new_flags &= ~flag;
7846 ss << OSDMap::get_flag_string(flag) << " is unset";
7847 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7848 get_last_committed() + 1));
7849 return true;
7850 }
7851
/**
 * Handle "ceph osd pool set <pool> <var> <val>".
 *
 * Validates <var>/<val> against the pool's current state (including any
 * change already staged in pending_inc.new_pools), applies the change to
 * a local copy of the pg_pool_t, and on success stages that copy in
 * pending_inc.new_pools.
 *
 * @param cmdmap parsed command args: "pool", "var", "val", and for some
 *               vars "yes_i_really_mean_it"
 * @param ss     human-readable status or error message for the caller
 * @return 0 on success (including idempotent no-ops), negative errno on
 *         any validation failure
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // Work on a copy; prefer a pending (not yet committed) version of the
  // pool if one is already staged in this proposal.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;   // parse-error text; empty means the parse succeeded
  int64_t n = 0;             // integer interpretation of val
  double f = 0;              // float interpretation of val
  int64_t uf = 0; // micro-f
  cmd_getval(cmdmap, "val", val);

  // Vars that accept SI suffixes (k=1000...) vs IEC suffixes (Ki=1024...);
  // everything else is parsed as both plain int and float.
  auto si_options = {
    "target_max_objects"
  };
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // These vars only make sense on a cache tier; reject them on base pools.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // keep the invariant min_size <= size
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For EC pools min_size may not drop below k (data chunks) or
      // reads would be impossible.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // Directly adjust the actual pg_num (normally driven by the mgr).
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
	          g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches.  if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // toggling hashpspool remaps every PG, so require explicit confirmation
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    // safe: type checked as TYPE_BLOOM just above
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // one-way switch: can only be enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // one-way switch: cannot be disabled once enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic pool options: per-option validation first, then a typed
    // set/unset via pool_opts_t below.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
	ss << "error parsing unit value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "must set require_osd_release to nautilus or "
	   << "later before setting target_size_bytes";
	return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      // NOTE(review): unlike the float options above, floaterr is not
      // checked here, so a non-numeric val falls through with f == 0
      // and passes this range check -- confirm whether intentional.
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // Apply the (validated) value according to the option's declared type.
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      // 0 (and "unset", which leaves n == 0) clears the option
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending incremental
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8498
8499 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8500 const cmdmap_t& cmdmap,
8501 stringstream& ss)
8502 {
8503 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8504 }
8505
8506 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8507 const cmdmap_t& cmdmap,
8508 stringstream& ss,
8509 bool *modified)
8510 {
8511 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8512 }
8513
8514
8515 /**
8516 * Common logic for preprocess and prepare phases of pool application
8517 * tag commands. In preprocess mode we're only detecting invalid
8518 * commands, and determining whether it was a modification or a no-op.
8519 * In prepare mode we're actually updating the pending state.
8520 */
/**
 * Shared implementation for "osd pool application {enable,disable,set,rm}".
 *
 * Dispatches on the command suffix of @p prefix.  In preparing mode the
 * modified pool is staged in pending_inc; otherwise only validation is
 * performed and *modified reports whether the command would change state.
 *
 * @param prefix    full command name; only its suffix is examined
 * @param cmdmap    parsed args: "pool", "app", "key", "value",
 *                  optionally "yes_i_really_mean_it"
 * @param ss        human-readable status/error message
 * @param modified  out (may be null): set true when the command is a
 *                  real modification (not reached on no-op early returns)
 * @param preparing true to update pending state, false to only validate
 * @return 0 on success or idempotent no-op, negative errno on error
 */
int OSDMonitor::_command_pool_application(const string &prefix,
					  const cmdmap_t& cmdmap,
					  stringstream& ss,
					  bool *modified,
					  bool preparing)
{
  string pool_name;
  cmd_getval(cmdmap, "pool", pool_name);
  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    return -ENOENT;
  }

  // Work on a copy; when preparing, prefer an already-staged pending
  // version of the pool so edits in one proposal compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (preparing) {
    if (pending_inc.new_pools.count(pool)) {
      p = pending_inc.new_pools[pool];
    }
  }

  string app;
  cmd_getval(cmdmap, "app", app);
  bool app_exists = (p.application_metadata.count(app) > 0);

  // "all" is reserved as a wildcard in cap matching, so it cannot be a
  // literal key or value.
  string key;
  cmd_getval(cmdmap, "key", key);
  if (key == "all") {
    ss << "key cannot be 'all'";
    return -EINVAL;
  }

  string value;
  cmd_getval(cmdmap, "value", value);
  if (value == "all") {
    ss << "value cannot be 'all'";
    return -EINVAL;
  }

  if (boost::algorithm::ends_with(prefix, "enable")) {
    if (app.empty()) {
      ss << "application name must be provided";
      return -EINVAL;
    }

    if (p.is_tier()) {
      ss << "application must be enabled on base tier";
      return -EINVAL;
    }

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!app_exists && !p.application_metadata.empty() && !force) {
      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
         << "application; pass --yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
      ss << "too many enabled applications on pool '" << pool_name << "'; "
         << "max " << MAX_POOL_APPLICATIONS;
      return -EINVAL;
    }

    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "application name '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (!app_exists) {
      p.application_metadata[app] = {};
    }
    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "disable")) {
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "Are you SURE? Disabling an application within a pool might result "
         << "in loss of application functionality; pass "
         << "--yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return 0; // idempotent
    }

    p.application_metadata.erase(app);
    ss << "disable application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "set")) {
    if (p.is_tier()) {
      ss << "application metadata must be set on base tier";
      return -EINVAL;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    // NOTE: shadows the outer 'key' read above; the "all" check has
    // already run on the same cmdmap value.
    string key;
    cmd_getval(cmdmap, "key", key);

    if (key.empty()) {
      ss << "key must be provided";
      return -EINVAL;
    }

    auto &app_keys = p.application_metadata[app];
    if (app_keys.count(key) == 0 &&
        app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
      ss << "too many keys set for application '" << app << "' on pool '"
         << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
      return -EINVAL;
    }

    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "key '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    // NOTE: shadows the outer 'value' read above.
    string value;
    cmd_getval(cmdmap, "value", value);
    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "value '" << value << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    p.application_metadata[app][key] = value;
    ss << "set application '" << app << "' key '" << key << "' to '"
       << value << "' on pool '" << pool_name << "'";
  } else if (boost::algorithm::ends_with(prefix, "rm")) {
    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    string key;
    cmd_getval(cmdmap, "key", key);
    auto it = p.application_metadata[app].find(key);
    if (it == p.application_metadata[app].end()) {
      ss << "application '" << app << "' on pool '" << pool_name
         << "' does not have key '" << key << "'";
      return 0; // idempotent
    }

    p.application_metadata[app].erase(it);
    ss << "removed application '" << app << "' key '" << key << "' on pool '"
       << pool_name << "'";
  } else {
    // command table should never route an unknown suffix here
    ceph_abort();
  }

  if (preparing) {
    p.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool] = p;
  }

  // Because we fell through this far, we didn't hit no-op cases,
  // so pool was definitely modified
  if (modified != nullptr) {
    *modified = true;
  }

  return 0;
}
8698
8699 int OSDMonitor::_prepare_command_osd_crush_remove(
8700 CrushWrapper &newcrush,
8701 int32_t id,
8702 int32_t ancestor,
8703 bool has_ancestor,
8704 bool unlink_only)
8705 {
8706 int err = 0;
8707
8708 if (has_ancestor) {
8709 err = newcrush.remove_item_under(cct, id, ancestor,
8710 unlink_only);
8711 } else {
8712 err = newcrush.remove_item(cct, id, unlink_only);
8713 }
8714 return err;
8715 }
8716
// Commit an already-modified scratch crush map: encode it into the
// pending incremental, replacing any previously staged crush change.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8722
8723 int OSDMonitor::prepare_command_osd_crush_remove(
8724 CrushWrapper &newcrush,
8725 int32_t id,
8726 int32_t ancestor,
8727 bool has_ancestor,
8728 bool unlink_only)
8729 {
8730 int err = _prepare_command_osd_crush_remove(
8731 newcrush, id, ancestor,
8732 has_ancestor, unlink_only);
8733
8734 if (err < 0)
8735 return err;
8736
8737 ceph_assert(err == 0);
8738 do_osd_crush_remove(newcrush);
8739
8740 return 0;
8741 }
8742
8743 int OSDMonitor::prepare_command_osd_remove(int32_t id)
8744 {
8745 if (osdmap.is_up(id)) {
8746 return -EBUSY;
8747 }
8748
8749 pending_inc.new_state[id] = osdmap.get_state(id);
8750 pending_inc.new_uuid[id] = uuid_d();
8751 pending_metadata_rm.insert(id);
8752 pending_metadata.erase(id);
8753
8754 return 0;
8755 }
8756
8757 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8758 {
8759 ceph_assert(existing_id);
8760 *existing_id = -1;
8761
8762 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8763 if (!osdmap.exists(i) &&
8764 pending_inc.new_up_client.count(i) == 0 &&
8765 (pending_inc.new_state.count(i) == 0 ||
8766 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8767 *existing_id = i;
8768 return -1;
8769 }
8770 }
8771
8772 if (pending_inc.new_max_osd < 0) {
8773 return osdmap.get_max_osd();
8774 }
8775 return pending_inc.new_max_osd;
8776 }
8777
/**
 * Stage creation of an osd in the pending incremental.
 *
 * Resolves the id to use (an explicit @p id, the id already bound to
 * @p uuid, or a freshly allocated one), optionally records the device
 * class in the crush map, and marks the id EXISTS|NEW.  Idempotent with
 * respect to replays: re-creating an osd whose uuid already maps to an
 * id simply reuses that id.
 *
 * @param id           requested osd id, or negative to auto-pick
 * @param uuid         osd uuid; zero uuid skips uuid-based lookup/binding
 * @param device_class if non-empty, crush device class to assign
 * @param new_id       out: the id that was chosen (never null)
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already bound to an id -- reuse it (replay/idempotency)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
	   << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycling a previously destroyed id: exactly one of
    // existing_id/allocated_id is valid (see _allocate_osd_id)
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the device class assignment in a scratch crush map
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // an explicitly requested id may exceed both committed and pending max
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8866
8867 int OSDMonitor::validate_osd_create(
8868 const int32_t id,
8869 const uuid_d& uuid,
8870 const bool check_osd_exists,
8871 int32_t* existing_id,
8872 stringstream& ss)
8873 {
8874
8875 dout(10) << __func__ << " id " << id << " uuid " << uuid
8876 << " check_osd_exists " << check_osd_exists << dendl;
8877
8878 ceph_assert(existing_id);
8879
8880 if (id < 0 && uuid.is_zero()) {
8881 // we have nothing to validate
8882 *existing_id = -1;
8883 return 0;
8884 } else if (uuid.is_zero()) {
8885 // we have an id but we will ignore it - because that's what
8886 // `osd create` does.
8887 return 0;
8888 }
8889
8890 /*
8891 * This function will be used to validate whether we are able to
8892 * create a new osd when the `uuid` is specified.
8893 *
8894 * It will be used by both `osd create` and `osd new`, as the checks
8895 * are basically the same when it pertains to osd id and uuid validation.
8896 * However, `osd create` presumes an `uuid` is optional, for legacy
8897 * reasons, while `osd new` requires the `uuid` to be provided. This
8898 * means that `osd create` will not be idempotent if an `uuid` is not
8899 * provided, but we will always guarantee the idempotency of `osd new`.
8900 */
8901
8902 ceph_assert(!uuid.is_zero());
8903 if (pending_inc.identify_osd(uuid) >= 0) {
8904 // osd is about to exist
8905 return -EAGAIN;
8906 }
8907
8908 int32_t i = osdmap.identify_osd(uuid);
8909 if (i >= 0) {
8910 // osd already exists
8911 if (id >= 0 && i != id) {
8912 ss << "uuid " << uuid << " already in use for different id " << i;
8913 return -EEXIST;
8914 }
8915 // return a positive errno to distinguish between a blocking error
8916 // and an error we consider to not be a problem (i.e., this would be
8917 // an idempotent operation).
8918 *existing_id = i;
8919 return EEXIST;
8920 }
8921 // i < 0
8922 if (id >= 0) {
8923 if (pending_inc.new_state.count(id)) {
8924 // osd is about to exist
8925 return -EAGAIN;
8926 }
8927 // we may not care if an osd exists if we are recreating a previously
8928 // destroyed osd.
8929 if (check_osd_exists && osdmap.exists(id)) {
8930 ss << "id " << id << " already in use and does not match uuid "
8931 << uuid;
8932 return -EINVAL;
8933 }
8934 }
8935 return 0;
8936 }
8937
8938 int OSDMonitor::prepare_command_osd_create(
8939 const int32_t id,
8940 const uuid_d& uuid,
8941 int32_t* existing_id,
8942 stringstream& ss)
8943 {
8944 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8945 ceph_assert(existing_id);
8946 if (osdmap.is_destroyed(id)) {
8947 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8948 "instead.";
8949 return -EINVAL;
8950 }
8951
8952 if (uuid.is_zero()) {
8953 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8954 }
8955
8956 return validate_osd_create(id, uuid, true, existing_id, ss);
8957 }
8958
/**
 * Handle the `osd new` monitor command.
 *
 * Creates a brand new osd, or recreates an osd previously marked as
 * destroyed, optionally registering its secrets (cephx, cephx lockbox,
 * dm-crypt key) with the auth monitor and the config-key service.  The
 * caller must have plugged paxos so that the osdmap, auth, and
 * config-key updates are proposed together.
 *
 * @param op      originating monitor op request
 * @param cmdmap  parsed command arguments (`uuid` required, `id` optional)
 * @param params  options parsed from the command's input buffer:
 *                `cephx_secret`, `cephx_lockbox_secret`, `dmcrypt_key`,
 *                `crush_device_class`
 * @param ss      human-readable output/error stream
 * @param f       optional formatter for structured output (may be null)
 * @returns 0 on success with pending updates queued, positive EEXIST for
 *          an idempotent replay (nothing to do), negative errno on error.
 */
int OSDMonitor::prepare_command_osd_new(
  MonOpRequestRef op,
  const cmdmap_t& cmdmap,
  const map<string,string>& params,
  stringstream &ss,
  Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  // a positive EEXIST from validate_osd_create() means the uuid already
  // maps to an osd (matching `id` if one was given) -- likely a replay.
  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  // presence of any secret-related param means we must run the auth path.
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the pair was supplied; they must come together.
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    // validate only -- the destructive do_osd_new() calls come later,
    // after every check that can fail has passed.
    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): new_state bits appear to toggle the osd's current
    // state flags when the incremental is applied (the comment below
    // relies on setting CEPH_OSD_UP to *clear* a stale UP flag) --
    // confirm against OSDMap::Incremental before changing these.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9223
9224 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9225 {
9226 op->mark_osdmon_event(__func__);
9227 auto m = op->get_req<MMonCommand>();
9228 stringstream ss;
9229 cmdmap_t cmdmap;
9230 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9231 string rs = ss.str();
9232 mon->reply_command(op, -EINVAL, rs, get_last_committed());
9233 return true;
9234 }
9235
9236 MonSession *session = op->get_session();
9237 if (!session) {
9238 derr << __func__ << " no session" << dendl;
9239 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
9240 return true;
9241 }
9242
9243 return prepare_command_impl(op, cmdmap);
9244 }
9245
9246 static int parse_reweights(CephContext *cct,
9247 const cmdmap_t& cmdmap,
9248 const OSDMap& osdmap,
9249 map<int32_t, uint32_t>* weights)
9250 {
9251 string weights_str;
9252 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9253 return -EINVAL;
9254 }
9255 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9256 json_spirit::mValue json_value;
9257 if (!json_spirit::read(weights_str, json_value)) {
9258 return -EINVAL;
9259 }
9260 if (json_value.type() != json_spirit::obj_type) {
9261 return -EINVAL;
9262 }
9263 const auto obj = json_value.get_obj();
9264 try {
9265 for (auto& osd_weight : obj) {
9266 auto osd_id = std::stoi(osd_weight.first);
9267 if (!osdmap.exists(osd_id)) {
9268 return -ENOENT;
9269 }
9270 if (osd_weight.second.type() != json_spirit::str_type) {
9271 return -EINVAL;
9272 }
9273 auto weight = std::stoul(osd_weight.second.get_str());
9274 weights->insert({osd_id, weight});
9275 }
9276 } catch (const std::logic_error& e) {
9277 return -EINVAL;
9278 }
9279 return 0;
9280 }
9281
/**
 * Mark an existing osd as destroyed, removing its auth entities and its
 * config-key (lockbox) data while keeping the osd id in the map.
 *
 * The caller must have plugged paxos and is responsible for proposing
 * the pending change: `osd purge` reuses this helper, and a
 * PaxosService may only propose once per round (see the note at the
 * bottom).
 *
 * @param id  osd id to destroy
 * @param ss  human-readable output/error stream
 * @returns 0 on success (including the already-destroyed replay case),
 *          -ENOENT if the osd does not exist, or a negative errno from
 *          auth validation.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // validate first; -ENOENT from either service means that leg is
  // already gone (an idempotent replay), not a failure.
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  // all validation passed; the destructive updates below are asserted
  // to succeed, since partial application would leave the auth and
  // config-key services inconsistent with the osdmap.
  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // flag the osd destroyed and forget its uuid (reset to the zero uuid)
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9353
/**
 * Purge an osd: remove it from crush, destroy its auth/config-key
 * state, and remove the id from the osdmap entirely.
 *
 * The caller must have plugged paxos and is responsible for proposing
 * the accumulated pending changes.
 *
 * @param id  osd id to purge; must not be up (asserted below).
 * @param ss  human-readable output/error stream
 * @returns 0 on success, -ENOENT if the osd was already fully purged
 *          (idempotent replay), or a negative errno on failure.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: validate (but do not yet apply) the crush removal.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush: possibly a replay of an earlier purge.
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: destroy the osd (auth entities, config keys, map flags).
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // the destroy actually did work, so this run is not a replay.
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: drop the id from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: apply the crush removal validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9422
9423 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9424 const cmdmap_t& cmdmap)
9425 {
9426 op->mark_osdmon_event(__func__);
9427 auto m = op->get_req<MMonCommand>();
9428 bool ret = false;
9429 stringstream ss;
9430 string rs;
9431 bufferlist rdata;
9432 int err = 0;
9433
9434 string format;
9435 cmd_getval(cmdmap, "format", format, string("plain"));
9436 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9437
9438 string prefix;
9439 cmd_getval(cmdmap, "prefix", prefix);
9440
9441 int64_t osdid;
9442 string osd_name;
9443 bool osdid_present = false;
9444 if (prefix != "osd pg-temp" &&
9445 prefix != "osd pg-upmap" &&
9446 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9447 osdid_present = cmd_getval(cmdmap, "id", osdid);
9448 }
9449 if (osdid_present) {
9450 ostringstream oss;
9451 oss << "osd." << osdid;
9452 osd_name = oss.str();
9453 }
9454
9455 // Even if there's a pending state with changes that could affect
9456 // a command, considering that said state isn't yet committed, we
9457 // just don't care about those changes if the command currently being
9458 // handled acts as a no-op against the current committed state.
9459 // In a nutshell, we assume this command happens *before*.
9460 //
9461 // Let me make this clearer:
9462 //
9463 // - If we have only one client, and that client issues some
9464 // operation that would conflict with this operation but is
9465 // still on the pending state, then we would be sure that said
9466 // operation wouldn't have returned yet, so the client wouldn't
9467 // issue this operation (unless the client didn't wait for the
9468 // operation to finish, and that would be the client's own fault).
9469 //
9470 // - If we have more than one client, each client will observe
9471 // whatever is the state at the moment of the commit. So, if we
9472 // have two clients, one issuing an unlink and another issuing a
9473 // link, and if the link happens while the unlink is still on the
9474 // pending state, from the link's point-of-view this is a no-op.
9475 // If different clients are issuing conflicting operations and
9476 // they care about that, then the clients should make sure they
9477 // enforce some kind of concurrency mechanism -- from our
9478 // perspective that's what Douglas Adams would call an SEP.
9479 //
9480 // This should be used as a general guideline for most commands handled
9481 // in this function. Adapt as you see fit, but please bear in mind that
9482 // this is the expected behavior.
9483
9484
9485 if (prefix == "osd setcrushmap" ||
9486 (prefix == "osd crush set" && !osdid_present)) {
9487 if (pending_inc.crush.length()) {
9488 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9489 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9490 return true;
9491 }
9492 dout(10) << "prepare_command setting new crush map" << dendl;
9493 bufferlist data(m->get_data());
9494 CrushWrapper crush;
9495 try {
9496 auto bl = data.cbegin();
9497 crush.decode(bl);
9498 }
9499 catch (const std::exception &e) {
9500 err = -EINVAL;
9501 ss << "Failed to parse crushmap: " << e.what();
9502 goto reply;
9503 }
9504
9505 int64_t prior_version = 0;
9506 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9507 if (prior_version == osdmap.get_crush_version() - 1) {
9508 // see if we are a resend of the last update. this is imperfect
9509 // (multiple racing updaters may not both get reliable success)
9510 // but we expect crush updaters (via this interface) to be rare-ish.
9511 bufferlist current, proposed;
9512 osdmap.crush->encode(current, mon->get_quorum_con_features());
9513 crush.encode(proposed, mon->get_quorum_con_features());
9514 if (current.contents_equal(proposed)) {
9515 dout(10) << __func__
9516 << " proposed matches current and version equals previous"
9517 << dendl;
9518 err = 0;
9519 ss << osdmap.get_crush_version();
9520 goto reply;
9521 }
9522 }
9523 if (prior_version != osdmap.get_crush_version()) {
9524 err = -EPERM;
9525 ss << "prior_version " << prior_version << " != crush version "
9526 << osdmap.get_crush_version();
9527 goto reply;
9528 }
9529 }
9530
9531 if (crush.has_legacy_rule_ids()) {
9532 err = -EINVAL;
9533 ss << "crush maps with ruleset != ruleid are no longer allowed";
9534 goto reply;
9535 }
9536 if (!validate_crush_against_features(&crush, ss)) {
9537 err = -EINVAL;
9538 goto reply;
9539 }
9540
9541 err = osdmap.validate_crush_rules(&crush, &ss);
9542 if (err < 0) {
9543 goto reply;
9544 }
9545
9546 if (g_conf()->mon_osd_crush_smoke_test) {
9547 // sanity check: test some inputs to make sure this map isn't
9548 // totally broken
9549 dout(10) << " testing map" << dendl;
9550 stringstream ess;
9551 CrushTester tester(crush, ess);
9552 tester.set_min_x(0);
9553 tester.set_max_x(50);
9554 auto start = ceph::coarse_mono_clock::now();
9555 int r = tester.test_with_fork(g_conf()->mon_lease);
9556 auto duration = ceph::coarse_mono_clock::now() - start;
9557 if (r < 0) {
9558 dout(10) << " tester.test_with_fork returns " << r
9559 << ": " << ess.str() << dendl;
9560 ss << "crush smoke test failed with " << r << ": " << ess.str();
9561 err = r;
9562 goto reply;
9563 }
9564 dout(10) << __func__ << " crush somke test duration: "
9565 << duration << ", result: " << ess.str() << dendl;
9566 }
9567
9568 pending_inc.crush = data;
9569 ss << osdmap.get_crush_version() + 1;
9570 goto update;
9571
9572 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9573 CrushWrapper newcrush;
9574 _get_pending_crush(newcrush);
9575 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9576 int bid = -1 - b;
9577 if (newcrush.bucket_exists(bid) &&
9578 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9579 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9580 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9581 }
9582 }
9583 if (!validate_crush_against_features(&newcrush, ss)) {
9584 err = -EINVAL;
9585 goto reply;
9586 }
9587 pending_inc.crush.clear();
9588 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9590 get_last_committed() + 1));
9591 return true;
9592 } else if (prefix == "osd crush set-device-class") {
9593 string device_class;
9594 if (!cmd_getval(cmdmap, "class", device_class)) {
9595 err = -EINVAL; // no value!
9596 goto reply;
9597 }
9598
9599 bool stop = false;
9600 vector<string> idvec;
9601 cmd_getval(cmdmap, "ids", idvec);
9602 CrushWrapper newcrush;
9603 _get_pending_crush(newcrush);
9604 set<int> updated;
9605 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9606 set<int> osds;
9607 // wildcard?
9608 if (j == 0 &&
9609 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9610 osdmap.get_all_osds(osds);
9611 stop = true;
9612 } else {
9613 // try traditional single osd way
9614 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9615 if (osd < 0) {
9616 // ss has reason for failure
9617 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9618 err = -EINVAL;
9619 continue;
9620 }
9621 osds.insert(osd);
9622 }
9623
9624 for (auto &osd : osds) {
9625 if (!osdmap.exists(osd)) {
9626 ss << "osd." << osd << " does not exist. ";
9627 continue;
9628 }
9629
9630 ostringstream oss;
9631 oss << "osd." << osd;
9632 string name = oss.str();
9633
9634 if (newcrush.get_max_devices() < osd + 1) {
9635 newcrush.set_max_devices(osd + 1);
9636 }
9637 string action;
9638 if (newcrush.item_exists(osd)) {
9639 action = "updating";
9640 } else {
9641 action = "creating";
9642 newcrush.set_item_name(osd, name);
9643 }
9644
9645 dout(5) << action << " crush item id " << osd << " name '" << name
9646 << "' device_class '" << device_class << "'"
9647 << dendl;
9648 err = newcrush.update_device_class(osd, device_class, name, &ss);
9649 if (err < 0) {
9650 goto reply;
9651 }
9652 if (err == 0 && !_have_pending_crush()) {
9653 if (!stop) {
9654 // for single osd only, wildcard makes too much noise
9655 ss << "set-device-class item id " << osd << " name '" << name
9656 << "' device_class '" << device_class << "': no change. ";
9657 }
9658 } else {
9659 updated.insert(osd);
9660 }
9661 }
9662 }
9663
9664 if (!updated.empty()) {
9665 pending_inc.crush.clear();
9666 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9667 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9668 getline(ss, rs);
9669 wait_for_finished_proposal(op,
9670 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9671 return true;
9672 }
9673
9674 } else if (prefix == "osd crush rm-device-class") {
9675 bool stop = false;
9676 vector<string> idvec;
9677 cmd_getval(cmdmap, "ids", idvec);
9678 CrushWrapper newcrush;
9679 _get_pending_crush(newcrush);
9680 set<int> updated;
9681
9682 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9683 set<int> osds;
9684
9685 // wildcard?
9686 if (j == 0 &&
9687 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9688 osdmap.get_all_osds(osds);
9689 stop = true;
9690 } else {
9691 // try traditional single osd way
9692 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9693 if (osd < 0) {
9694 // ss has reason for failure
9695 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9696 err = -EINVAL;
9697 goto reply;
9698 }
9699 osds.insert(osd);
9700 }
9701
9702 for (auto &osd : osds) {
9703 if (!osdmap.exists(osd)) {
9704 ss << "osd." << osd << " does not exist. ";
9705 continue;
9706 }
9707
9708 auto class_name = newcrush.get_item_class(osd);
9709 if (!class_name) {
9710 ss << "osd." << osd << " belongs to no class, ";
9711 continue;
9712 }
9713 // note that we do not verify if class_is_in_use here
9714 // in case the device is misclassified and user wants
9715 // to overridely reset...
9716
9717 err = newcrush.remove_device_class(cct, osd, &ss);
9718 if (err < 0) {
9719 // ss has reason for failure
9720 goto reply;
9721 }
9722 updated.insert(osd);
9723 }
9724 }
9725
9726 if (!updated.empty()) {
9727 pending_inc.crush.clear();
9728 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9729 ss << "done removing class of osd(s): " << updated;
9730 getline(ss, rs);
9731 wait_for_finished_proposal(op,
9732 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9733 return true;
9734 }
9735 } else if (prefix == "osd crush class create") {
9736 string device_class;
9737 if (!cmd_getval(cmdmap, "class", device_class)) {
9738 err = -EINVAL; // no value!
9739 goto reply;
9740 }
9741 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9742 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9743 << "luminous' before using crush device classes";
9744 err = -EPERM;
9745 goto reply;
9746 }
9747 if (!_have_pending_crush() &&
9748 _get_stable_crush().class_exists(device_class)) {
9749 ss << "class '" << device_class << "' already exists";
9750 goto reply;
9751 }
9752 CrushWrapper newcrush;
9753 _get_pending_crush(newcrush);
9754 if (newcrush.class_exists(device_class)) {
9755 ss << "class '" << device_class << "' already exists";
9756 goto update;
9757 }
9758 int class_id = newcrush.get_or_create_class_id(device_class);
9759 pending_inc.crush.clear();
9760 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9761 ss << "created class " << device_class << " with id " << class_id
9762 << " to crush map";
9763 goto update;
9764 } else if (prefix == "osd crush class rm") {
9765 string device_class;
9766 if (!cmd_getval(cmdmap, "class", device_class)) {
9767 err = -EINVAL; // no value!
9768 goto reply;
9769 }
9770 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9771 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9772 << "luminous' before using crush device classes";
      err = -EPERM;
      goto reply;
    }

    // Already absent from the committed map: nothing to do, report success.
    if (!osdmap.crush->class_exists(device_class)) {
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(device_class)) {
      err = 0; // make command idempotent
      // class is already gone in the pending map; wait for that
      // proposal to commit before replying
      goto wait;
    }
    int class_id = newcrush.get_class_id(device_class);
    stringstream ts;
    // refuse to remove a class that crush rules/buckets still reference;
    // ts receives the human-readable reason
    if (newcrush.class_is_in_use(class_id, &ts)) {
      err = -EBUSY;
      ss << "class '" << device_class << "' " << ts.str();
      goto reply;
    }

    // check if class is used by any erasure-code-profiles
    // (merge committed profiles with any pending ones so both are checked)
    mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
      osdmap.get_erasure_code_profiles();
    auto ec_profiles = pending_inc.get_erasure_code_profiles();
#ifdef HAVE_STDLIB_MAP_SPLICING
    ec_profiles.merge(old_ec_profiles);
#else
    ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
                       make_move_iterator(end(old_ec_profiles)));
#endif
    list<string> referenced_by;
    for (auto &i: ec_profiles) {
      for (auto &j: i.second) {
        if ("crush-device-class" == j.first && device_class == j.second) {
          referenced_by.push_back(i.first);
        }
      }
    }
    if (!referenced_by.empty()) {
      err = -EBUSY;
      ss << "class '" << device_class
         << "' is still referenced by erasure-code-profile(s): " << referenced_by;
      goto reply;
    }

    // detach the class from every OSD that carries it
    set<int> osds;
    newcrush.get_devices_by_class(device_class, &osds);
    for (auto& p: osds) {
      err = newcrush.remove_device_class(g_ceph_context, p, &ss);
      if (err < 0) {
        // ss has reason for failure
        goto reply;
      }
    }

    if (osds.empty()) {
      // empty class, remove directly
      err = newcrush.remove_class_name(device_class);
      if (err < 0) {
        ss << "class '" << device_class << "' cannot be removed '"
           << cpp_strerror(err) << "'";
        goto reply;
      }
    }

    // stage the modified crush map in the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "removed class " << device_class << " with id " << class_id
       << " from crush map";
    goto update;
  } else if (prefix == "osd crush class rename") {
    // osd crush class rename <srcname> <dstname>
    string srcname, dstname;
    if (!cmd_getval(cmdmap, "srcname", srcname)) {
      err = -EINVAL;
      goto reply;
    }
    if (!cmd_getval(cmdmap, "dstname", dstname)) {
      err = -EINVAL;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
      // suppose this is a replay and return success
      // so command is idempotent
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_class(srcname, dstname);
    if (err < 0) {
      ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
         << cpp_strerror(err);
      goto reply;
    }

    // stage the modified crush map for the next proposal
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "rename class '" << srcname << "' to '" << dstname << "'";
    goto update;
  } else if (prefix == "osd crush add-bucket") {
    // osd crush add-bucket <name> <type> [<loc1> ...]
    string name, typestr;
    vector<string> argvec;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "type", typestr);
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    if (!argvec.empty()) {
      CrushWrapper::parse_loc_map(argvec, &loc);
      dout(0) << "will create and move bucket '" << name
              << "' to location " << loc << dendl;
    }

    // fast path: nothing pending and the bucket already exists in the
    // committed map -> idempotent success
    if (!_have_pending_crush() &&
        _get_stable_crush().name_exists(name)) {
      ss << "bucket '" << name << "' already exists";
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.name_exists(name)) {
      // exists only in the pending map; wait for it to commit
      ss << "bucket '" << name << "' already exists";
      goto update;
    }
    int type = newcrush.get_type_id(typestr);
    if (type < 0) {
      ss << "type '" << typestr << "' does not exist";
      err = -EINVAL;
      goto reply;
    }
    if (type == 0) {
      // type id 0 is reserved for devices (OSDs)
      ss << "type '" << typestr << "' is for devices, not buckets";
      err = -EINVAL;
      goto reply;
    }
    int bucketno;
    err = newcrush.add_bucket(0, 0,
                              CRUSH_HASH_DEFAULT, type, 0, NULL,
                              NULL, &bucketno);
    if (err < 0) {
      ss << "add_bucket error: '" << cpp_strerror(err) << "'";
      goto reply;
    }
    err = newcrush.set_item_name(bucketno, name);
    if (err < 0) {
      ss << "error setting bucket name to '" << name << "'";
      goto reply;
    }

    // optionally place the new bucket at the requested crush location
    if (!loc.empty()) {
      if (!newcrush.check_item_loc(cct, bucketno, loc,
          (int *)NULL)) {
        err = newcrush.move_bucket(cct, bucketno, loc);
        if (err < 0) {
          ss << "error moving bucket '" << name << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << bucketno << " name '" << name
           << "' to location " << loc << " in crush map";
      }
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    if (loc.empty()) {
      ss << "added bucket " << name << " type " << typestr
         << " to crush map";
    } else {
      ss << "added bucket " << name << " type " << typestr
         << " to location " << loc;
    }
    goto update;
  } else if (prefix == "osd crush rename-bucket") {
    // osd crush rename-bucket <srcname> <dstname>
    string srcname, dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);

    err = crush_rename_bucket(srcname, dstname, &ss);
    if (err == -EALREADY) // equivalent to success for idempotency
      err = 0;
    if (err)
      goto reply;
    else
      goto update;
  } else if (prefix == "osd crush weight-set create" ||
             prefix == "osd crush weight-set create-compat") {
    // create a per-pool weight-set, or the single backward-compatible one
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    int positions;
    // choose_args (weight-sets) only work with straw2 buckets
    if (newcrush.has_non_straw2_buckets()) {
      ss << "crush map contains one or more bucket(s) that are not straw2";
      err = -EPERM;
      goto reply;
    }
    if (prefix == "osd crush weight-set create") {
      // per-pool weight-sets require all clients to be luminous+
      if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
          osdmap.require_min_compat_client < ceph_release_t::luminous) {
        ss << "require_min_compat_client "
           << osdmap.require_min_compat_client
           << " < luminous, which is required for per-pool weight-sets. "
           << "Try 'ceph osd set-require-min-compat-client luminous' "
           << "before using the new interface";
        err = -EPERM;
        goto reply;
      }
      string poolname, mode;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      cmd_getval(cmdmap, "mode", mode);
      if (mode != "flat" && mode != "positional") {
        ss << "unrecognized weight-set mode '" << mode << "'";
        err = -EINVAL;
        goto reply;
      }
      // positional mode keeps one weight per replica position (pool size)
      positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
    } else {
      // compat weight-set: one global set with a single position
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      positions = 1;
    }
    if (!newcrush.create_choose_args(pool, positions)) {
      // already present -> idempotent no-op
      if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
        ss << "compat weight-set already created";
      } else {
        ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
           << "' already created";
      }
      goto reply;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set rm" ||
             prefix == "osd crush weight-set rm-compat") {
    // remove a per-pool weight-set or the backward-compatible one
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set rm") {
      string poolname;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
    }
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set reweight" ||
             prefix == "osd crush weight-set reweight-compat") {
    // set the weight-set weight(s) of one crush item
    string poolname, item;
    vector<double> weight;
    cmd_getval(cmdmap, "pool", poolname);
    cmd_getval(cmdmap, "item", item);
    cmd_getval(cmdmap, "weight", weight);
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set reweight") {
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      if (!newcrush.have_choose_args(pool)) {
        ss << "no weight-set for pool '" << poolname << "'";
        err = -ENOENT;
        goto reply;
      }
      // must supply exactly one weight per position of the weight-set
      auto arg_map = newcrush.choose_args_get(pool);
      int positions = newcrush.get_choose_args_positions(arg_map);
      if (weight.size() != (size_t)positions) {
         ss << "must specify exact " << positions << " weight values";
         err = -EINVAL;
         goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      if (!newcrush.have_choose_args(pool)) {
        ss << "no backward-compatible weight-set";
        err = -ENOENT;
        goto reply;
      }
    }
    if (!newcrush.name_exists(item)) {
      ss << "item '" << item << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    err = newcrush.choose_args_adjust_item_weightf(
      cct,
      newcrush.choose_args_get(pool),
      newcrush.get_item_id(item),
      weight,
      &ss);
    if (err < 0) {
      goto reply;
    }
    err = 0;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;
  } else if (osdid_present &&
             (prefix == "osd crush set" || prefix == "osd crush add")) {
    // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
    // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
    // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]

    if (!osdmap.exists(osdid)) {
      err = -ENOENT;
      ss << osd_name
         << " does not exist. Create it before updating the crush map";
      goto reply;
    }

    double weight;
    if (!cmd_getval(cmdmap, "weight", weight)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    string args;
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // 'set' only updates an existing item; it must already be in the
    // committed crush map
    if (prefix == "osd crush set"
        && !_get_stable_crush().item_exists(osdid)) {
      err = -ENOENT;
      ss << "unable to set item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc
         << ": does not exist";
      goto reply;
    }

    dout(5) << "adding/updating crush item id " << osdid << " name '"
      << osd_name << "' weight " << weight << " at location "
      << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string action;
    // 'add' on an item already at this location degrades to 'set'
    if (prefix == "osd crush set" ||
        newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
      action = "set";
      err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
    } else {
      action = "add";
      err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
      if (err == 0)
        err = 1;  // force "changed" so the no-change shortcut below is skipped
    }

    if (err < 0)
      goto reply;

    // no change and nothing already pending: reply without proposing
    if (err == 0 && !_have_pending_crush()) {
      ss << action << " item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc << ": no change";
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
       << weight << " at location " << loc << " to crush map";
    getline(ss, rs);
    // reply once the proposal commits
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush create-or-move") {
    do {
      // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
      if (!osdmap.exists(osdid)) {
        err = -ENOENT;
        ss << osd_name
           << " does not exist. create it before updating the crush map";
        goto reply;
      }

      double weight;
      if (!cmd_getval(cmdmap, "weight", weight)) {
        ss << "unable to parse weight value '"
           << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
        err = -EINVAL;
        goto reply;
      }

      string args;
      vector<string> argvec;
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "create-or-move crush item name '" << osd_name
              << "' initial_weight " << weight << " at location " << loc
              << dendl;

      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
                                         g_conf()->osd_crush_update_weight_set);
      if (err == 0) {
        // no change needed; fall out of the do/while and reply
        ss << "create-or-move updated item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        break;
      }
      if (err > 0) {
        // map changed: stage it and reply after the proposal commits
        pending_inc.crush.clear();
        newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        ss << "create-or-move updating item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
    } while (false);
  } else if (prefix == "osd crush move") {
    do {
      // osd crush move <name> <loc1> [<loc2> ...]
      string name;
      vector<string> argvec;
      cmd_getval(cmdmap, "name", name);
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      if (!newcrush.name_exists(name)) {
        err = -ENOENT;
        ss << "item " << name << " does not exist";
        break;
      }
      int id = newcrush.get_item_id(name);

      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        // id >= 0 is a device (OSD), negative ids are buckets
        if (id >= 0) {
          err = newcrush.create_or_move_item(
            cct, id, 0, name, loc,
            g_conf()->osd_crush_update_weight_set);
        } else {
          err = newcrush.move_bucket(cct, id, loc);
        }
        if (err >= 0) {
          ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
          getline(ss, rs);
          wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                   get_last_committed() + 1));
          return true;
        }
      } else {
        // already at the requested location -> idempotent success
        ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
        err = 0;
      }
    } while (false);
  } else if (prefix == "osd crush swap-bucket") {
    // osd crush swap-bucket <source> <dest> [--yes-i-really-mean-it]
    string source, dest;
    cmd_getval(cmdmap, "source", source);
    cmd_getval(cmdmap, "dest", dest);

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.name_exists(source)) {
      ss << "source item " << source << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (!newcrush.name_exists(dest)) {
      ss << "dest item " << dest << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    int sid = newcrush.get_item_id(source);
    int did = newcrush.get_item_id(dest);
    int sparent;
    // by default only orphan (unparented) sources may be swapped
    if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
      ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    // mismatched bucket algorithms also require the force flag
    if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
        !force) {
      ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
         << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
         << "; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    int r = newcrush.swap_bucket(cct, sid, did);
    if (r < 0) {
      ss << "failed to swap bucket contents: " << cpp_strerror(r);
      err = r;
      goto reply;
    }
    ss << "swapped bucket of " << source << " to " << dest;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    wait_for_finished_proposal(op,
                               new Monitor::C_Command(mon, op, err, ss.str(),
                                                      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush link") {
    // osd crush link <name> <loc1> [<loc2> ...]
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // Need an explicit check for name_exists because get_item_id returns
    // 0 on unfound.
    int id = osdmap.crush->get_item_id(name);
    if (!osdmap.crush->name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
    }
    // already linked at this location in the committed map -> no-op
    if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
      ss << "no need to move item id " << id << " name '" << name
         << "' to location " << loc << " in crush map";
      err = 0;
      goto reply;
    }

    dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // re-check against the pending map, which may differ from the committed one
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      int id = newcrush.get_item_id(name);
      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        err = newcrush.link_bucket(cct, id, loc);
        if (err >= 0) {
          ss << "linked item id " << id << " name '" << name
             << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        } else {
          ss << "cannot link item id " << id << " name '" << name
             << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << id << " name '" << name
           << "' to location " << loc << " in crush map";
        err = 0;
      }
    }
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush rm" ||
             prefix == "osd crush remove" ||
             prefix == "osd crush unlink") {
    do {
      // osd crush rm <id> [ancestor]
      // 'unlink' detaches the item from one location without deleting it
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      string name;
      cmd_getval(cmdmap, "name", name);

      // gone from the committed map -> idempotent success, plain reply
      if (!osdmap.crush->name_exists(name)) {
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        break;
      }
      // gone only from the pending map -> reply after that commit
      if (!newcrush.name_exists(name)) {
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
      int id = newcrush.get_item_id(name);
      int ancestor = 0;

      bool unlink_only = prefix == "osd crush unlink";
      string ancestor_str;
      if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
        if (!newcrush.name_exists(ancestor_str)) {
          err = -ENOENT;
          ss << "ancestor item '" << ancestor_str
             << "' does not appear in the crush map";
          break;
        }
        ancestor = newcrush.get_item_id(ancestor_str);
      }

      err = prepare_command_osd_crush_remove(
          newcrush,
          id, ancestor,
          (ancestor < 0), unlink_only);

      if (err == -ENOENT) {
        ss << "item " << id << " does not appear in that position";
        err = 0;
        break;
      }
      if (err == 0) {
        if (!unlink_only)
          pending_inc.new_crush_node_flags[id] = 0;  // clear any per-node flags on full removal
        ss << "removed item id " << id << " name '" << name << "' from crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
    } while (false);
  } else if (prefix == "osd crush reweight-all") {
    // recompute all bucket weights from their children
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    newcrush.reweight(cct);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted crush hierarchy";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight") {
    // osd crush reweight <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    // only leaves (devices, id >= 0) may be reweighted here
    if (id < 0) {
      ss << "device '" << name << "' is not a leaf in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_item_weightf(cct, id, w,
                                       g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted item id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight-subtree") {
    // osd crush reweight-subtree <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    // only buckets (negative ids) are subtrees; inverse of the check above
    if (id >= 0) {
      ss << "device '" << name << "' is not a subtree in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_subtree_weightf(cct, id, w,
                                          g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush tunables") {
    // osd crush tunables <profile> -- apply a named tunables profile
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "legacy" || profile == "argonaut") {
      newcrush.set_tunables_legacy();
    } else if (profile == "bobtail") {
      newcrush.set_tunables_bobtail();
    } else if (profile == "firefly") {
      newcrush.set_tunables_firefly();
    } else if (profile == "hammer") {
      newcrush.set_tunables_hammer();
    } else if (profile == "jewel") {
      newcrush.set_tunables_jewel();
    } else if (profile == "optimal") {
      newcrush.set_tunables_optimal();
    } else if (profile == "default") {
      newcrush.set_tunables_default();
    } else {
      ss << "unrecognized profile '" << profile << "'";
      err = -EINVAL;
      goto reply;
    }

    // refuse tunables that connected clients/daemons could not decode
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunables profile to " << profile;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush set-tunable") {
    // osd crush set-tunable <tunable> <value> -- set one tunable directly
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string tunable;
    cmd_getval(cmdmap, "tunable", tunable);

    int64_t value = -1;
    if (!cmd_getval(cmdmap, "value", value)) {
      err = -EINVAL;
      ss << "failed to parse integer value "
         << cmd_vartype_stringify(cmdmap.at("value"));
      goto reply;
    }

    // straw_calc_version is currently the only tunable settable this way
    if (tunable == "straw_calc_version") {
      if (value != 0 && value != 1) {
        ss << "value must be 0 or 1; got " << value;
        err = -EINVAL;
        goto reply;
      }
      newcrush.set_straw_calc_version(value);
    } else {
      ss << "unrecognized tunable '" << tunable << "'";
      err = -EINVAL;
      goto reply;
    }

    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunable " << tunable << " to " << value;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-simple") {
    // osd crush rule create-simple <name> <root> <type> [<mode>]
    string name, root, type, mode;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "mode", mode);
    if (mode == "")
      mode = "firstn";

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
                                            pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-replicated") {
    // osd crush rule create-replicated <name> <root> <type> [<class>]
    // like create-simple, but restricted to a device class and firstn mode
    string name, root, type, device_class;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "class", device_class);

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(
        name, root, type, device_class,
        "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd erasure-code-profile rm") {
    // osd erasure-code-profile rm <name>
    string name;
    cmd_getval(cmdmap, "name", name);

    // a pending pool still references it; retry after that commits
    if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
      goto wait;

    // a committed pool references it; removal is refused
    if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
      err = -EBUSY;
      goto reply;
    }

    if (osdmap.has_erasure_code_profile(name) ||
        pending_inc.new_erasure_code_profiles.count(name)) {
      if (osdmap.has_erasure_code_profile(name)) {
        pending_inc.old_erasure_code_profiles.push_back(name);
      } else {
        // profile only exists in the pending incremental: cancel its creation
        dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
        pending_inc.new_erasure_code_profiles.erase(name);
      }

      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      // not present anywhere -> idempotent success
      ss << "erasure-code-profile " << name << " does not exist";
      err = 0;
      goto reply;
    }

  } else if (prefix == "osd erasure-code-profile set") {
    // osd erasure-code-profile set <name> [<key=value> ...] [--force]
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> profile;
    cmd_getval(cmdmap, "profile", profile);

    bool force = false;
    cmd_getval(cmdmap, "force", force);

    map<string,string> profile_map;
    err = parse_erasure_code_profile(profile, &profile_map, &ss);
    if (err)
      goto reply;
    if (profile_map.find("plugin") == profile_map.end()) {
      ss << "erasure-code-profile " << profile_map
         << " must contain a plugin entry" << std::endl;
      err = -EINVAL;
      goto reply;
    }
    string plugin = profile_map["plugin"];

    if (pending_inc.has_erasure_code_profile(name)) {
      // an update for this profile is already in flight; retry after commit
      dout(20) << "erasure code profile " << name << " try again" << dendl;
      goto wait;
    } else {
      err = normalize_profile(name, profile_map, force, &ss);
      if (err)
        goto reply;

      if (osdmap.has_erasure_code_profile(name)) {
        // normalize the existing profile too so the comparison is fair
        ErasureCodeProfile existing_profile_map =
          osdmap.get_erasure_code_profile(name);
        err = normalize_profile(name, existing_profile_map, force, &ss);
        if (err)
          goto reply;

        if (existing_profile_map == profile_map) {
          // identical -> idempotent success
          err = 0;
          goto reply;
        }
        // overwriting a different existing profile requires --force
        if (!force) {
          err = -EPERM;
          ss << "will not override erasure code profile " << name
             << " because the existing profile "
             << existing_profile_map
             << " is different from the proposed profile "
             << profile_map;
          goto reply;
        }
      }

      dout(20) << "erasure code profile set " << name << "="
               << profile_map << dendl;
      pending_inc.set_erasure_code_profile(name, profile_map);
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-erasure") {
    // osd crush rule create-erasure <name> [<profile>]
    err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err)
      goto reply;
    string name, poolstr;
    cmd_getval(cmdmap, "name", name);
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "")
      profile = "default";
    if (profile == "default") {
      // lazily create the default erasure-code profile if missing
      if (!osdmap.has_erasure_code_profile(profile)) {
        if (pending_inc.has_erasure_code_profile(profile)) {
          dout(20) << "erasure code profile " << profile << " already pending" << dendl;
          goto wait;
        }

        map<string,string> profile_map;
        err = osdmap.get_erasure_code_profile_default(cct,
                                                      profile_map,
                                                      &ss);
        if (err)
          goto reply;
        err = normalize_profile(name, profile_map, true, &ss);
        if (err)
          goto reply;
        dout(20) << "erasure code profile set " << profile << "="
                 << profile_map << dendl;
        pending_inc.set_erasure_code_profile(profile, profile_map);
        // wait for the profile to commit, then the command is retried
        goto wait;
      }
    }

    int rule;
    err = crush_rule_create_erasure(name, profile, &rule, &ss);
    if (err < 0) {
      switch(err) {
      case -EEXIST: // return immediately
        ss << "rule " << name << " already exists";
        err = 0;
        goto reply;
        break;
      case -EALREADY: // wait for pending to be proposed
        ss << "rule " << name << " already exists";
        err = 0;
        break;
      default: // non recoverable error
        goto reply;
        break;
      }
    } else {
      ss << "created rule " << name << " at " << rule;
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rm") {
    // Remove a CRUSH rule by name. Idempotent: a missing rule (in either the
    // committed or the pending crush map) replies success with err = 0.
    string name;
    cmd_getval(cmdmap, "name", name);

    if (!osdmap.crush->rule_exists(name)) {
      ss << "rule " << name << " does not exist";
      err = 0;
      goto reply;
    }

    // Work on a copy of the pending crush map so the edit is staged, not
    // applied directly.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.rule_exists(name)) {
      // Already removed in pending; still ack success after the proposal.
      ss << "rule " << name << " does not exist";
      err = 0;
    } else {
      int ruleno = newcrush.get_rule_id(name);
      ceph_assert(ruleno >= 0);

      // make sure it is not in use.
      // FIXME: this is ok in some situations, but let's not bother with that
      // complexity now.
      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
      if (osdmap.crush_rule_in_use(ruleset)) {
        // Refuse to delete a rule any pool still references.
        ss << "crush ruleset " << name << " " << ruleset << " is in use";
        err = -EBUSY;
        goto reply;
      }

      err = newcrush.remove_rule(ruleno);
      if (err < 0) {
        goto reply;
      }

      // Stage the full re-encoded crush map in the pending incremental.
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush rule rename") {
    // Rename a CRUSH rule. Designed to be idempotent so a replayed command
    // (src already gone, dst already present) succeeds.
    string srcname;
    string dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);
    if (srcname.empty() || dstname.empty()) {
      ss << "must specify both source rule name and destination rule name";
      err = -EINVAL;
      goto reply;
    }
    if (srcname == dstname) {
      // Renaming to the same name is a no-op success.
      ss << "destination rule name is equal to source rule name";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
      // srcname does not exist and dstname already exists
      // suppose this is a replay and return success
      // (so this command is idempotent)
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_rule(srcname, dstname, &ss);
    if (err < 0) {
      // ss has reason for failure
      goto reply;
    }
    // Stage the re-encoded crush map and ack once committed.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd setmaxosd") {
    // Set the osdmap's max_osd (the size of the OSD id space, not the OSD
    // count). Growing is always allowed up to mon_max_osd; shrinking is only
    // allowed when no OSD exists in the range being cut off.
    int64_t newmax;
    if (!cmd_getval(cmdmap, "newmax", newmax)) {
      ss << "unable to parse 'newmax' value '"
         << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
      err = -EINVAL;
      goto reply;
    }

    if (newmax > g_conf()->mon_max_osd) {
      err = -ERANGE;
      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
         << g_conf()->mon_max_osd << ")";
      goto reply;
    }

    // Don't allow shrinking OSD number as this will cause data loss
    // and may cause kernel crashes.
    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
    if (newmax < osdmap.get_max_osd()) {
      // Check if the OSDs exist between current max and new value.
      // If there are any OSDs exist, then don't allow shrinking number
      // of OSDs.
      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
        if (osdmap.exists(i)) {
          err = -EBUSY;
          ss << "cannot shrink max_osd to " << newmax
             << " because osd." << i << " (and possibly others) still in use";
          goto reply;
        }
      }
    }

    pending_inc.new_max_osd = newmax;
    ss << "set new max_osd = " << pending_inc.new_max_osd;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-full-ratio" ||
             prefix == "osd set-backfillfull-ratio" ||
             prefix == "osd set-nearfull-ratio") {
    // Set one of the cluster fullness thresholds (full / backfillfull /
    // nearfull) in the pending incremental.
    double n;
    if (!cmd_getval(cmdmap, "ratio", n)) {
      ss << "unable to parse 'ratio' value '"
         << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // NOTE(review): no range validation here — values outside [0.0, 1.0] are
    // accepted as-is; confirm whether that is intended.
    if (prefix == "osd set-full-ratio")
      pending_inc.new_full_ratio = n;
    else if (prefix == "osd set-backfillfull-ratio")
      pending_inc.new_backfillfull_ratio = n;
    else if (prefix == "osd set-nearfull-ratio")
      pending_inc.new_nearfull_ratio = n;
    ss << prefix << " " << n;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // Raise (or, with --yes-i-really-mean-it, force) the minimum client
    // release the cluster will accept. Refuses to go below what the current
    // osdmap features already require, and warns about connected daemons or
    // clients that would be locked out.
    string v;
    cmd_getval(cmdmap, "version", v);
    ceph_release_t vno = ceph_release_from_name(v);
    if (!vno) {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // Evaluate against the osdmap with pending changes applied, so the check
    // reflects what will actually commit.
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = vno;
    auto mvno = newmap.get_min_compat_client();
    if (vno < mvno) {
      ss << "osdmap current utilizes features that require " << mvno
         << "; cannot set require_min_compat_client below that to " << vno;
      err = -EPERM;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      // Scan the feature map of currently connected clients/MDS/MGR and list
      // every one that lacks the features implied by the requested release.
      FeatureMap m;
      mon->get_combined_feature_map(&m);
      uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
      bool first = true;
      bool ok = true;
      for (int type : {
            CEPH_ENTITY_TYPE_CLIENT,
            CEPH_ENTITY_TYPE_MDS,
            CEPH_ENTITY_TYPE_MGR }) {
        auto p = m.m.find(type);
        if (p == m.m.end()) {
          continue;
        }
        for (auto& q : p->second) {
          uint64_t missing = ~q.first & features;
          if (missing) {
            if (first) {
              ss << "cannot set require_min_compat_client to " << v << ": ";
            } else {
              ss << "; ";
            }
            first = false;
            ss << q.second << " connected " << ceph_entity_type_name(type)
               << "(s) look like " << ceph_release_name(
                 ceph_release_from_features(q.first))
               << " (missing 0x" << std::hex << missing << std::dec << ")";
            ok = false;
          }
        }
      }
      if (!ok) {
        ss << "; add --yes-i-really-mean-it to do it anyway";
        err = -EPERM;
        goto reply;
      }
    }
    ss << "set require_min_compat_client to " << vno;
    pending_inc.new_require_min_compat_client = vno;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    // "osd pause" / "osd unpause" toggle both read and write pause bits.
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd set") {
    // Set a named cluster-wide osdmap flag; prepare_set_flag() stages the
    // flag and handles the proposal/reply itself.
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);

    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else if (key == "pglog_hardlimit") {
      // This flag is gated: require OSDs to be up (or --yes-i-really-mean-it)
      // so the feature check below is meaningful.
      if (!osdmap.get_num_up_osds() && !sure) {
        ss << "Not advisable to continue since no OSDs are up. Pass "
           << "--yes-i-really-mean-it if you really wish to continue.";
        err = -EPERM;
        goto reply;
      }
      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
      // we are reusing a jewel feature bit that was retired in luminous.
      if (osdmap.require_osd_release >= ceph_release_t::luminous &&
          (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
           || sure)) {
        return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
      } else {
        ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      // Unknown flag: err is set and control falls through to the common
      // reply path after the dispatch chain.
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }
  } else if (prefix == "osd unset") {
    // Clear a named cluster-wide osdmap flag; mirrors the "osd set" dispatch
    // above (pglog_hardlimit intentionally has no unset path here).
    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }
  } else if (prefix == "osd require-osd-release") {
    // Raise require_osd_release. Per-release gates check that all mons carry
    // the release's monmap feature and (unless --yes-i-really-mean-it) that
    // all up OSDs advertise the matching SERVER_* feature. Lowering is
    // refused; setting the current value is an idempotent success.
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply;
    }
    ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
    if (!osdmap.get_num_up_osds() && !sure) {
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply;
    }
    if (rel == ceph_release_t::mimic) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_MIMIC)) {
        ss << "not all mons are mimic";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::nautilus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_NAUTILUS)) {
        ss << "not all mons are nautilus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::octopus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_OCTOPUS)) {
        ss << "not all mons are octopus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      // Any release without an explicit gate above is not yet supported.
      ss << "not supported for this release yet";
      err = -EPERM;
      goto reply;
    }
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
  } else if (prefix == "osd down" ||
             prefix == "osd out" ||
             prefix == "osd in" ||
             prefix == "osd rm" ||
             prefix == "osd stop") {
    // Shared handler for per-OSD state changes. Accepts a list of ids or a
    // wildcard ("any"/"all"/"*"); per-OSD results accumulate in ss, and a
    // proposal is made only if at least one OSD actually changed (any).

    bool any = false;       // did we stage at least one change?
    bool stop = false;      // wildcard consumed; stop iterating idvec
    bool verbose = true;    // suppress "already ..." spam for wildcards
    bool definitely_dead = false;

    vector<string> idvec;
    cmd_getval(cmdmap, "ids", idvec);
    cmd_getval(cmdmap, "definitely_dead", definitely_dead);
    // NOTE(review): this logs at error level via derr on every invocation —
    // looks like a debugging leftover; presumably should be dout.
    derr << "definitely_dead " << (int)definitely_dead << dendl;
    for (unsigned j = 0; j < idvec.size() && !stop; j++) {
      set<int> osds;

      // wildcard?
      if (j == 0 &&
          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
        if (prefix == "osd in") {
          // touch out osds only
          osdmap.get_out_existing_osds(osds);
        } else {
          osdmap.get_all_osds(osds);
        }
        stop = true;
        verbose = false; // so the output is less noisy.
      } else {
        long osd = parse_osd_id(idvec[j].c_str(), &ss);
        if (osd < 0) {
          // NOTE(review): message lacks a space before the id
          // ("invalid osd id-1"); invalid ids are skipped, not fatal.
          ss << "invalid osd id" << osd;
          err = -EINVAL;
          continue;
        } else if (!osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist. ";
          continue;
        }

        osds.insert(osd);
      }

      for (auto &osd : osds) {
        if (prefix == "osd down") {
          if (osdmap.is_down(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already down. ";
          } else {
            // Setting the UP bit in pending_osd_state marks it for clearing
            // (XOR semantics), i.e. the OSD will be recorded as down.
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
            ss << "marked down osd." << osd << ". ";
            any = true;
          }
          if (definitely_dead) {
            // Record the epoch at which the OSD is known dead in its xinfo,
            // even if it was already down.
            if (!pending_inc.new_xinfo.count(osd)) {
              pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
            }
            if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
              any = true;
            }
            pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
          }
        } else if (prefix == "osd out") {
          if (osdmap.is_out(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already out. ";
          } else {
            pending_inc.new_weight[osd] = CEPH_OSD_OUT;
            if (osdmap.osd_weight[osd]) {
              // Remember the previous weight so "osd in" can restore it.
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
            }
            ss << "marked out osd." << osd << ". ";
            std::ostringstream msg;
            msg << "Client " << op->get_session()->entity_name
                << " marked osd." << osd << " out";
            if (osdmap.is_up(osd)) {
              msg << ", while it was still marked up";
            } else {
              auto period = ceph_clock_now() - down_pending_out[osd];
              msg << ", after it was down for " << int(period.sec())
                  << " seconds";
            }

            mon->clog->info() << msg.str();
            any = true;
          }
        } else if (prefix == "osd in") {
          if (osdmap.is_in(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already in. ";
          } else {
            if (osdmap.osd_xinfo[osd].old_weight > 0) {
              // Restore the weight saved when the OSD was marked out.
              pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = 0;
            } else {
              pending_inc.new_weight[osd] = CEPH_OSD_IN;
            }
            ss << "marked in osd." << osd << ". ";
            any = true;
          }
        } else if (prefix == "osd rm") {
          err = prepare_command_osd_remove(osd);

          if (err == -EBUSY) {
            // Still up: report and continue with remaining ids.
            if (any)
              ss << ", ";
            ss << "osd." << osd << " is still up; must be down before removal. ";
          } else {
            ceph_assert(err == 0);
            if (any) {
              ss << ", osd." << osd;
            } else {
              ss << "removed osd." << osd;
            }
            any = true;
          }
        } else if (prefix == "osd stop") {
          if (osdmap.is_stop(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already stopped. ";
          } else if (osdmap.is_down(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
            ss << "stop down osd." << osd << ". ";
            any = true;
          } else {
            // Up OSD: clear UP and set STOP in one pending-state change.
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
            ss << "stop osd." << osd << ". ";
            any = true;
          }
        }
      }
    }
    if (any) {
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
                                                            get_last_committed() + 1));
      return true;
    }
11352 } else if (prefix == "osd set-group" ||
11353 prefix == "osd unset-group" ||
11354 prefix == "osd add-noup" ||
11355 prefix == "osd add-nodown" ||
11356 prefix == "osd add-noin" ||
11357 prefix == "osd add-noout" ||
11358 prefix == "osd rm-noup" ||
11359 prefix == "osd rm-nodown" ||
11360 prefix == "osd rm-noin" ||
11361 prefix == "osd rm-noout") {
11362 bool do_set = prefix == "osd set-group" ||
11363 prefix.find("add") != string::npos;
11364 string flag_str;
11365 unsigned flags = 0;
11366 vector<string> who;
11367 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11368 cmd_getval(cmdmap, "flags", flag_str);
11369 cmd_getval(cmdmap, "who", who);
11370 vector<string> raw_flags;
11371 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11372 for (auto& f : raw_flags) {
11373 if (f == "noup")
11374 flags |= CEPH_OSD_NOUP;
11375 else if (f == "nodown")
11376 flags |= CEPH_OSD_NODOWN;
11377 else if (f == "noin")
11378 flags |= CEPH_OSD_NOIN;
11379 else if (f == "noout")
11380 flags |= CEPH_OSD_NOOUT;
11381 else {
11382 ss << "unrecognized flag '" << f << "', must be one of "
11383 << "{noup,nodown,noin,noout}";
11384 err = -EINVAL;
11385 goto reply;
11386 }
11387 }
11388 } else {
11389 cmd_getval(cmdmap, "ids", who);
11390 if (prefix.find("noup") != string::npos)
11391 flags = CEPH_OSD_NOUP;
11392 else if (prefix.find("nodown") != string::npos)
11393 flags = CEPH_OSD_NODOWN;
11394 else if (prefix.find("noin") != string::npos)
11395 flags = CEPH_OSD_NOIN;
11396 else if (prefix.find("noout") != string::npos)
11397 flags = CEPH_OSD_NOOUT;
11398 else
11399 ceph_assert(0 == "Unreachable!");
11400 }
11401 if (flags == 0) {
11402 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11403 err = -EINVAL;
11404 goto reply;
11405 }
11406 if (who.empty()) {
11407 ss << "must specify at least one or more targets to set/unset";
11408 err = -EINVAL;
11409 goto reply;
11410 }
11411 set<int> osds;
11412 set<int> crush_nodes;
11413 set<int> device_classes;
11414 for (auto& w : who) {
11415 if (w == "any" || w == "all" || w == "*") {
11416 osdmap.get_all_osds(osds);
11417 break;
11418 }
11419 std::stringstream ts;
11420 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11421 osds.insert(osd);
11422 } else if (osdmap.crush->name_exists(w)) {
11423 crush_nodes.insert(osdmap.crush->get_item_id(w));
11424 } else if (osdmap.crush->class_exists(w)) {
11425 device_classes.insert(osdmap.crush->get_class_id(w));
11426 } else {
11427 ss << "unable to parse osd id or crush node or device class: "
11428 << "\"" << w << "\". ";
11429 }
11430 }
11431 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11432 // ss has reason for failure
11433 err = -EINVAL;
11434 goto reply;
11435 }
11436 bool any = false;
11437 for (auto osd : osds) {
11438 if (!osdmap.exists(osd)) {
11439 ss << "osd." << osd << " does not exist. ";
11440 continue;
11441 }
11442 if (do_set) {
11443 if (flags & CEPH_OSD_NOUP) {
11444 any |= osdmap.is_noup_by_osd(osd) ?
11445 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11446 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11447 }
11448 if (flags & CEPH_OSD_NODOWN) {
11449 any |= osdmap.is_nodown_by_osd(osd) ?
11450 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11451 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11452 }
11453 if (flags & CEPH_OSD_NOIN) {
11454 any |= osdmap.is_noin_by_osd(osd) ?
11455 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11456 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11457 }
11458 if (flags & CEPH_OSD_NOOUT) {
11459 any |= osdmap.is_noout_by_osd(osd) ?
11460 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11461 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11462 }
11463 } else {
11464 if (flags & CEPH_OSD_NOUP) {
11465 any |= osdmap.is_noup_by_osd(osd) ?
11466 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11467 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11468 }
11469 if (flags & CEPH_OSD_NODOWN) {
11470 any |= osdmap.is_nodown_by_osd(osd) ?
11471 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11472 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11473 }
11474 if (flags & CEPH_OSD_NOIN) {
11475 any |= osdmap.is_noin_by_osd(osd) ?
11476 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11477 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11478 }
11479 if (flags & CEPH_OSD_NOOUT) {
11480 any |= osdmap.is_noout_by_osd(osd) ?
11481 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11482 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11483 }
11484 }
11485 }
11486 for (auto& id : crush_nodes) {
11487 auto old_flags = osdmap.get_crush_node_flags(id);
11488 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11489 pending_flags |= old_flags; // adopt existing flags first!
11490 if (do_set) {
11491 pending_flags |= flags;
11492 } else {
11493 pending_flags &= ~flags;
11494 }
11495 any = true;
11496 }
11497 for (auto& id : device_classes) {
11498 auto old_flags = osdmap.get_device_class_flags(id);
11499 auto& pending_flags = pending_inc.new_device_class_flags[id];
11500 pending_flags |= old_flags;
11501 if (do_set) {
11502 pending_flags |= flags;
11503 } else {
11504 pending_flags &= ~flags;
11505 }
11506 any = true;
11507 }
11508 if (any) {
11509 getline(ss, rs);
11510 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11511 get_last_committed() + 1));
11512 return true;
11513 }
  } else if (prefix == "osd pg-temp") {
    // Manually set (or, with an empty id list, clear) the pg_temp mapping of
    // a PG. The new acting set must respect the pool's min_size/size bounds.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (pending_inc.new_pg_temp.count(pgid)) {
      // An update for this pgid is already staged: retry after it commits.
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty())  {
      // Empty id list clears the pg_temp entry.
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Set a primary_temp override for a PG (osd id -1 clears it). Requires
    // clients new enough to understand primary_temp (firefly or later).
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 is allowed: it removes the override rather than naming an OSD.
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by perturbing its pg_temp mapping; once the PG
    // repeers, it will discard the temp mapping and return to its CRUSH
    // placement.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change.  Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
  } else if (prefix == "osd pg-upmap" ||
             prefix == "osd rm-pg-upmap" ||
             prefix == "osd pg-upmap-items" ||
             prefix == "osd rm-pg-upmap-items") {
    // pg-upmap family: explicit placement overrides. Requires luminous+
    // clients and the OSDMAP_PG_UPMAP cluster feature.
    if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
      ss << "min_compat_client "
         << osdmap.require_min_compat_client
         << " < luminous, which is required for pg-upmap. "
         << "Try 'ceph osd set-require-min-compat-client luminous' "
         << "before using the new interface";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (pending_inc.old_pools.count(pgid.pool())) {
      // Pool deletion is staged; answer -ENOENT once the deletion commits.
      ss << "pool of " << pgid << " is pending removal";
      err = -ENOENT;
      getline(ss, rs);
      wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
      return true;
    }

    // Which of the four sub-commands are we running?
    enum {
      OP_PG_UPMAP,
      OP_RM_PG_UPMAP,
      OP_PG_UPMAP_ITEMS,
      OP_RM_PG_UPMAP_ITEMS,
    } option;

    if (prefix == "osd pg-upmap") {
      option = OP_PG_UPMAP;
    } else if (prefix == "osd rm-pg-upmap") {
      option = OP_RM_PG_UPMAP;
    } else if (prefix == "osd pg-upmap-items") {
      option = OP_PG_UPMAP_ITEMS;
    } else {
      option = OP_RM_PG_UPMAP_ITEMS;
    }

    // check pending upmap changes
    // If this pgid already has a staged upmap change of the same family,
    // retry after it commits rather than stacking conflicting edits.
    switch (option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP:
      if (pending_inc.new_pg_upmap.count(pgid) ||
          pending_inc.old_pg_upmap.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS:
      if (pending_inc.new_pg_upmap_items.count(pgid) ||
          pending_inc.old_pg_upmap_items.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    default:
      ceph_abort_msg("invalid option");
    }

    switch (option) {
    case OP_PG_UPMAP:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // The explicit mapping must still satisfy the pool's size bounds.
        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
        if ((int)id_vec.size() < pool_min_size) {
          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
             << pool_min_size << ")";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)id_vec.size() > pool_size) {
          ss << "num of osds (" << id_vec.size() <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<int32_t> new_pg_upmap;
        for (auto osd : id_vec) {
          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
            ss << "osd." << osd << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          // Skip duplicate ids rather than failing.
          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
          if (it != new_pg_upmap.end()) {
            ss << "osd." << osd << " already exists, ";
            continue;
          }
          new_pg_upmap.push_back(osd);
        }

        if (new_pg_upmap.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
      }
      break;

    case OP_RM_PG_UPMAP:
      {
        pending_inc.old_pg_upmap.insert(pgid);
        ss << "clear " << pgid << " pg_upmap mapping";
      }
      break;

    case OP_PG_UPMAP_ITEMS:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // Items come as (from, to) osd-id pairs, hence an even count.
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        ostringstream items;
        items << "[";
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11845 int from = *p++;
11846 int to = *p;
11847 if (from == to) {
11848 ss << "from osd." << from << " == to osd." << to << ", ";
11849 continue;
11850 }
11851 if (!osdmap.exists(from)) {
11852 ss << "osd." << from << " does not exist";
11853 err = -ENOENT;
11854 goto reply;
11855 }
11856 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11857 ss << "osd." << to << " does not exist";
11858 err = -ENOENT;
11859 goto reply;
11860 }
11861 pair<int32_t,int32_t> entry = make_pair(from, to);
11862 auto it = std::find(new_pg_upmap_items.begin(),
11863 new_pg_upmap_items.end(), entry);
11864 if (it != new_pg_upmap_items.end()) {
11865 ss << "osd." << from << " -> osd." << to << " already exists, ";
11866 continue;
11867 }
11868 new_pg_upmap_items.push_back(entry);
11869 items << from << "->" << to << ",";
11870 }
11871 string out(items.str());
11872 out.resize(out.size() - 1); // drop last ','
11873 out += "]";
11874
11875 if (new_pg_upmap_items.empty()) {
11876 ss << "no valid upmap items(pairs) is specified";
11877 err = -EINVAL;
11878 goto reply;
11879 }
11880
11881 pending_inc.new_pg_upmap_items[pgid] =
11882 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11883 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11884 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11885 }
11886 break;
11887
11888 case OP_RM_PG_UPMAP_ITEMS:
11889 {
11890 pending_inc.old_pg_upmap_items.insert(pgid);
11891 ss << "clear " << pgid << " pg_upmap_items mapping";
11892 }
11893 break;
11894
11895 default:
11896 ceph_abort_msg("invalid option");
11897 }
11898
11899 goto update;
11900 } else if (prefix == "osd primary-affinity") {
11901 int64_t id;
11902 if (!cmd_getval(cmdmap, "id", id)) {
11903 ss << "invalid osd id value '"
11904 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11905 err = -EINVAL;
11906 goto reply;
11907 }
11908 double w;
11909 if (!cmd_getval(cmdmap, "weight", w)) {
11910 ss << "unable to parse 'weight' value '"
11911 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11912 err = -EINVAL;
11913 goto reply;
11914 }
11915 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11916 if (ww < 0L) {
11917 ss << "weight must be >= 0";
11918 err = -EINVAL;
11919 goto reply;
11920 }
11921 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11922 osdmap.require_min_compat_client < ceph_release_t::firefly) {
11923 ss << "require_min_compat_client "
11924 << osdmap.require_min_compat_client
11925 << " < firefly, which is required for primary-affinity";
11926 err = -EPERM;
11927 goto reply;
11928 }
11929 if (osdmap.exists(id)) {
11930 pending_inc.new_primary_affinity[id] = ww;
11931 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11932 getline(ss, rs);
11933 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11934 get_last_committed() + 1));
11935 return true;
11936 } else {
11937 ss << "osd." << id << " does not exist";
11938 err = -ENOENT;
11939 goto reply;
11940 }
11941 } else if (prefix == "osd reweight") {
11942 int64_t id;
11943 if (!cmd_getval(cmdmap, "id", id)) {
11944 ss << "unable to parse osd id value '"
11945 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11946 err = -EINVAL;
11947 goto reply;
11948 }
11949 double w;
11950 if (!cmd_getval(cmdmap, "weight", w)) {
11951 ss << "unable to parse weight value '"
11952 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11953 err = -EINVAL;
11954 goto reply;
11955 }
11956 long ww = (int)((double)CEPH_OSD_IN*w);
11957 if (ww < 0L) {
11958 ss << "weight must be >= 0";
11959 err = -EINVAL;
11960 goto reply;
11961 }
11962 if (osdmap.exists(id)) {
11963 pending_inc.new_weight[id] = ww;
11964 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
11965 getline(ss, rs);
11966 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11967 get_last_committed() + 1));
11968 return true;
11969 } else {
11970 ss << "osd." << id << " does not exist";
11971 err = -ENOENT;
11972 goto reply;
11973 }
11974 } else if (prefix == "osd reweightn") {
11975 map<int32_t, uint32_t> weights;
11976 err = parse_reweights(cct, cmdmap, osdmap, &weights);
11977 if (err) {
11978 ss << "unable to parse 'weights' value '"
11979 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
11980 goto reply;
11981 }
11982 pending_inc.new_weight.insert(weights.begin(), weights.end());
11983 wait_for_finished_proposal(
11984 op,
11985 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11986 return true;
11987 } else if (prefix == "osd lost") {
11988 int64_t id;
11989 if (!cmd_getval(cmdmap, "id", id)) {
11990 ss << "unable to parse osd id value '"
11991 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11992 err = -EINVAL;
11993 goto reply;
11994 }
11995 bool sure = false;
11996 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11997 if (!sure) {
11998 ss << "are you SURE? this might mean real, permanent data loss. pass "
11999 "--yes-i-really-mean-it if you really do.";
12000 err = -EPERM;
12001 goto reply;
12002 } else if (!osdmap.exists(id)) {
12003 ss << "osd." << id << " does not exist";
12004 err = -ENOENT;
12005 goto reply;
12006 } else if (!osdmap.is_down(id)) {
12007 ss << "osd." << id << " is not down";
12008 err = -EBUSY;
12009 goto reply;
12010 } else {
12011 epoch_t e = osdmap.get_info(id).down_at;
12012 pending_inc.new_lost[id] = e;
12013 ss << "marked osd lost in epoch " << e;
12014 getline(ss, rs);
12015 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12016 get_last_committed() + 1));
12017 return true;
12018 }
12019
12020 } else if (prefix == "osd destroy-actual" ||
12021 prefix == "osd purge-actual" ||
12022 prefix == "osd purge-new") {
12023 /* Destroying an OSD means that we don't expect to further make use of
12024 * the OSDs data (which may even become unreadable after this operation),
12025 * and that we are okay with scrubbing all its cephx keys and config-key
12026 * data (which may include lockbox keys, thus rendering the osd's data
12027 * unreadable).
12028 *
12029 * The OSD will not be removed. Instead, we will mark it as destroyed,
12030 * such that a subsequent call to `create` will not reuse the osd id.
12031 * This will play into being able to recreate the OSD, at the same
12032 * crush location, with minimal data movement.
12033 */
12034
12035 // make sure authmon is writeable.
12036 if (!mon->authmon()->is_writeable()) {
12037 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12038 << "osd destroy" << dendl;
12039 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12040 return false;
12041 }
12042
12043 int64_t id;
12044 if (!cmd_getval(cmdmap, "id", id)) {
12045 auto p = cmdmap.find("id");
12046 if (p == cmdmap.end()) {
12047 ss << "no osd id specified";
12048 } else {
12049 ss << "unable to parse osd id value '"
12050 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12051 }
12052 err = -EINVAL;
12053 goto reply;
12054 }
12055
12056 bool is_destroy = (prefix == "osd destroy-actual");
12057 if (!is_destroy) {
12058 ceph_assert("osd purge-actual" == prefix ||
12059 "osd purge-new" == prefix);
12060 }
12061
12062 bool sure = false;
12063 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12064 if (!sure) {
12065 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12066 << "This will mean real, permanent data loss, as well "
12067 << "as deletion of cephx and lockbox keys. "
12068 << "Pass --yes-i-really-mean-it if you really do.";
12069 err = -EPERM;
12070 goto reply;
12071 } else if (!osdmap.exists(id)) {
12072 ss << "osd." << id << " does not exist";
12073 err = 0; // idempotent
12074 goto reply;
12075 } else if (osdmap.is_up(id)) {
12076 ss << "osd." << id << " is not `down`.";
12077 err = -EBUSY;
12078 goto reply;
12079 } else if (is_destroy && osdmap.is_destroyed(id)) {
12080 ss << "destroyed osd." << id;
12081 err = 0;
12082 goto reply;
12083 }
12084
12085 if (prefix == "osd purge-new" &&
12086 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12087 ss << "osd." << id << " is not new";
12088 err = -EPERM;
12089 goto reply;
12090 }
12091
12092 bool goto_reply = false;
12093
12094 paxos->plug();
12095 if (is_destroy) {
12096 err = prepare_command_osd_destroy(id, ss);
12097 // we checked above that it should exist.
12098 ceph_assert(err != -ENOENT);
12099 } else {
12100 err = prepare_command_osd_purge(id, ss);
12101 if (err == -ENOENT) {
12102 err = 0;
12103 ss << "osd." << id << " does not exist.";
12104 goto_reply = true;
12105 }
12106 }
12107 paxos->unplug();
12108
12109 if (err < 0 || goto_reply) {
12110 goto reply;
12111 }
12112
12113 if (is_destroy) {
12114 ss << "destroyed osd." << id;
12115 } else {
12116 ss << "purged osd." << id;
12117 }
12118
12119 getline(ss, rs);
12120 wait_for_finished_proposal(op,
12121 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12122 force_immediate_propose();
12123 return true;
12124
12125 } else if (prefix == "osd new") {
12126
12127 // make sure authmon is writeable.
12128 if (!mon->authmon()->is_writeable()) {
12129 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12130 << "osd new" << dendl;
12131 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12132 return false;
12133 }
12134
12135 map<string,string> param_map;
12136
12137 bufferlist bl = m->get_data();
12138 string param_json = bl.to_str();
12139 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12140
12141 err = get_json_str_map(param_json, ss, &param_map);
12142 if (err < 0)
12143 goto reply;
12144
12145 dout(20) << __func__ << " osd new params " << param_map << dendl;
12146
12147 paxos->plug();
12148 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12149 paxos->unplug();
12150
12151 if (err < 0) {
12152 goto reply;
12153 }
12154
12155 if (f) {
12156 f->flush(rdata);
12157 } else {
12158 rdata.append(ss);
12159 }
12160
12161 if (err == EEXIST) {
12162 // idempotent operation
12163 err = 0;
12164 goto reply;
12165 }
12166
12167 wait_for_finished_proposal(op,
12168 new Monitor::C_Command(mon, op, 0, rs, rdata,
12169 get_last_committed() + 1));
12170 force_immediate_propose();
12171 return true;
12172
12173 } else if (prefix == "osd create") {
12174
12175 // optional id provided?
12176 int64_t id = -1, cmd_id = -1;
12177 if (cmd_getval(cmdmap, "id", cmd_id)) {
12178 if (cmd_id < 0) {
12179 ss << "invalid osd id value '" << cmd_id << "'";
12180 err = -EINVAL;
12181 goto reply;
12182 }
12183 dout(10) << " osd create got id " << cmd_id << dendl;
12184 }
12185
12186 uuid_d uuid;
12187 string uuidstr;
12188 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12189 if (!uuid.parse(uuidstr.c_str())) {
12190 ss << "invalid uuid value '" << uuidstr << "'";
12191 err = -EINVAL;
12192 goto reply;
12193 }
12194 // we only care about the id if we also have the uuid, to
12195 // ensure the operation's idempotency.
12196 id = cmd_id;
12197 }
12198
12199 int32_t new_id = -1;
12200 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12201 if (err < 0) {
12202 if (err == -EAGAIN) {
12203 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12204 return true;
12205 }
12206 // a check has failed; reply to the user.
12207 goto reply;
12208
12209 } else if (err == EEXIST) {
12210 // this is an idempotent operation; we can go ahead and reply.
12211 if (f) {
12212 f->open_object_section("created_osd");
12213 f->dump_int("osdid", new_id);
12214 f->close_section();
12215 f->flush(rdata);
12216 } else {
12217 ss << new_id;
12218 rdata.append(ss);
12219 }
12220 err = 0;
12221 goto reply;
12222 }
12223
12224 string empty_device_class;
12225 do_osd_create(id, uuid, empty_device_class, &new_id);
12226
12227 if (f) {
12228 f->open_object_section("created_osd");
12229 f->dump_int("osdid", new_id);
12230 f->close_section();
12231 f->flush(rdata);
12232 } else {
12233 ss << new_id;
12234 rdata.append(ss);
12235 }
12236 wait_for_finished_proposal(op,
12237 new Monitor::C_Command(mon, op, 0, rs, rdata,
12238 get_last_committed() + 1));
12239 return true;
12240
12241 } else if (prefix == "osd blacklist clear") {
12242 pending_inc.new_blacklist.clear();
12243 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12244 osdmap.get_blacklist(&blacklist);
12245 for (const auto &entry : blacklist) {
12246 pending_inc.old_blacklist.push_back(entry.first);
12247 }
12248 ss << " removed all blacklist entries";
12249 getline(ss, rs);
12250 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12251 get_last_committed() + 1));
12252 return true;
12253 } else if (prefix == "osd blacklist") {
12254 string addrstr;
12255 cmd_getval(cmdmap, "addr", addrstr);
12256 entity_addr_t addr;
12257 if (!addr.parse(addrstr.c_str(), 0)) {
12258 ss << "unable to parse address " << addrstr;
12259 err = -EINVAL;
12260 goto reply;
12261 }
12262 else {
12263 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12264 // always blacklist type ANY
12265 addr.set_type(entity_addr_t::TYPE_ANY);
12266 } else {
12267 addr.set_type(entity_addr_t::TYPE_LEGACY);
12268 }
12269
12270 string blacklistop;
12271 cmd_getval(cmdmap, "blacklistop", blacklistop);
12272 if (blacklistop == "add") {
12273 utime_t expires = ceph_clock_now();
12274 double d;
12275 // default one hour
12276 cmd_getval(cmdmap, "expire", d,
12277 g_conf()->mon_osd_blacklist_default_expire);
12278 expires += d;
12279
12280 pending_inc.new_blacklist[addr] = expires;
12281
12282 {
12283 // cancel any pending un-blacklisting request too
12284 auto it = std::find(pending_inc.old_blacklist.begin(),
12285 pending_inc.old_blacklist.end(), addr);
12286 if (it != pending_inc.old_blacklist.end()) {
12287 pending_inc.old_blacklist.erase(it);
12288 }
12289 }
12290
12291 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12292 getline(ss, rs);
12293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12294 get_last_committed() + 1));
12295 return true;
12296 } else if (blacklistop == "rm") {
12297 if (osdmap.is_blacklisted(addr) ||
12298 pending_inc.new_blacklist.count(addr)) {
12299 if (osdmap.is_blacklisted(addr))
12300 pending_inc.old_blacklist.push_back(addr);
12301 else
12302 pending_inc.new_blacklist.erase(addr);
12303 ss << "un-blacklisting " << addr;
12304 getline(ss, rs);
12305 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12306 get_last_committed() + 1));
12307 return true;
12308 }
12309 ss << addr << " isn't blacklisted";
12310 err = 0;
12311 goto reply;
12312 }
12313 }
12314 } else if (prefix == "osd pool mksnap") {
12315 string poolstr;
12316 cmd_getval(cmdmap, "pool", poolstr);
12317 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12318 if (pool < 0) {
12319 ss << "unrecognized pool '" << poolstr << "'";
12320 err = -ENOENT;
12321 goto reply;
12322 }
12323 string snapname;
12324 cmd_getval(cmdmap, "snap", snapname);
12325 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12326 if (p->is_unmanaged_snaps_mode()) {
12327 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12328 err = -EINVAL;
12329 goto reply;
12330 } else if (p->snap_exists(snapname.c_str())) {
12331 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12332 err = 0;
12333 goto reply;
12334 } else if (p->is_tier()) {
12335 ss << "pool " << poolstr << " is a cache tier";
12336 err = -EINVAL;
12337 goto reply;
12338 }
12339 pg_pool_t *pp = 0;
12340 if (pending_inc.new_pools.count(pool))
12341 pp = &pending_inc.new_pools[pool];
12342 if (!pp) {
12343 pp = &pending_inc.new_pools[pool];
12344 *pp = *p;
12345 }
12346 if (pp->snap_exists(snapname.c_str())) {
12347 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12348 } else {
12349 pp->add_snap(snapname.c_str(), ceph_clock_now());
12350 pp->set_snap_epoch(pending_inc.epoch);
12351 ss << "created pool " << poolstr << " snap " << snapname;
12352 }
12353 getline(ss, rs);
12354 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12355 get_last_committed() + 1));
12356 return true;
12357 } else if (prefix == "osd pool rmsnap") {
12358 string poolstr;
12359 cmd_getval(cmdmap, "pool", poolstr);
12360 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12361 if (pool < 0) {
12362 ss << "unrecognized pool '" << poolstr << "'";
12363 err = -ENOENT;
12364 goto reply;
12365 }
12366 string snapname;
12367 cmd_getval(cmdmap, "snap", snapname);
12368 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12369 if (p->is_unmanaged_snaps_mode()) {
12370 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12371 err = -EINVAL;
12372 goto reply;
12373 } else if (!p->snap_exists(snapname.c_str())) {
12374 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12375 err = 0;
12376 goto reply;
12377 }
12378 pg_pool_t *pp = 0;
12379 if (pending_inc.new_pools.count(pool))
12380 pp = &pending_inc.new_pools[pool];
12381 if (!pp) {
12382 pp = &pending_inc.new_pools[pool];
12383 *pp = *p;
12384 }
12385 snapid_t sn = pp->snap_exists(snapname.c_str());
12386 if (sn) {
12387 pp->remove_snap(sn);
12388 pp->set_snap_epoch(pending_inc.epoch);
12389 ss << "removed pool " << poolstr << " snap " << snapname;
12390 } else {
12391 ss << "already removed pool " << poolstr << " snap " << snapname;
12392 }
12393 getline(ss, rs);
12394 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12395 get_last_committed() + 1));
12396 return true;
12397 } else if (prefix == "osd pool create") {
12398 int64_t pg_num, pg_num_min;
12399 int64_t pgp_num;
12400 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12401 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12402 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12403
12404 string pool_type_str;
12405 cmd_getval(cmdmap, "pool_type", pool_type_str);
12406 if (pool_type_str.empty())
12407 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12408
12409 string poolstr;
12410 cmd_getval(cmdmap, "pool", poolstr);
12411 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12412 if (pool_id >= 0) {
12413 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12414 if (pool_type_str != p->get_type_name()) {
12415 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12416 err = -EINVAL;
12417 } else {
12418 ss << "pool '" << poolstr << "' already exists";
12419 err = 0;
12420 }
12421 goto reply;
12422 }
12423
12424 int pool_type;
12425 if (pool_type_str == "replicated") {
12426 pool_type = pg_pool_t::TYPE_REPLICATED;
12427 } else if (pool_type_str == "erasure") {
12428 pool_type = pg_pool_t::TYPE_ERASURE;
12429 } else {
12430 ss << "unknown pool type '" << pool_type_str << "'";
12431 err = -EINVAL;
12432 goto reply;
12433 }
12434
12435 bool implicit_rule_creation = false;
12436 int64_t expected_num_objects = 0;
12437 string rule_name;
12438 cmd_getval(cmdmap, "rule", rule_name);
12439 string erasure_code_profile;
12440 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12441
12442 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12443 if (erasure_code_profile == "")
12444 erasure_code_profile = "default";
12445 //handle the erasure code profile
12446 if (erasure_code_profile == "default") {
12447 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12448 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12449 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12450 goto wait;
12451 }
12452
12453 map<string,string> profile_map;
12454 err = osdmap.get_erasure_code_profile_default(cct,
12455 profile_map,
12456 &ss);
12457 if (err)
12458 goto reply;
12459 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12460 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12461 goto wait;
12462 }
12463 }
12464 if (rule_name == "") {
12465 implicit_rule_creation = true;
12466 if (erasure_code_profile == "default") {
12467 rule_name = "erasure-code";
12468 } else {
12469 dout(1) << "implicitly use rule named after the pool: "
12470 << poolstr << dendl;
12471 rule_name = poolstr;
12472 }
12473 }
12474 cmd_getval(cmdmap, "expected_num_objects",
12475 expected_num_objects, int64_t(0));
12476 } else {
12477 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12478 // and put expected_num_objects to rule field
12479 if (erasure_code_profile != "") { // cmd is from CLI
12480 if (rule_name != "") {
12481 string interr;
12482 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12483 if (interr.length()) {
12484 ss << "error parsing integer value '" << rule_name << "': " << interr;
12485 err = -EINVAL;
12486 goto reply;
12487 }
12488 }
12489 rule_name = erasure_code_profile;
12490 } else { // cmd is well-formed
12491 cmd_getval(cmdmap, "expected_num_objects",
12492 expected_num_objects, int64_t(0));
12493 }
12494 }
12495
12496 if (!implicit_rule_creation && rule_name != "") {
12497 int rule;
12498 err = get_crush_rule(rule_name, &rule, &ss);
12499 if (err == -EAGAIN) {
12500 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12501 return true;
12502 }
12503 if (err)
12504 goto reply;
12505 }
12506
12507 if (expected_num_objects < 0) {
12508 ss << "'expected_num_objects' must be non-negative";
12509 err = -EINVAL;
12510 goto reply;
12511 }
12512
12513 if (expected_num_objects > 0 &&
12514 cct->_conf->osd_objectstore == "filestore" &&
12515 cct->_conf->filestore_merge_threshold > 0) {
12516 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12517 err = -EINVAL;
12518 goto reply;
12519 }
12520
12521 if (expected_num_objects == 0 &&
12522 cct->_conf->osd_objectstore == "filestore" &&
12523 cct->_conf->filestore_merge_threshold < 0) {
12524 int osds = osdmap.get_num_osds();
12525 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12526 ss << "For better initial performance on pools expected to store a "
12527 << "large number of objects, consider supplying the "
12528 << "expected_num_objects parameter when creating the pool.\n";
12529 }
12530 }
12531
12532 int64_t fast_read_param;
12533 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12534 FastReadType fast_read = FAST_READ_DEFAULT;
12535 if (fast_read_param == 0)
12536 fast_read = FAST_READ_OFF;
12537 else if (fast_read_param > 0)
12538 fast_read = FAST_READ_ON;
12539
12540 int64_t repl_size = 0;
12541 cmd_getval(cmdmap, "size", repl_size);
12542 int64_t target_size_bytes = 0;
12543 double target_size_ratio = 0.0;
12544 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12545 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12546
12547 string pg_autoscale_mode;
12548 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12549
12550 err = prepare_new_pool(poolstr,
12551 -1, // default crush rule
12552 rule_name,
12553 pg_num, pgp_num, pg_num_min,
12554 repl_size, target_size_bytes, target_size_ratio,
12555 erasure_code_profile, pool_type,
12556 (uint64_t)expected_num_objects,
12557 fast_read,
12558 pg_autoscale_mode,
12559 &ss);
12560 if (err < 0) {
12561 switch(err) {
12562 case -EEXIST:
12563 ss << "pool '" << poolstr << "' already exists";
12564 break;
12565 case -EAGAIN:
12566 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12567 return true;
12568 case -ERANGE:
12569 goto reply;
12570 default:
12571 goto reply;
12572 break;
12573 }
12574 } else {
12575 ss << "pool '" << poolstr << "' created";
12576 }
12577 getline(ss, rs);
12578 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12579 get_last_committed() + 1));
12580 return true;
12581
12582 } else if (prefix == "osd pool delete" ||
12583 prefix == "osd pool rm") {
12584 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12585 string poolstr, poolstr2, sure;
12586 cmd_getval(cmdmap, "pool", poolstr);
12587 cmd_getval(cmdmap, "pool2", poolstr2);
12588 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12589 if (pool < 0) {
12590 ss << "pool '" << poolstr << "' does not exist";
12591 err = 0;
12592 goto reply;
12593 }
12594
12595 bool force_no_fake = false;
12596 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12597 bool force = false;
12598 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12599 if (poolstr2 != poolstr ||
12600 (!force && !force_no_fake)) {
12601 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12602 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12603 << "followed by --yes-i-really-really-mean-it.";
12604 err = -EPERM;
12605 goto reply;
12606 }
12607 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12608 if (err == -EAGAIN) {
12609 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12610 return true;
12611 }
12612 if (err < 0)
12613 goto reply;
12614 goto update;
12615 } else if (prefix == "osd pool rename") {
12616 string srcpoolstr, destpoolstr;
12617 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12618 cmd_getval(cmdmap, "destpool", destpoolstr);
12619 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12620 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12621
12622 if (pool_src < 0) {
12623 if (pool_dst >= 0) {
12624 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12625 // of operations, assume this rename succeeded, as it is not changing
12626 // the current state. Make sure we output something understandable
12627 // for whoever is issuing the command, if they are paying attention,
12628 // in case it was not intentional; or to avoid a "wtf?" and a bug
12629 // report in case it was intentional, while expecting a failure.
12630 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12631 << destpoolstr << "' does -- assuming successful rename";
12632 err = 0;
12633 } else {
12634 ss << "unrecognized pool '" << srcpoolstr << "'";
12635 err = -ENOENT;
12636 }
12637 goto reply;
12638 } else if (pool_dst >= 0) {
12639 // source pool exists and so does the destination pool
12640 ss << "pool '" << destpoolstr << "' already exists";
12641 err = -EEXIST;
12642 goto reply;
12643 }
12644
12645 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12646 if (ret == 0) {
12647 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12648 } else {
12649 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12650 << cpp_strerror(ret);
12651 }
12652 getline(ss, rs);
12653 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12654 get_last_committed() + 1));
12655 return true;
12656
12657 } else if (prefix == "osd pool set") {
12658 err = prepare_command_pool_set(cmdmap, ss);
12659 if (err == -EAGAIN)
12660 goto wait;
12661 if (err < 0)
12662 goto reply;
12663
12664 getline(ss, rs);
12665 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12666 get_last_committed() + 1));
12667 return true;
12668 } else if (prefix == "osd tier add") {
12669 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12670 if (err == -EAGAIN)
12671 goto wait;
12672 if (err)
12673 goto reply;
12674 string poolstr;
12675 cmd_getval(cmdmap, "pool", poolstr);
12676 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12677 if (pool_id < 0) {
12678 ss << "unrecognized pool '" << poolstr << "'";
12679 err = -ENOENT;
12680 goto reply;
12681 }
12682 string tierpoolstr;
12683 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12684 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12685 if (tierpool_id < 0) {
12686 ss << "unrecognized pool '" << tierpoolstr << "'";
12687 err = -ENOENT;
12688 goto reply;
12689 }
12690 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12691 ceph_assert(p);
12692 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12693 ceph_assert(tp);
12694
12695 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12696 goto reply;
12697 }
12698
12699 // make sure new tier is empty
12700 string force_nonempty;
12701 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12702 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12703 if (pstats && pstats->stats.sum.num_objects != 0 &&
12704 force_nonempty != "--force-nonempty") {
12705 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12706 err = -ENOTEMPTY;
12707 goto reply;
12708 }
12709 if (tp->is_erasure()) {
12710 ss << "tier pool '" << tierpoolstr
12711 << "' is an ec pool, which cannot be a tier";
12712 err = -ENOTSUP;
12713 goto reply;
12714 }
12715 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12716 ((force_nonempty != "--force-nonempty") ||
12717 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12718 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12719 err = -ENOTEMPTY;
12720 goto reply;
12721 }
12722 // go
12723 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12724 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12725 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12726 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12727 return true;
12728 }
12729 np->tiers.insert(tierpool_id);
12730 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12731 ntp->tier_of = pool_id;
12732 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12733 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12734 get_last_committed() + 1));
12735 return true;
12736 } else if (prefix == "osd tier remove" ||
12737 prefix == "osd tier rm") {
12738 string poolstr;
12739 cmd_getval(cmdmap, "pool", poolstr);
12740 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12741 if (pool_id < 0) {
12742 ss << "unrecognized pool '" << poolstr << "'";
12743 err = -ENOENT;
12744 goto reply;
12745 }
12746 string tierpoolstr;
12747 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12748 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12749 if (tierpool_id < 0) {
12750 ss << "unrecognized pool '" << tierpoolstr << "'";
12751 err = -ENOENT;
12752 goto reply;
12753 }
12754 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12755 ceph_assert(p);
12756 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12757 ceph_assert(tp);
12758
12759 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12760 goto reply;
12761 }
12762
12763 if (p->tiers.count(tierpool_id) == 0) {
12764 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12765 err = 0;
12766 goto reply;
12767 }
12768 if (tp->tier_of != pool_id) {
12769 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12770 << osdmap.get_pool_name(tp->tier_of) << "': "
12771 // be scary about it; this is an inconsistency and bells must go off
12772 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12773 err = -EINVAL;
12774 goto reply;
12775 }
12776 if (p->read_tier == tierpool_id) {
12777 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12778 err = -EBUSY;
12779 goto reply;
12780 }
12781 // go
12782 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12783 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12784 if (np->tiers.count(tierpool_id) == 0 ||
12785 ntp->tier_of != pool_id ||
12786 np->read_tier == tierpool_id) {
12787 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12788 return true;
12789 }
12790 np->tiers.erase(tierpool_id);
12791 ntp->clear_tier();
12792 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12793 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12794 get_last_committed() + 1));
12795 return true;
12796 } else if (prefix == "osd tier set-overlay") {
12797 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12798 if (err == -EAGAIN)
12799 goto wait;
12800 if (err)
12801 goto reply;
12802 string poolstr;
12803 cmd_getval(cmdmap, "pool", poolstr);
12804 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12805 if (pool_id < 0) {
12806 ss << "unrecognized pool '" << poolstr << "'";
12807 err = -ENOENT;
12808 goto reply;
12809 }
12810 string overlaypoolstr;
12811 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12812 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12813 if (overlaypool_id < 0) {
12814 ss << "unrecognized pool '" << overlaypoolstr << "'";
12815 err = -ENOENT;
12816 goto reply;
12817 }
12818 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12819 ceph_assert(p);
12820 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12821 ceph_assert(overlay_p);
12822 if (p->tiers.count(overlaypool_id) == 0) {
12823 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12824 err = -EINVAL;
12825 goto reply;
12826 }
12827 if (p->read_tier == overlaypool_id) {
12828 err = 0;
12829 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12830 goto reply;
12831 }
12832 if (p->has_read_tier()) {
12833 ss << "pool '" << poolstr << "' has overlay '"
12834 << osdmap.get_pool_name(p->read_tier)
12835 << "'; please remove-overlay first";
12836 err = -EINVAL;
12837 goto reply;
12838 }
12839
12840 // go
12841 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12842 np->read_tier = overlaypool_id;
12843 np->write_tier = overlaypool_id;
12844 np->set_last_force_op_resend(pending_inc.epoch);
12845 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12846 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12847 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12848 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12849 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12850 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12851 get_last_committed() + 1));
12852 return true;
12853 } else if (prefix == "osd tier remove-overlay" ||
12854 prefix == "osd tier rm-overlay") {
12855 string poolstr;
12856 cmd_getval(cmdmap, "pool", poolstr);
12857 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12858 if (pool_id < 0) {
12859 ss << "unrecognized pool '" << poolstr << "'";
12860 err = -ENOENT;
12861 goto reply;
12862 }
12863 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12864 ceph_assert(p);
12865 if (!p->has_read_tier()) {
12866 err = 0;
12867 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12868 goto reply;
12869 }
12870
12871 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12872 goto reply;
12873 }
12874
12875 // go
12876 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12877 if (np->has_read_tier()) {
12878 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12879 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12880 nop->set_last_force_op_resend(pending_inc.epoch);
12881 }
12882 if (np->has_write_tier()) {
12883 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12884 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12885 nop->set_last_force_op_resend(pending_inc.epoch);
12886 }
12887 np->clear_read_tier();
12888 np->clear_write_tier();
12889 np->set_last_force_op_resend(pending_inc.epoch);
12890 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12891 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12892 get_last_committed() + 1));
12893 return true;
12894 } else if (prefix == "osd tier cache-mode") {
12895 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12896 if (err == -EAGAIN)
12897 goto wait;
12898 if (err)
12899 goto reply;
12900 string poolstr;
12901 cmd_getval(cmdmap, "pool", poolstr);
12902 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12903 if (pool_id < 0) {
12904 ss << "unrecognized pool '" << poolstr << "'";
12905 err = -ENOENT;
12906 goto reply;
12907 }
12908 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12909 ceph_assert(p);
12910 if (!p->is_tier()) {
12911 ss << "pool '" << poolstr << "' is not a tier";
12912 err = -EINVAL;
12913 goto reply;
12914 }
12915 string modestr;
12916 cmd_getval(cmdmap, "mode", modestr);
12917 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12918 if (int(mode) < 0) {
12919 ss << "'" << modestr << "' is not a valid cache mode";
12920 err = -EINVAL;
12921 goto reply;
12922 }
12923
12924 bool sure = false;
12925 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12926
12927 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
12928 mode == pg_pool_t::CACHEMODE_READFORWARD) {
12929 ss << "'" << modestr << "' is no longer a supported cache mode";
12930 err = -EPERM;
12931 goto reply;
12932 }
12933 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12934 mode != pg_pool_t::CACHEMODE_NONE &&
12935 mode != pg_pool_t::CACHEMODE_PROXY &&
12936 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12937 !sure) {
12938 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12939 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12940 err = -EPERM;
12941 goto reply;
12942 }
12943
12944 // pool already has this cache-mode set and there are no pending changes
12945 if (p->cache_mode == mode &&
12946 (pending_inc.new_pools.count(pool_id) == 0 ||
12947 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12948 ss << "set cache-mode for pool '" << poolstr << "'"
12949 << " to " << pg_pool_t::get_cache_mode_name(mode);
12950 err = 0;
12951 goto reply;
12952 }
12953
12954 /* Mode description:
12955 *
12956 * none: No cache-mode defined
12957 * forward: Forward all reads and writes to base pool [removed]
12958 * writeback: Cache writes, promote reads from base pool
12959 * readonly: Forward writes to base pool
12960 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
12961 * proxy: Proxy all reads and writes to base pool
12962 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12963 *
12964 * Hence, these are the allowed transitions:
12965 *
12966 * none -> any
12967 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12968 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
12969 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12970 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
12971 * writeback -> readproxy || proxy
12972 * readonly -> any
12973 */
12974
12975 // We check if the transition is valid against the current pool mode, as
12976 // it is the only committed state thus far. We will blantly squash
12977 // whatever mode is on the pending state.
12978
12979 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
12980 (mode != pg_pool_t::CACHEMODE_PROXY &&
12981 mode != pg_pool_t::CACHEMODE_READPROXY)) {
12982 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
12983 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
12984 << "' pool; only '"
12985 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
12986 << "','"
12987 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
12988 << "' allowed.";
12989 err = -EINVAL;
12990 goto reply;
12991 }
12992 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
12993 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12994 mode != pg_pool_t::CACHEMODE_PROXY &&
12995 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12996
12997 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
12998 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12999 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13000
13001 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13002 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13003 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13004
13005 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13006 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13007 mode != pg_pool_t::CACHEMODE_PROXY &&
13008 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13009
13010 const pool_stat_t* pstats =
13011 mon->mgrstatmon()->get_pool_stat(pool_id);
13012
13013 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13014 ss << "unable to set cache-mode '"
13015 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13016 << "': dirty objects found";
13017 err = -EBUSY;
13018 goto reply;
13019 }
13020 }
13021 // go
13022 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13023 np->cache_mode = mode;
13024 // set this both when moving to and from cache_mode NONE. this is to
13025 // capture legacy pools that were set up before this flag existed.
13026 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13027 ss << "set cache-mode for pool '" << poolstr
13028 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13029 if (mode == pg_pool_t::CACHEMODE_NONE) {
13030 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13031 ceph_assert(base_pool);
13032 if (base_pool->read_tier == pool_id ||
13033 base_pool->write_tier == pool_id)
13034 ss <<" (WARNING: pool is still configured as read or write tier)";
13035 }
13036 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13037 get_last_committed() + 1));
13038 return true;
13039 } else if (prefix == "osd tier add-cache") {
13040 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13041 if (err == -EAGAIN)
13042 goto wait;
13043 if (err)
13044 goto reply;
13045 string poolstr;
13046 cmd_getval(cmdmap, "pool", poolstr);
13047 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13048 if (pool_id < 0) {
13049 ss << "unrecognized pool '" << poolstr << "'";
13050 err = -ENOENT;
13051 goto reply;
13052 }
13053 string tierpoolstr;
13054 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13055 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13056 if (tierpool_id < 0) {
13057 ss << "unrecognized pool '" << tierpoolstr << "'";
13058 err = -ENOENT;
13059 goto reply;
13060 }
13061 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13062 ceph_assert(p);
13063 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13064 ceph_assert(tp);
13065
13066 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13067 goto reply;
13068 }
13069
13070 int64_t size = 0;
13071 if (!cmd_getval(cmdmap, "size", size)) {
13072 ss << "unable to parse 'size' value '"
13073 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13074 err = -EINVAL;
13075 goto reply;
13076 }
13077 // make sure new tier is empty
13078 const pool_stat_t *pstats =
13079 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13080 if (pstats && pstats->stats.sum.num_objects != 0) {
13081 ss << "tier pool '" << tierpoolstr << "' is not empty";
13082 err = -ENOTEMPTY;
13083 goto reply;
13084 }
13085 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13086 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13087 if (int(mode) < 0) {
13088 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13089 err = -EINVAL;
13090 goto reply;
13091 }
13092 HitSet::Params hsp;
13093 auto& cache_hit_set_type =
13094 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13095 if (cache_hit_set_type == "bloom") {
13096 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13097 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13098 hsp = HitSet::Params(bsp);
13099 } else if (cache_hit_set_type == "explicit_hash") {
13100 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13101 } else if (cache_hit_set_type == "explicit_object") {
13102 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13103 } else {
13104 ss << "osd tier cache default hit set type '"
13105 << cache_hit_set_type << "' is not a known type";
13106 err = -EINVAL;
13107 goto reply;
13108 }
13109 // go
13110 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13111 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13112 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13113 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13114 return true;
13115 }
13116 np->tiers.insert(tierpool_id);
13117 np->read_tier = np->write_tier = tierpool_id;
13118 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13119 np->set_last_force_op_resend(pending_inc.epoch);
13120 ntp->set_last_force_op_resend(pending_inc.epoch);
13121 ntp->tier_of = pool_id;
13122 ntp->cache_mode = mode;
13123 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13124 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13125 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13126 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13127 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13128 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13129 ntp->hit_set_params = hsp;
13130 ntp->target_max_bytes = size;
13131 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13132 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13133 get_last_committed() + 1));
13134 return true;
13135 } else if (prefix == "osd pool set-quota") {
13136 string poolstr;
13137 cmd_getval(cmdmap, "pool", poolstr);
13138 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13139 if (pool_id < 0) {
13140 ss << "unrecognized pool '" << poolstr << "'";
13141 err = -ENOENT;
13142 goto reply;
13143 }
13144
13145 string field;
13146 cmd_getval(cmdmap, "field", field);
13147 if (field != "max_objects" && field != "max_bytes") {
13148 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13149 err = -EINVAL;
13150 goto reply;
13151 }
13152
13153 // val could contain unit designations, so we treat as a string
13154 string val;
13155 cmd_getval(cmdmap, "val", val);
13156 string tss;
13157 int64_t value;
13158 if (field == "max_objects") {
13159 value = strict_sistrtoll(val.c_str(), &tss);
13160 } else if (field == "max_bytes") {
13161 value = strict_iecstrtoll(val.c_str(), &tss);
13162 } else {
13163 ceph_abort_msg("unrecognized option");
13164 }
13165 if (!tss.empty()) {
13166 ss << "error parsing value '" << val << "': " << tss;
13167 err = -EINVAL;
13168 goto reply;
13169 }
13170
13171 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13172 if (field == "max_objects") {
13173 pi->quota_max_objects = value;
13174 } else if (field == "max_bytes") {
13175 pi->quota_max_bytes = value;
13176 } else {
13177 ceph_abort_msg("unrecognized option");
13178 }
13179 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13180 rs = ss.str();
13181 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13182 get_last_committed() + 1));
13183 return true;
13184 } else if (prefix == "osd pool application enable" ||
13185 prefix == "osd pool application disable" ||
13186 prefix == "osd pool application set" ||
13187 prefix == "osd pool application rm") {
13188 err = prepare_command_pool_application(prefix, cmdmap, ss);
13189 if (err == -EAGAIN) {
13190 goto wait;
13191 } else if (err < 0) {
13192 goto reply;
13193 } else {
13194 goto update;
13195 }
13196 } else if (prefix == "osd force-create-pg") {
13197 pg_t pgid;
13198 string pgidstr;
13199 cmd_getval(cmdmap, "pgid", pgidstr);
13200 if (!pgid.parse(pgidstr.c_str())) {
13201 ss << "invalid pgid '" << pgidstr << "'";
13202 err = -EINVAL;
13203 goto reply;
13204 }
13205 if (!osdmap.pg_exists(pgid)) {
13206 ss << "pg " << pgid << " should not exist";
13207 err = -ENOENT;
13208 goto reply;
13209 }
13210 bool sure = false;
13211 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13212 if (!sure) {
13213 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13214 << "that the cluster will give up ever trying to recover the lost data. Do this "
13215 << "only if you are certain that all copies of the PG are in fact lost and you are "
13216 << "willing to accept that the data is permanently destroyed. Pass "
13217 << "--yes-i-really-mean-it to proceed.";
13218 err = -EPERM;
13219 goto reply;
13220 }
13221 bool creating_now;
13222 {
13223 std::lock_guard<std::mutex> l(creating_pgs_lock);
13224 auto emplaced = creating_pgs.pgs.emplace(
13225 pgid,
13226 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13227 ceph_clock_now()));
13228 creating_now = emplaced.second;
13229 }
13230 if (creating_now) {
13231 ss << "pg " << pgidstr << " now creating, ok";
13232 // set the pool's CREATING flag so that (1) the osd won't ignore our
13233 // create message and (2) we won't propose any future pg_num changes
13234 // until after the PG has been instantiated.
13235 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13236 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13237 }
13238 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13239 err = 0;
13240 goto update;
13241 } else {
13242 ss << "pg " << pgid << " already creating";
13243 err = 0;
13244 goto reply;
13245 }
13246 } else {
13247 err = -EINVAL;
13248 }
13249
13250 reply:
13251 getline(ss, rs);
13252 if (err < 0 && rs.length() == 0)
13253 rs = cpp_strerror(err);
13254 mon->reply_command(op, err, rs, rdata, get_last_committed());
13255 return ret;
13256
13257 update:
13258 getline(ss, rs);
13259 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13260 get_last_committed() + 1));
13261 return true;
13262
13263 wait:
13264 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13265 return true;
13266 }
13267
13268 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13269 {
13270 op->mark_osdmon_event(__func__);
13271
13272 auto m = op->get_req<MPoolOp>();
13273 MonSession *session = op->get_session();
13274 if (!session) {
13275 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13276 return true;
13277 }
13278
13279 switch (m->op) {
13280 case POOL_OP_CREATE_UNMANAGED_SNAP:
13281 case POOL_OP_DELETE_UNMANAGED_SNAP:
13282 {
13283 const std::string* pool_name = nullptr;
13284 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13285 if (pg_pool != nullptr) {
13286 pool_name = &osdmap.get_pool_name(m->pool);
13287 }
13288
13289 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13290 session->entity_name, session->caps,
13291 session->get_peer_socket_addr(),
13292 pool_name)) {
13293 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13294 << "privileges. message: " << *m << std::endl
13295 << "caps: " << session->caps << dendl;
13296 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13297 return true;
13298 }
13299 }
13300 break;
13301 default:
13302 if (!session->is_capable("osd", MON_CAP_W)) {
13303 dout(0) << "got pool op from entity with insufficient privileges. "
13304 << "message: " << *m << std::endl
13305 << "caps: " << session->caps << dendl;
13306 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13307 return true;
13308 }
13309 break;
13310 }
13311
13312 return false;
13313 }
13314
13315 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
13316 {
13317 op->mark_osdmon_event(__func__);
13318 auto m = op->get_req<MPoolOp>();
13319
13320 if (enforce_pool_op_caps(op)) {
13321 return true;
13322 }
13323
13324 if (m->fsid != mon->monmap->fsid) {
13325 dout(0) << __func__ << " drop message on fsid " << m->fsid
13326 << " != " << mon->monmap->fsid << " for " << *m << dendl;
13327 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13328 return true;
13329 }
13330
13331 if (m->op == POOL_OP_CREATE)
13332 return preprocess_pool_op_create(op);
13333
13334 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
13335 if (p == nullptr) {
13336 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
13337 if (m->op == POOL_OP_DELETE) {
13338 _pool_op_reply(op, 0, osdmap.get_epoch());
13339 } else {
13340 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13341 }
13342 return true;
13343 }
13344
13345 // check if the snap and snapname exist
13346 bool snap_exists = false;
13347 if (p->snap_exists(m->name.c_str()))
13348 snap_exists = true;
13349
13350 switch (m->op) {
13351 case POOL_OP_CREATE_SNAP:
13352 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
13353 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13354 return true;
13355 }
13356 if (snap_exists) {
13357 _pool_op_reply(op, 0, osdmap.get_epoch());
13358 return true;
13359 }
13360 return false;
13361 case POOL_OP_CREATE_UNMANAGED_SNAP:
13362 if (p->is_pool_snaps_mode()) {
13363 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13364 return true;
13365 }
13366 return false;
13367 case POOL_OP_DELETE_SNAP:
13368 if (p->is_unmanaged_snaps_mode()) {
13369 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13370 return true;
13371 }
13372 if (!snap_exists) {
13373 _pool_op_reply(op, 0, osdmap.get_epoch());
13374 return true;
13375 }
13376 return false;
13377 case POOL_OP_DELETE_UNMANAGED_SNAP:
13378 if (p->is_pool_snaps_mode()) {
13379 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13380 return true;
13381 }
13382 if (_is_removed_snap(m->pool, m->snapid)) {
13383 _pool_op_reply(op, 0, osdmap.get_epoch());
13384 return true;
13385 }
13386 return false;
13387 case POOL_OP_DELETE:
13388 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
13389 _pool_op_reply(op, 0, osdmap.get_epoch());
13390 return true;
13391 }
13392 return false;
13393 case POOL_OP_AUID_CHANGE:
13394 return false;
13395 default:
13396 ceph_abort();
13397 break;
13398 }
13399
13400 return false;
13401 }
13402
13403 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13404 {
13405 if (!osdmap.have_pg_pool(pool)) {
13406 dout(10) << __func__ << " pool " << pool << " snap " << snap
13407 << " - pool dne" << dendl;
13408 return true;
13409 }
13410 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13411 dout(10) << __func__ << " pool " << pool << " snap " << snap
13412 << " - in osdmap removed_snaps_queue" << dendl;
13413 return true;
13414 }
13415 snapid_t begin, end;
13416 int r = lookup_purged_snap(pool, snap, &begin, &end);
13417 if (r == 0) {
13418 dout(10) << __func__ << " pool " << pool << " snap " << snap
13419 << " - purged, [" << begin << "," << end << ")" << dendl;
13420 return true;
13421 }
13422 return false;
13423 }
13424
13425 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13426 {
13427 if (pending_inc.old_pools.count(pool)) {
13428 dout(10) << __func__ << " pool " << pool << " snap " << snap
13429 << " - pool pending deletion" << dendl;
13430 return true;
13431 }
13432 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13433 dout(10) << __func__ << " pool " << pool << " snap " << snap
13434 << " - in pending new_removed_snaps" << dendl;
13435 return true;
13436 }
13437 return false;
13438 }
13439
13440 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13441 {
13442 op->mark_osdmon_event(__func__);
13443 auto m = op->get_req<MPoolOp>();
13444 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13445 if (pool >= 0) {
13446 _pool_op_reply(op, 0, osdmap.get_epoch());
13447 return true;
13448 }
13449
13450 return false;
13451 }
13452
/**
 * Prepare a pool op that mutates the osdmap (snap create/delete, etc.).
 *
 * POOL_OP_CREATE / POOL_OP_DELETE are delegated to dedicated helpers.
 * For the snap ops we first validate against the committed pool, then
 * apply the change to a projected copy of the pool and stage it in
 * pending_inc.
 *
 * @return true when a proposal is pending (the reply is deferred to the
 *         commit callback), false when a reply has already been sent.
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  // committed view of the pool; used for the preliminary checks below
  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are not allowed on a cache tier
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    // shared idempotency check for pool-snap create/delete: creating an
    // existing snap or deleting a missing one replies success with no change
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	// a real change is needed; handled by the switch further below
	break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending version of the pool if one
  // exists in this incremental, else from the committed pool
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // apply the op to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // the newly allocated snapid is returned to the client in reply_data
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's current seq presumably was never
      // issued; reject rather than record a bogus removal
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  // stage the modified pool in the pending incremental
  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13607
13608 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13609 {
13610 op->mark_osdmon_event(__func__);
13611 int err = prepare_new_pool(op);
13612 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13613 return true;
13614 }
13615
13616 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13617 ostream *ss)
13618 {
13619 const string& poolstr = osdmap.get_pool_name(pool_id);
13620
13621 // If the Pool is in use by CephFS, refuse to delete it
13622 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13623 if (pending_fsmap.pool_in_use(pool_id)) {
13624 *ss << "pool '" << poolstr << "' is in use by CephFS";
13625 return -EBUSY;
13626 }
13627
13628 if (pool.tier_of >= 0) {
13629 *ss << "pool '" << poolstr << "' is a tier of '"
13630 << osdmap.get_pool_name(pool.tier_of) << "'";
13631 return -EBUSY;
13632 }
13633 if (!pool.tiers.empty()) {
13634 *ss << "pool '" << poolstr << "' has tiers";
13635 for(auto tier : pool.tiers) {
13636 *ss << " " << osdmap.get_pool_name(tier);
13637 }
13638 return -EBUSY;
13639 }
13640
13641 if (!g_conf()->mon_allow_pool_delete) {
13642 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13643 return -EPERM;
13644 }
13645
13646 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13647 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13648 return -EPERM;
13649 }
13650
13651 *ss << "pool '" << poolstr << "' removed";
13652 return 0;
13653 }
13654
13655 /**
13656 * Check if it is safe to add a tier to a base pool
13657 *
13658 * @return
13659 * True if the operation should proceed, false if we should abort here
13660 * (abort doesn't necessarily mean error, could be idempotency)
13661 */
13662 bool OSDMonitor::_check_become_tier(
13663 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13664 const int64_t base_pool_id, const pg_pool_t *base_pool,
13665 int *err,
13666 ostream *ss) const
13667 {
13668 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13669 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13670
13671 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13672 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13673 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13674 *err = -EBUSY;
13675 return false;
13676 }
13677
13678 if (base_pool->tiers.count(tier_pool_id)) {
13679 ceph_assert(tier_pool->tier_of == base_pool_id);
13680 *err = 0;
13681 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13682 << base_pool_name << "'";
13683 return false;
13684 }
13685
13686 if (base_pool->is_tier()) {
13687 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13688 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13689 << "multiple tiers are not yet supported.";
13690 *err = -EINVAL;
13691 return false;
13692 }
13693
13694 if (tier_pool->has_tiers()) {
13695 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13696 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13697 it != tier_pool->tiers.end(); ++it)
13698 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13699 *ss << " multiple tiers are not yet supported.";
13700 *err = -EINVAL;
13701 return false;
13702 }
13703
13704 if (tier_pool->is_tier()) {
13705 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13706 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13707 *err = -EINVAL;
13708 return false;
13709 }
13710
13711 *err = 0;
13712 return true;
13713 }
13714
13715
13716 /**
13717 * Check if it is safe to remove a tier from this base pool
13718 *
13719 * @return
13720 * True if the operation should proceed, false if we should abort here
13721 * (abort doesn't necessarily mean error, could be idempotency)
13722 */
13723 bool OSDMonitor::_check_remove_tier(
13724 const int64_t base_pool_id, const pg_pool_t *base_pool,
13725 const pg_pool_t *tier_pool,
13726 int *err, ostream *ss) const
13727 {
13728 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13729
13730 // Apply CephFS-specific checks
13731 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13732 if (pending_fsmap.pool_in_use(base_pool_id)) {
13733 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13734 // If the underlying pool is erasure coded and does not allow EC
13735 // overwrites, we can't permit the removal of the replicated tier that
13736 // CephFS relies on to access it
13737 *ss << "pool '" << base_pool_name <<
13738 "' does not allow EC overwrites and is in use by CephFS"
13739 " via its tier";
13740 *err = -EBUSY;
13741 return false;
13742 }
13743
13744 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13745 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13746 "tier is still in use as a writeback cache. Change the cache "
13747 "mode and flush the cache before removing it";
13748 *err = -EBUSY;
13749 return false;
13750 }
13751 }
13752
13753 *err = 0;
13754 return true;
13755 }
13756
/**
 * Queue removal of a pool in the pending incremental map.
 *
 * Validates the removal against both the committed and the pending pool
 * state, then records the deletion (or a fake rename, if configured) in
 * pending_inc along with cleanup of every per-pool mapping that would
 * otherwise go stale.
 *
 * @param pool     id of the pool to remove
 * @param ss       filled with a human-readable success/error message
 * @param no_fake  when true, bypass mon_fake_pool_delete and really remove
 * @return 0 on success (including already-pending removal), -EAGAIN if the
 *         pending pool info fails the checks (caller should retry after the
 *         current proposal commits), or a negative errno from
 *         _check_remove_pool().
 */
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  // NOTE(review): get_pg_pool() is dereferenced unchecked below; callers
  // presumably validate the pool id exists first — confirm against callers.
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // Check against the committed pool state first.
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // Idempotency: removal already queued in this proposal.
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // With mon_fake_pool_delete set (and no_fake unset), rename the pool to a
  // ".DELETED" name instead of actually destroying any data.
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      // An empty pg_temp entry in the incremental clears the mapping.
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      // -1 in the incremental clears the primary_temp mapping.
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap "
	       << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap.erase(it);
      } else {
	it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap_items " << p.first
	       << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap_items "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
	it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    // Re-encode the whole crush map into the incremental with the
    // choose_args for this pool stripped.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
13872
13873 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13874 {
13875 dout(10) << "_prepare_rename_pool " << pool << dendl;
13876 if (pending_inc.old_pools.count(pool)) {
13877 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13878 return -ENOENT;
13879 }
13880 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13881 p != pending_inc.new_pool_names.end();
13882 ++p) {
13883 if (p->second == newname && p->first != pool) {
13884 return -EEXIST;
13885 }
13886 }
13887
13888 pending_inc.new_pool_names[pool] = newname;
13889 return 0;
13890 }
13891
13892 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13893 {
13894 op->mark_osdmon_event(__func__);
13895 auto m = op->get_req<MPoolOp>();
13896 ostringstream ss;
13897 int ret = _prepare_remove_pool(m->pool, &ss, false);
13898 if (ret == -EAGAIN) {
13899 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13900 return true;
13901 }
13902 if (ret < 0)
13903 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13904 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13905 pending_inc.epoch));
13906 return true;
13907 }
13908
13909 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13910 int ret, epoch_t epoch, bufferlist *blp)
13911 {
13912 op->mark_osdmon_event(__func__);
13913 auto m = op->get_req<MPoolOp>();
13914 dout(20) << "_pool_op_reply " << ret << dendl;
13915 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13916 ret, epoch, get_last_committed(), blp);
13917 mon->send_reply(op, reply);
13918 }
13919
13920 void OSDMonitor::convert_pool_priorities(void)
13921 {
13922 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
13923 int64_t max_prio = 0;
13924 int64_t min_prio = 0;
13925 for (const auto &i : osdmap.get_pools()) {
13926 const auto &pool = i.second;
13927
13928 if (pool.opts.is_set(key)) {
13929 int64_t prio = 0;
13930 pool.opts.get(key, &prio);
13931 if (prio > max_prio)
13932 max_prio = prio;
13933 if (prio < min_prio)
13934 min_prio = prio;
13935 }
13936 }
13937 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
13938 dout(20) << __func__ << " nothing to fix" << dendl;
13939 return;
13940 }
13941 // Current pool priorities exceeds new maximum
13942 for (const auto &i : osdmap.get_pools()) {
13943 const auto pool_id = i.first;
13944 pg_pool_t pool = i.second;
13945
13946 int64_t prio = 0;
13947 pool.opts.get(key, &prio);
13948 int64_t n;
13949
13950 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
13951 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13952 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
13953 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
13954 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13955 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
13956 } else {
13957 continue;
13958 }
13959 if (n == 0) {
13960 pool.opts.unset(key);
13961 } else {
13962 pool.opts.set(key, static_cast<int64_t>(n));
13963 }
13964 dout(10) << __func__ << " pool " << pool_id
13965 << " recovery_priority adjusted "
13966 << prio << " to " << n << dendl;
13967 pool.last_change = pending_inc.epoch;
13968 pending_inc.new_pools[pool_id] = pool;
13969 }
13970 }